# coding: utf-8
# Copyright (C) 1994-2018 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.
import os
import time

from tests.functional import *
  37. class TestOfflineVnode(TestFunctional):
  38. """
  39. Tests if vnodes are marked offline:
  40. - when a hook fails and the hook fail action is 'offline_vnodes'
  41. - using pbsnodes -o
  42. """
  43. is_cray = True
  44. def setUp(self):
  45. if not self.du.get_platform().startswith('cray'):
  46. self.is_cray = False
  47. TestFunctional.setUp(self)
  48. self.server.manager(MGR_CMD_SET, SERVER, {'log_events': 2047})
  49. def create_mom_hook(self):
  50. name = "h1"
  51. body = ("import pbs\n"
  52. "import time\n"
  53. "time.sleep(60)\n"
  54. "pbs.event.accept()")
  55. attr = {'event': 'execjob_begin', 'fail_action': 'offline_vnodes',
  56. 'alarm': '3', 'enabled': 'true'}
  57. self.server.create_import_hook(name, attr, body)
  58. def create_bad_begin_hook(self):
  59. name = "h2"
  60. body = ("import pbs\n"
  61. "e=pbs.event()\n"
  62. "if e.job.in_ms_mom():\n"
  63. " e.accept()\n"
  64. "raise ValueError('invalid name')\n")
  65. attr = {'event': 'execjob_begin', 'fail_action': 'offline_vnodes'}
  66. self.server.create_import_hook(name, attr, body)
  67. def create_bad_startup_hook(self):
  68. name = "h3"
  69. body = ("import pbs\n"
  70. "raise ValueError('invalid name')\n")
  71. attr = {'event': 'exechost_startup', 'fail_action': 'offline_vnodes'}
  72. self.server.create_import_hook(name, attr, body)
  73. def create_multi_vnodes(self, num_moms, num_vnode=3):
  74. if num_moms != len(self.moms):
  75. self.server.manager(MGR_CMD_DELETE, NODE, id="@default",
  76. expect=True)
  77. if self.is_cray is True:
  78. if num_moms == 1 and len(self.moms) != 1:
  79. self.server.manager(MGR_CMD_CREATE, NODE,
  80. id=self.moms.values()[0].shortname)
  81. # adding a sleep of two seconds because it takes some time
  82. # before node resources start showing up
  83. time.sleep(2)
  84. return
  85. # No need to create vnodes on a cpuset mom
  86. if self.moms.values()[0].is_cpuset_mom() is True:
  87. return
  88. vn_attrs = {ATTR_rescavail + '.ncpus': 1,
  89. ATTR_rescavail + '.mem': '1024mb'}
  90. for i in range(num_moms):
  91. self.server.create_vnodes('vnode', vn_attrs, num_vnode,
  92. self.moms.values()[i],
  93. usenatvnode=True, delall=False,
  94. expect=False)
  95. # Calling an explicit expect on newly created nodes.
  96. self.server.expect(NODE, {ATTR_NODE_state: 'free'},
  97. id=self.moms.values()[i].shortname)
  98. def verify_vnodes_state(self, expected_state):
  99. """
  100. Verify that the vnodes are set to the expected state
  101. """
  102. vlist = []
  103. if self.is_cray is True:
  104. vnl = self.server.filter(
  105. VNODE, {'resources_available.vntype': 'cray_compute'})
  106. vlist = vnl["resources_available.vntype=cray_compute"]
  107. elif self.moms.values()[0].is_cpuset_mom() is True:
  108. vnl = self.server.status(NODE)
  109. vlist = [x['id'] for x in vnl if x['id'] !=
  110. self.moms.values()[0].shortname]
  111. else:
  112. vlist = ["vnode[0]", "vnode[1]"]
  113. for v1 in vlist:
  114. # Check the vnode state
  115. self.server.expect(
  116. VNODE, {'state': expected_state}, id=v1, interval=2)
  117. return vlist[0]
  118. def tearDown(self):
  119. TestFunctional.tearDown(self)
  120. # Restore original node setup for future test cases.
  121. self.server.cleanup_jobs(extend='force')
  122. self.server.manager(MGR_CMD_DELETE, NODE, id="@default",
  123. expect=True)
  124. for m in self.moms.values():
  125. self.server.manager(MGR_CMD_CREATE, NODE,
  126. id=m.shortname)
  127. def test_single_mom_hook_failure_affects_vnode(self):
  128. """
  129. Run an execjob_begin hook that sleep for sometime,
  130. at the same time set an alarm value so less that
  131. the hook alarms out and server executes the fail_action
  132. After this check if vnodes are marked offline.
  133. In case of a single mom reporting vnodes, it should mark
  134. all the vnodes and mom as offline.
  135. Once offlined, reset the mom by issueing pbsnodes -r
  136. and check if the job runs on one of the vnodes.
  137. """
  138. single_mom = self.moms.values()[0]
  139. start_time = int(time.time())
  140. self.create_multi_vnodes(1)
  141. self.create_mom_hook()
  142. # Check if hook files were copied to mom
  143. single_mom.log_match(
  144. "h1.HK;copy hook-related file request received",
  145. starttime=start_time, interval=2)
  146. single_mom.log_match(
  147. "h1.PY;copy hook-related file request received",
  148. starttime=start_time, interval=2)
  149. self.server.expect(NODE, {ATTR_NODE_state: 'free'},
  150. id=single_mom.shortname, interval=2)
  151. j1 = Job(TEST_USER)
  152. j1.set_sleep_time(1000)
  153. jid = self.server.submit(j1)
  154. # mom hook will alarm out and job will get into Q state
  155. self.server.expect(JOB, {ATTR_state: 'Q'}, id=jid)
  156. # since mom hook alarm out it's fail_action will put the
  157. # node in offline state
  158. self.server.expect(
  159. NODE, {ATTR_NODE_state: 'offline'},
  160. id=single_mom.shortname, interval=2)
  161. vname = self.verify_vnodes_state('offline')
  162. mom_host = single_mom.shortname
  163. pbs_exec = self.server.pbs_conf['PBS_EXEC']
  164. pbsnodes_cmd = os.path.join(pbs_exec, 'bin', 'pbsnodes')
  165. pbsnodes_reset = pbsnodes_cmd + ' -r ' + mom_host
  166. # Set mom sync hook timeout to be a low value because if mom fails to
  167. # get the hook after disabling it then next sync will happen after
  168. # 2 minutes by default and we don't want to wait that long.
  169. self.server.manager(MGR_CMD_SET, SERVER,
  170. {'sync_mom_hookfiles_timeout': '5'})
  171. self.server.manager(MGR_CMD_SET, HOOK, {'enabled': 'False'}, id="h1")
  172. # Make sure that hook has been sent to mom
  173. self.server.log_match("successfully sent hook file")
  174. self.du.run_cmd(self.server.hostname, cmd=pbsnodes_reset)
  175. self.server.delete(jid, wait=True)
  176. j2 = Job(TEST_USER)
  177. j2.set_attributes({ATTR_l + '.select': '1:vnode=' + vname})
  178. jid2 = self.server.submit(j2)
  179. self.server.expect(NODE, {ATTR_NODE_state: 'free'},
  180. id=single_mom.shortname, interval=2)
  181. self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2)
  182. def test_multi_mom_hook_failure_affects_vnode(self):
  183. """
  184. Run an execjob_begin hook that sleeps for sometime,
  185. at the same time set an alarm value so less that
  186. the hook alarms out and server executes the fail_action
  187. After this check if vnodes are marked offline.
  188. In case of a multiple mom reporting same set of vnodes,
  189. it should not mark the vnodes and mom as offline because
  190. there are other moms active and reporting same vnodes.
  191. NOTE: This test needs moms to report the same set of vnodes
  192. """
  193. if len(self.moms) != 2:
  194. self.skipTest("Provide 2 moms while invoking test")
  195. for m in self.moms.values():
  196. if m.is_cpuset_mom():
  197. self.skipTest("Skipping test on cpuset moms")
  198. # The moms provided to the test may have unwanted vnodedef files.
  199. if self.moms.values()[0].has_vnode_defs():
  200. self.moms.values()[0].delete_vnode_defs()
  201. if self.moms.values()[1].has_vnode_defs():
  202. self.moms.values()[1].delete_vnode_defs()
  203. start_time = int(time.time())
  204. self.create_multi_vnodes(2)
  205. self.create_mom_hook()
  206. # Check if hook files were copied to mom
  207. for m in self.moms.values():
  208. m.log_match(
  209. "h1.HK;copy hook-related file request received",
  210. starttime=start_time, interval=2)
  211. m.log_match(
  212. "h1.PY;copy hook-related file request received",
  213. starttime=start_time, interval=2)
  214. # set one natural node to have higher ncpus than the other one so
  215. # that the job only goes to this natural node.
  216. self.server.manager(MGR_CMD_SET, NODE, {
  217. ATTR_rescavail + '.ncpus': '256'},
  218. id=self.moms.values()[0].shortname)
  219. self.server.manager(MGR_CMD_SET, NODE, {
  220. ATTR_rescavail + '.ncpus': '1'},
  221. id=self.moms.values()[1].shortname)
  222. j1 = Job(TEST_USER)
  223. if self.is_cray is True:
  224. # on a cray, make sure job runs on login node with higher number of
  225. # ncpus
  226. j1.set_attributes(
  227. {ATTR_l + '.select': '1:ncpus=256:vntype=cray_login'})
  228. else:
  229. j1.set_attributes({ATTR_l + '.select': '1:ncpus=256'})
  230. jid = self.server.submit(j1)
  231. self.server.expect(JOB, {ATTR_state: 'Q'}, id=jid)
  232. self.server.expect(NODE, {ATTR_NODE_state: 'offline'},
  233. id=self.moms.values()[0].shortname, interval=2)
  234. self.server.expect(NODE, {ATTR_NODE_state: 'free'},
  235. id=self.moms.values()[1].shortname, interval=2)
  236. self.verify_vnodes_state('free')
  237. def test_multi_mom_hook_failure_affects_vnode2(self):
  238. """
  239. Run an execjob_begin hook that gets an exception
  240. when executed by sister mom, causing
  241. the server to execute the fail_action=offline_vnodes, which
  242. result in sister vnode to be marked offline.
  243. """
  244. if len(self.moms) != 2:
  245. self.skipTest("Provide 2 moms while invoking test")
  246. for m in self.moms.values():
  247. if m.is_cpuset_mom():
  248. self.skipTest("Skipping test on cpuset moms")
  249. if self.is_cray is True:
  250. self.skipTest("Skipping test on Crays")
  251. # The moms provided to the test may have unwanted vnodedef files.
  252. if self.moms.values()[0].has_vnode_defs():
  253. self.moms.values()[0].delete_vnode_defs()
  254. if self.moms.values()[1].has_vnode_defs():
  255. self.moms.values()[1].delete_vnode_defs()
  256. start_time = int(time.time())
  257. self.create_multi_vnodes(num_moms=2, num_vnode=1)
  258. self.create_bad_begin_hook()
  259. # Check if hook files were copied to mom
  260. for m in self.moms.values():
  261. m.log_match(
  262. "h2.HK;copy hook-related file request received",
  263. starttime=start_time, interval=2)
  264. m.log_match(
  265. "h2.PY;copy hook-related file request received",
  266. starttime=start_time, interval=2)
  267. j1 = Job(TEST_USER)
  268. a = {ATTR_l + '.select': '2:ncpus=1',
  269. ATTR_l + '.place': 'scatter'}
  270. j1.set_attributes(a)
  271. jid = self.server.submit(j1)
  272. self.server.expect(NODE, {ATTR_NODE_state: 'free'},
  273. id=self.moms.values()[0].shortname, interval=2)
  274. # sister mom's vnode gets offlined due to hook exception
  275. self.server.expect(NODE,
  276. {ATTR_NODE_state: 'offline',
  277. ATTR_comment:
  278. "offlined by hook 'h2' due to hook error"},
  279. id=self.moms.values()[1].shortname,
  280. interval=2, attrop=PTL_AND)
  281. self.server.expect(JOB, {ATTR_state: 'Q'}, id=jid)
  282. def test_fail_action_startup_hook(self):
  283. """
  284. Run an exechost_startup hook that gets an
  285. exception when local mom is restarted. Vnode representing
  286. local mom would be marked offline.
  287. """
  288. mom = self.moms.values()[0]
  289. if mom.is_cpuset_mom():
  290. self.skipTest("Skipping test on cpuset moms")
  291. if self.is_cray is True:
  292. self.skipTest("Skipping test on Crays")
  293. # The moms provided to the test may have unwanted vnodedef files.
  294. if mom.has_vnode_defs():
  295. mom.delete_vnode_defs()
  296. start_time = int(time.time())
  297. self.create_multi_vnodes(1)
  298. self.create_bad_startup_hook()
  299. # Check if hook files were copied to mom
  300. mom.log_match(
  301. "h3.HK;copy hook-related file request received",
  302. starttime=start_time, interval=2)
  303. mom.log_match(
  304. "h3.PY;copy hook-related file request received",
  305. starttime=start_time, interval=2)
  306. mom.stop()
  307. mom.start()
  308. # primary mom's vnode gets offlined due to startup hook exception
  309. self.server.expect(NODE,
  310. {ATTR_NODE_state: 'offline',
  311. ATTR_comment:
  312. "offlined by hook 'h3' due to hook error"},
  313. id=mom.shortname,
  314. interval=2, attrop=PTL_AND)
  315. def test_pbsnodes_o_single_mom(self):
  316. """
  317. Offline a mom using pbsnodes -o.
  318. Since it is the only mom, all vnodes reported by her
  319. should also be offline.
  320. """
  321. single_mom = self.moms.values()[0]
  322. self.create_multi_vnodes(1)
  323. self.server.expect(NODE, {ATTR_NODE_state: 'free'},
  324. id=single_mom.shortname, interval=2)
  325. mom_host = single_mom.shortname
  326. pbs_exec = self.server.pbs_conf['PBS_EXEC']
  327. pbsnodes_cmd = os.path.join(pbs_exec, 'bin', 'pbsnodes')
  328. pbsnodes_offline = [pbsnodes_cmd, '-o', mom_host]
  329. self.du.run_cmd(self.server.hostname, cmd=pbsnodes_offline)
  330. # the mom node and all of her children should be offline
  331. self.server.expect(
  332. NODE, {ATTR_NODE_state: 'offline'},
  333. id=single_mom.shortname, interval=2)
  334. self.verify_vnodes_state('offline')
  335. def test_pbsnodes_o_multi_mom_only_one_offline(self):
  336. """
  337. Offline one mom using pbsnodes -o.
  338. In the case of multiple moms reporting the same set of vnodes,
  339. none of the vnodes should be marked offline,
  340. including the children vnodes.
  341. NOTE: This test needs moms to report the same set of vnodes.
  342. """
  343. if len(self.moms) != 2:
  344. self.skipTest("Provide 2 moms while invoking test")
  345. for m in self.moms.values():
  346. if m.is_cpuset_mom():
  347. self.skipTest("Skipping test on cpuset moms")
  348. momA = self.moms.values()[0]
  349. momB = self.moms.values()[1]
  350. # The moms provided to the test may have unwanted vnodedef files.
  351. if momA.has_vnode_defs():
  352. momA.delete_vnode_defs()
  353. if momB.has_vnode_defs():
  354. momB.delete_vnode_defs()
  355. self.create_multi_vnodes(2)
  356. # Offline only one of the moms, the other mom and her children
  357. # should still be free
  358. pbs_exec = self.server.pbs_conf['PBS_EXEC']
  359. pbsnodes_cmd = os.path.join(pbs_exec, 'bin', 'pbsnodes')
  360. pbsnodes_offline = [pbsnodes_cmd, '-o', momA.shortname]
  361. self.du.run_cmd(self.server.hostname, cmd=pbsnodes_offline)
  362. # MomA should be offline
  363. self.server.expect(
  364. NODE, {ATTR_NODE_state: 'offline'},
  365. id=momA.shortname, interval=2)
  366. # momB and the rest of the vnodes should be free
  367. self.server.expect(NODE, {ATTR_NODE_state: 'free'},
  368. id=momB.shortname, interval=2)
  369. self.verify_vnodes_state('free')
  370. def test_pbsnodes_multi_mom_offline_online(self):
  371. """
  372. When all of the moms reporting a vnode are offline,
  373. the vnode should also be marked offline.
  374. And when pbsnodes -r is used to clear the offline from at
  375. least one of the moms reporting a vnode, then that vnode
  376. should also get the offline cleared.
  377. Note: This test needs moms to report the same set of vnodes.
  378. """
  379. if len(self.moms) != 2:
  380. self.skipTest("Provide 2 moms while invoking test")
  381. for m in self.moms.values():
  382. if m.is_cpuset_mom():
  383. self.skipTest("Skipping test on cpuset moms")
  384. momA = self.moms.values()[0]
  385. momB = self.moms.values()[1]
  386. # The moms provided to the test may have unwanted vnodedef files.
  387. if momA.has_vnode_defs():
  388. momA.delete_vnode_defs()
  389. if momB.has_vnode_defs():
  390. momB.delete_vnode_defs()
  391. self.create_multi_vnodes(2)
  392. # Offline both of the moms, the vnodes reported by them
  393. # will also be offlined
  394. pbs_exec = self.server.pbs_conf['PBS_EXEC']
  395. pbsnodes_cmd = os.path.join(pbs_exec, 'bin', 'pbsnodes')
  396. pbsnodes_offline = [pbsnodes_cmd, '-o', momA.shortname, momB.shortname]
  397. self.du.run_cmd(self.server.hostname, cmd=pbsnodes_offline)
  398. # MomA and MomB should be offline
  399. self.server.expect(
  400. NODE, {ATTR_NODE_state: 'offline'},
  401. id=momA.shortname, interval=2)
  402. self.server.expect(
  403. NODE, {ATTR_NODE_state: 'offline'},
  404. id=momB.shortname, interval=2)
  405. self.verify_vnodes_state('offline')
  406. # Now call pbsnodes -r to clear the offline from MomA
  407. pbsnodes_clear_offline = [pbsnodes_cmd, '-r', momA.shortname]
  408. self.du.run_cmd(self.server.hostname, cmd=pbsnodes_clear_offline)
  409. # MomB should still be offline
  410. self.server.expect(
  411. NODE, {ATTR_NODE_state: 'offline'},
  412. id=momB.shortname, interval=2)
  413. # momA and the vnodes she reports should be free
  414. self.server.expect(NODE, {ATTR_NODE_state: 'free'},
  415. id=momA.shortname, interval=2)
  416. self.verify_vnodes_state('free')