pbs_node_buckets.py

# coding: utf-8
# Copyright (C) 1994-2018 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.

from tests.functional import *


class TestNodeBuckets(TestFunctional):
    """
    Test basic functionality of node buckets.
    """

    def setUp(self):
        TestFunctional.setUp(self)
        day = time.strftime("%Y%m%d", time.localtime(time.time()))
        filename = os.path.join(self.server.pbs_conf['PBS_HOME'],
                                'sched_logs', day)
        self.du.rm(path=filename, force=True, sudo=True, level=logging.DEBUG2)
        self.colors = \
            ['red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'violet']
        self.shapes = ['circle', 'square', 'triangle',
                       'diamond', 'pyramid', 'sphere', 'cube']
        self.letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
        self.server.manager(MGR_CMD_CREATE, RSC,
                            {'type': 'string', 'flag': 'h'}, id='color')
        self.server.manager(MGR_CMD_CREATE, RSC,
                            {'type': 'string_array', 'flag': 'h'}, id='shape')
        self.server.manager(MGR_CMD_CREATE, RSC,
                            {'type': 'string_array', 'flag': 'h'}, id='letter')
        self.server.manager(MGR_CMD_CREATE, RSC,
                            {'type': 'boolean', 'flag': 'h'}, id='bool')
        a = {'resources_available.ncpus': 2, 'resources_available.mem': '8gb'}
        # 10010 nodes since it divides into 7 evenly.
        # Each node bucket will have 1430 nodes in it.
        self.server.create_vnodes(name='vnode', attrib=a, num=10010,
                                  mom=self.mom, sharednode=False,
                                  expect=False, attrfunc=self.cust_attr_func)
        # Make sure all the nodes are in state free.  We can't let
        # create_vnodes() do this because it does a pbsnodes -v on each vnode.
        # This takes a long time.
        self.server.expect(NODE, {'state=free': (GE, 10010)})
        self.scheduler.add_resource('color')
        self.scheduler.set_sched_config({'log_filter': '2048'})

    def cust_attr_func(self, name, totalnodes, numnode, attribs):
        """
        Add resources to vnodes.  There are 10010 nodes, which means 1430
        nodes of each color, letter, and shape.  The value of bool is True
        for the first 5005 nodes and unset for the last 5005 nodes.
        """
        a = {'resources_available.color': self.colors[numnode / 1430],
             'resources_available.shape': self.shapes[numnode % 7],
             'resources_available.letter': self.letters[numnode % 7]}
        if numnode / 5005 == 0:
            a['resources_available.bool'] = 'True'
        # Yellow buckets get a higher priority
        if numnode / 1430 == 2:
            a['Priority'] = 100
        return dict(attribs.items() + a.items())

    def check_normal_path(self, sel='2:ncpus=2:mem=1gb', pl='scatter:excl',
                          queue='workq'):
        """
        Check if a job runs in the normal code path
        """
        a = {'Resource_List.select': sel, 'Resource_List.place': pl,
             'queue': queue}
        j = Job(TEST_USER, attrs=a)
        jid = self.server.submit(j)
        self.scheduler.log_match(jid + ';Evaluating subchunk', n=10000)
        self.server.delete(jid, wait=True)

    @timeout(450)
    def test_basic(self):
        """
        Request nodes of a specific color and make sure they are correctly
        allocated to the job
        """
        chunk = '4:ncpus=1:color=yellow'
        a = {'Resource_List.select': chunk,
             'Resource_List.place': 'scatter:excl'}
        J = Job(TEST_USER, a)
        jid = self.server.submit(J)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.scheduler.log_match(jid + ';Chunk: ' + chunk, n=10000)
        js = self.server.status(JOB, id=jid)
        nodes = J.get_vnodes(js[0]['exec_vnode'])
        for node in nodes:
            n = self.server.status(NODE, 'resources_available.color', id=node)
            self.assertTrue('yellow' in
                            n[0]['resources_available.color'])

    @timeout(450)
    def test_multi_bucket(self):
        """
        Request two different chunk types which need to be allocated from
        different buckets and make sure they are allocated correctly.
        """
        a = {'Resource_List.select':
             '4:ncpus=1:color=yellow+4:ncpus=1:color=blue',
             'Resource_List.place': 'scatter:excl'}
        J = Job(TEST_USER, a)
        jid = self.server.submit(J)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.scheduler.log_match(jid + ';Chunk: ', n=10000)
        js = self.server.status(JOB, id=jid)
        nodes = J.get_vnodes(js[0]['exec_vnode'])
        # Yellow nodes were requested first.
        # Make sure they come before the blue nodes.
        for i in range(4):
            n = self.server.status(NODE, id=nodes[i])
            self.assertTrue('yellow' in n[0]['resources_available.color'])
        for i in range(4, 8):
            n = self.server.status(NODE, id=nodes[i])
            self.assertTrue('blue' in n[0]['resources_available.color'])

    @timeout(450)
    def test_multi_bucket2(self):
        """
        Request nodes from all 7 different buckets and see them allocated
        correctly
        """
        select = ""
        for c in self.colors:
            select += "1:ncpus=1:color=%s+" % (c)
        # remove the trailing '+'
        select = select[:-1]
        a = {'Resource_List.select': select,
             'Resource_List.place': 'scatter:excl'}
        J = Job(TEST_USER, a)
        jid = self.server.submit(J)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.scheduler.log_match(jid + ';Chunk:', n=10000)
        js = self.server.status(JOB, id=jid)
        nodes = J.get_vnodes(js[0]['exec_vnode'])
        for i, node in enumerate(nodes):
            n = self.server.status(NODE, id=node)
            self.assertTrue(self.colors[i] in
                            n[0]['resources_available.color'])

    @timeout(450)
    def test_not_run(self):
        """
        Request more nodes of one color than are available to make sure
        the job is not run on incorrect nodes.
        """
        chunk = '1431:ncpus=1:color=yellow'
        a = {'Resource_List.select': chunk,
             'Resource_List.place': 'scatter:excl'}
        J = Job(TEST_USER, a)
        jid = self.server.submit(J)
        a = {'comment': (MATCH_RE, '^Can Never Run'),
             'job_state': 'Q'}
        self.server.expect(JOB, a, attrop=PTL_AND, id=jid)
        self.scheduler.log_match(jid + ';Chunk: ' + chunk, n=10000)

    @timeout(450)
    def test_calendaring1(self):
        """
        Test to see that nodes that are used in the future for
        calendared jobs are not used for filler jobs that would
        disrupt the scheduled time.
        """
        self.scheduler.set_sched_config({'strict_ordering': 'True'})
        chunk1 = '1:ncpus=1'
        a = {'Resource_List.select': chunk1,
             'Resource_List.place': 'scatter:excl',
             'Resource_List.walltime': '1:00:00'}
        j = Job(TEST_USER, attrs=a)
        jid1 = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.scheduler.log_match(jid1 + ';Chunk: ' + chunk1, n=10000)
        chunk2 = '10010:ncpus=1'
        a = {'Resource_List.select': chunk2,
             'Resource_List.place': 'scatter:excl',
             'Resource_List.walltime': '2:00:00'}
        j = Job(TEST_USER, attrs=a)
        jid2 = self.server.submit(j)
        self.server.expect(JOB, 'comment', op=SET, id=jid2)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid2)
        self.scheduler.log_match(jid2 + ';Chunk: ' + chunk2, n=10000)
        chunk3 = '2:ncpus=1'
        a = {'Resource_List.select': chunk3,
             'Resource_List.place': 'scatter:excl',
             'Resource_List.walltime': '30:00'}
        j = Job(TEST_USER, attrs=a)
        jid3 = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid3)
        self.scheduler.log_match(jid3 + ';Chunk: ' + chunk3, n=10000)
        a = {'Resource_List.select': chunk3,
             'Resource_List.place': 'scatter:excl',
             'Resource_List.walltime': '2:30:00'}
        j = Job(TEST_USER, attrs=a)
        jid4 = self.server.submit(j)
        self.server.expect(JOB, 'comment', op=SET, id=jid4)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid4)
        self.scheduler.log_match(jid4 + ';Chunk: ' + chunk3, n=10000)

    @timeout(450)
    def test_calendaring2(self):
        """
        Test that nodes that have a reservation calendared on them later
        are used before totally free nodes.
        """
        self.scheduler.set_sched_config({'strict_ordering': 'True'})
        now = int(time.time())
        a = {'Resource_List.select': '1:vnode=vnode[2865]+1:vnode=vnode[2870]',
             'Resource_List.place': 'scatter:excl',
             'Resource_List.walltime': '1:00:00',
             'reserve_start': now + 3600, 'reserve_end': now + 7200}
        r = Reservation(TEST_USER, attrs=a)
        rid = self.server.submit(r)
        self.server.expect(RESV, {'reserve_state':
                                  (MATCH_RE, 'RESV_CONFIRMED|2')}, id=rid)
        chunk = '2:ncpus=1:color=yellow'
        a = {'Resource_List.select': chunk,
             'Resource_List.place': 'scatter:excl',
             'Resource_List.walltime': '30:00'}
        j = Job(TEST_USER, attrs=a)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.scheduler.log_match(jid + ';Chunk: ' + chunk, n=10000)
        s = self.server.status(JOB, 'exec_vnode', id=jid)
        n = j.get_vnodes(s[0]['exec_vnode'])
        msg = 'busy_later nodes not chosen first'
        self.assertTrue('vnode[2865]' in n, msg)
        self.assertTrue('vnode[2870]' in n, msg)

    @timeout(450)
    def test_calendaring3(self):
        """
        Test that a future reservation's nodes are used first for a job
        that is put into the calendar.
        """
        self.scheduler.set_sched_config({'strict_ordering': 'True'})
        now = int(time.time())
        a = {'Resource_List.select': '1:vnode=vnode[2865]+1:vnode=vnode[2870]',
             'Resource_List.place': 'scatter:excl',
             'Resource_List.walltime': '1:00:00',
             'reserve_start': now + 3600, 'reserve_end': now + 7200}
        r = Reservation(TEST_USER, attrs=a)
        rid = self.server.submit(r)
        self.server.expect(RESV, {'reserve_state':
                                  (MATCH_RE, 'RESV_CONFIRMED|2')}, id=rid)
        chunk1 = '1430:ncpus=1:color=yellow'
        a = {'Resource_List.select': chunk1,
             'Resource_List.place': 'scatter:excl',
             'Resource_List.walltime': '30:00'}
        j = Job(TEST_USER, attrs=a)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.scheduler.log_match(jid + ';Chunk: ' + chunk1, n=10000)
        chunk2 = '2:ncpus=1:color=yellow'
        a = {'Resource_List.select': chunk2,
             'Resource_List.place': 'scatter:excl',
             'Resource_List.walltime': '15:00'}
        j2 = Job(TEST_USER, attrs=a)
        jid2 = self.server.submit(j2)
        self.scheduler.log_match(jid2 + ';Chunk: ' + chunk2, n=10000)
        self.server.expect(JOB, 'estimated.exec_vnode', op=SET, id=jid2)
        s = self.server.status(JOB, 'estimated.exec_vnode', id=jid2)
        n = j2.get_vnodes(s[0]['estimated.exec_vnode'])
        msg = 'busy_later nodes not chosen first'
        self.assertTrue('vnode[2865]' in n, msg)
        self.assertTrue('vnode[2870]' in n, msg)

    @timeout(450)
    def test_buckets_and_non(self):
        """
        Test that jobs requesting buckets and not requesting buckets
        play nice together
        """
        # vnode[1435] is orange
        a = {'Resource_List.ncpus': 1,
             'Resource_List.vnode': 'vnode[1435]'}
        j1 = Job(TEST_USER, attrs=a)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.scheduler.log_match(jid1 + ';Evaluating subchunk', n=10000)
        chunk = '1429:ncpus=1:color=orange'
        a = {'Resource_List.select': chunk,
             'Resource_List.place': 'scatter:excl'}
        j2 = Job(TEST_USER, attrs=a)
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid2)
        self.scheduler.log_match(jid2 + ';Chunk: ' + chunk, n=10000)
        s1 = self.server.status(JOB, 'exec_vnode', id=jid1)
        s2 = self.server.status(JOB, 'exec_vnode', id=jid2)
        nodes1 = j1.get_vnodes(s1[0]['exec_vnode'])
        nodes2 = j2.get_vnodes(s2[0]['exec_vnode'])
        msg = 'Job 1 and Job 2 are sharing nodes'
        for n in nodes2:
            self.assertNotEqual(n, nodes1[0], msg)

    @timeout(600)
    def test_not_buckets(self):
        """
        Test to make sure the jobs that should use the standard node searching
        code path do not use the bucket code path
        """
        # Running a 10010 cpu job through the normal code path spams the log.
        # We don't care about it, so there is no reason to increase
        # the log size by so much.
        self.scheduler.set_sched_config({'log_filter': '3328'})
        # Run a job on all nodes leaving 1 cpu available on each node
        j = Job(TEST_USER, {'Resource_List.select': '10010:ncpus=1',
                            'Resource_List.place': 'scatter'})
        j.set_sleep_time(600)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.scheduler.set_sched_config({'log_filter': '2048'})
        # Node sorting via unused resources uses the standard code path
        self.logger.info('Test node_sort_key with unused resources')
        a = {'node_sort_key': '\"ncpus HIGH unused\"'}
        self.scheduler.set_sched_config(a)
        self.check_normal_path()
        self.scheduler.revert_to_defaults()
        schd_attr = {'log_filter': '2048'}
        self.scheduler.set_sched_config(schd_attr)
        # provision_policy: avoid_provision uses the standard code path
        self.logger.info('Test avoid_provision')
        a = {'provision_policy': 'avoid_provision'}
        self.scheduler.set_sched_config(a)
        self.check_normal_path()
        self.scheduler.revert_to_defaults()
        self.scheduler.add_resource('color')
        self.scheduler.set_sched_config(schd_attr)
        # The bucket code path requires excl placement
        self.logger.info('Test different place specs')
        self.check_normal_path(pl='scatter:shared')
        self.check_normal_path(pl='free')
        # Can't request host or vnode resources on the bucket code path
        self.logger.info('Test jobs requesting host and vnode')
        self.check_normal_path(sel='1:ncpus=2:host=vnode[0]')
        self.check_normal_path(sel='1:ncpus=2:vnode=vnode[0]')
        # Suspended jobs use the normal code path
        self.logger.info('Test suspended job')
        a = {'queue_type': 'execution', 'started': 'True', 'enabled': 'True',
             'priority': 200}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, id='expressq')
        self.server.delete(jid, wait=True)
        a = {'Resource_List.select': '1430:ncpus=1:color=orange',
             'Resource_List.place': 'scatter:excl'}
        j2 = Job(TEST_USER, a)
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid2)
        a = {'Resource_List.select': '1:ncpus=1:color=orange',
             'queue': 'expressq'}
        j3 = Job(TEST_USER, a)
        jid3 = self.server.submit(j3)
        self.server.expect(JOB, {'job_state': 'S'}, id=jid2)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid3)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.scheduler.log_match(jid3 + ';Evaluating subchunk', n=10000)
        self.server.delete([jid2, jid3], wait=True)
        # Checkpointed jobs use the normal code path
        self.logger.info('Test checkpointed job')
        chk_script = """#!/bin/bash
kill $1
exit 0
"""
        self.chk_file = self.du.create_temp_file(body=chk_script)
        self.du.chmod(path=self.chk_file, mode=0o755)
        self.du.chown(path=self.chk_file, uid=0, gid=0, sudo=True)
        c = {'$action': 'checkpoint_abort 30 !' + self.chk_file + ' %sid'}
        self.mom.add_config(c)
        self.scheduler.set_sched_config({'preempt_order': 'C'})
        attrs = {'Resource_List.select': '1430:ncpus=1:color=orange',
                 'Resource_List.place': 'scatter:excl'}
        j_c1 = Job(TEST_USER, attrs)
        jid_c1 = self.server.submit(j_c1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid_c1)
        self.scheduler.log_match(
            jid_c1 + ';Chunk: 1430:ncpus=1:color=orange', n=10000)
        a = {'Resource_List.select': '1:ncpus=1:color=orange',
             'queue': 'expressq'}
        j_c2 = Job(TEST_USER, a)
        jid_c2 = self.server.submit(j_c2)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid_c1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid_c2)
        self.scheduler.log_match(
            jid_c1 + ";Job preempted by checkpointing", n=10000)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.scheduler.log_match(jid_c2 + ';Evaluating subchunk', n=10000)
        self.server.delete([jid_c1, jid_c2], wait=True)
        # Jobs in reservations use the standard code path
        self.logger.info('Test job in reservation')
        now = int(time.time())
        a = {'Resource_List.select': '4:ncpus=2:mem=4gb',
             'Resource_List.place': 'scatter:excl',
             'reserve_start': now + 30, 'reserve_end': now + 120}
        r = Reservation(TEST_USER, a)
        rid = self.server.submit(r)
        self.server.expect(RESV,
                           {'reserve_state': (MATCH_RE, 'RESV_CONFIRMED|2')})
        self.logger.info('Waiting 30s for reservation to start')
        self.server.expect(RESV,
                           {'reserve_state': (MATCH_RE, 'RESV_RUNNING|5')},
                           offset=30)
        r_queue = rid.split('.')[0]
        self.check_normal_path(sel='1:ncpus=3', queue=r_queue)
        self.server.delete(rid)
        # Jobs on multi-vnoded systems use the standard code path
        self.logger.info('Test job on multi-vnoded system')
        a = {'resources_available.ncpus': 2, 'resources_available.mem': '8gb'}
        self.server.create_vnodes('vnode', a, 8, self.mom,
                                  sharednode=False, vnodes_per_host=4)
        self.check_normal_path(sel='2:ncpus=8')

    @timeout(450)
    def test_multi_vnode_resv(self):
        """
        Test that node buckets do not get in the way of running jobs on
        multi-vnoded systems in reservations
        """
        a = {'resources_available.ncpus': 2, 'resources_available.mem': '8gb'}
        self.server.create_vnodes('vnode', a, 12, self.mom,
                                  sharednode=False, vnodes_per_host=4,
                                  attrfunc=self.cust_attr_func)
        now = int(time.time())
        a = {'Resource_List.select': '8:ncpus=1',
             'Resource_List.place': 'vscatter',
             'reserve_start': now + 30,
             'reserve_end': now + 3600}
        r = Reservation(TEST_USER, attrs=a)
        rid = self.server.submit(r)
        a = {'reserve_state': (MATCH_RE, 'RESV_CONFIRMED|2')}
        self.server.expect(RESV, a, id=rid)
        self.logger.info('Waiting 30s for reservation to start')
        a['reserve_state'] = (MATCH_RE, 'RESV_RUNNING|5')
        self.server.expect(RESV, a, id=rid, offset=30)
        a = {'Resource_List.select': '2:ncpus=1',
             'Resource_List.place': 'group=shape',
             'queue': rid.split('.')[0]}
        j = Job(TEST_USER, attrs=a)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.scheduler.log_match(jid + ';Evaluating subchunk', n=10000)
        ev = self.server.status(JOB, 'exec_vnode', id=jid)
        used_nodes = j.get_vnodes(ev[0]['exec_vnode'])
        n = self.server.status(NODE, 'resources_available.shape')
        s = [x['resources_available.shape']
             for x in n if x['id'] in used_nodes]
        self.assertEqual(len(set(s)), 1,
                         "Job ran in more than one placement set")

    @timeout(450)
    def test_bucket_sort(self):
        """
        Test that buckets are sorted properly: the yellow bucket's nodes all
        have priority 100, so it should be the first bucket.
        """
        a = {'node_sort_key': '\"sort_priority HIGH\"'}
        self.scheduler.set_sched_config(a)
        chunk = '2:ncpus=1'
        j = Job(TEST_USER, {'Resource_List.select': chunk,
                            'Resource_List.place': 'scatter:excl'})
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.scheduler.log_match(jid + ';Chunk: ' + chunk, n=10000)
        jobs = self.server.status(JOB, {'exec_vnode'})
        jn = j.get_vnodes(jobs[0]['exec_vnode'])
        n1 = self.server.status(NODE, 'resources_available.color',
                                id=jn[0])
        n2 = self.server.status(NODE, 'resources_available.color',
                                id=jn[1])
        c1 = n1[0]['resources_available.color']
        c2 = n2[0]['resources_available.color']
        self.assertEqual(c1, 'yellow', "Job didn't run on yellow nodes")
        self.assertEqual(c2, 'yellow', "Job didn't run on yellow nodes")

    @timeout(450)
    def test_psets(self):
        """
        Test placement sets with node buckets
        """
        a = {'node_group_key': 'shape', 'node_group_enable': 'True',
             'scheduling': 'False'}
        self.server.manager(MGR_CMD_SET, SERVER, a, expect=True)
        chunk = '1430:ncpus=1'
        a = {'Resource_List.select': chunk,
             'Resource_List.place': 'scatter:excl'}
        j1 = Job(TEST_USER, attrs=a)
        jid1 = self.server.submit(j1)
        j2 = Job(TEST_USER, attrs=a)
        jid2 = self.server.submit(j2)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid2)
        self.scheduler.log_match(jid1 + ';Chunk: ' + chunk, n=10000)
        self.scheduler.log_match(jid2 + ';Chunk: ' + chunk, n=10000)
        ev = self.server.status(JOB, 'exec_vnode', id=jid1)
        used_nodes1 = j1.get_vnodes(ev[0]['exec_vnode'])
        n = self.server.status(NODE, 'resources_available.shape')
        s = [x['resources_available.shape']
             for x in n if x['id'] in used_nodes1]
        self.assertEqual(len(set(s)), 1,
                         "Job1 ran in more than one placement set")
        ev = self.server.status(JOB, 'exec_vnode', id=jid2)
        used_nodes2 = j2.get_vnodes(ev[0]['exec_vnode'])
        s = [x['resources_available.shape']
             for x in n if x['id'] in used_nodes2]
        self.assertEqual(len(set(s)), 1,
                         "Job2 ran in more than one placement set")
        for node in used_nodes1:
            self.assertNotIn(node, used_nodes2, 'Jobs share nodes: ' + node)

    @timeout(450)
    def test_psets_calendaring(self):
        """
        Test that jobs in the calendar fit within a placement set
        """
        self.scheduler.set_sched_config({'strict_ordering': 'True'})
        self.server.manager(MGR_CMD_SET, SERVER, {'backfill_depth': 5})
        chunk1 = '10010:ncpus=1'
        a = {'Resource_List.select': chunk1,
             'Resource_List.place': 'scatter:excl',
             'Resource_List.walltime': '1:00:00'}
        j1 = Job(TEST_USER, attrs=a)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.scheduler.log_match(jid1 + ';Chunk: ' + chunk1, n=10000)
        svr_attr = {'node_group_key': 'shape', 'node_group_enable': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, svr_attr)
        chunk2 = '1430:ncpus=1'
        a['Resource_List.select'] = chunk2
        j2 = Job(TEST_USER, a)
        jid2 = self.server.submit(j2)
        self.scheduler.log_match(jid2 + ';Chunk: ' + chunk2, n=10000)
        self.scheduler.log_match(jid2 + ';Job is a top job', n=10000)
        n = self.server.status(NODE, 'resources_available.shape')
        ev = self.server.status(JOB, 'estimated.exec_vnode', id=jid2)
        used_nodes2 = j2.get_vnodes(ev[0]['estimated.exec_vnode'])
        s = [x['resources_available.shape']
             for x in n if x['id'] in used_nodes2]
        self.assertEqual(len(set(s)), 1,
                         "Job2 will run in more than one placement set")
        j3 = Job(TEST_USER, a)
        jid3 = self.server.submit(j3)
        self.scheduler.log_match(jid3 + ';Chunk: ' + chunk2, n=10000)
        self.scheduler.log_match(jid3 + ';Job is a top job', n=10000)
        ev = self.server.status(JOB, 'estimated.exec_vnode', id=jid3)
        used_nodes3 = j3.get_vnodes(ev[0]['estimated.exec_vnode'])
        s = [x['resources_available.shape']
             for x in n if x['id'] in used_nodes3]
        self.assertEqual(len(set(s)), 1,
                         "Job3 will run in more than one placement set")
        for node in used_nodes2:
            self.assertNotIn(node, used_nodes3,
                             'Jobs will share nodes: ' + node)

    @timeout(450)
    def test_place_group(self):
        """
        Test node buckets with place=group
        """
        chunk = '1430:ncpus=1'
        a = {'Resource_List.select': chunk,
             'Resource_List.place': 'scatter:excl:group=letter'}
        j = Job(TEST_USER, attrs=a)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.scheduler.log_match(jid + ';Chunk: ' + chunk, n=10000)
        ev = self.server.status(JOB, 'exec_vnode', id=jid)
        used_nodes = j.get_vnodes(ev[0]['exec_vnode'])
        n = self.server.status(NODE, 'resources_available.letter')
        s = [x['resources_available.letter']
             for x in n if x['id'] in used_nodes]
        self.assertEqual(len(set(s)), 1,
                         "Job ran in more than one placement set")

    @timeout(450)
    def test_psets_spanning(self):
        """
        Request more nodes than available in one placement set and see
        the job span or not depending on the value of do_not_span_psets
        """
        a = {'node_group_key': 'shape', 'node_group_enable': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        a = {'do_not_span_psets': 'True'}
        self.server.manager(MGR_CMD_SET, SCHED, a, id='default')
        # request one more node than the largest placement set
        chunk = '1431:ncpus=1'
        a = {'Resource_List.select': chunk,
             'Resource_List.place': 'scatter:excl'}
        j = Job(TEST_USER, attrs=a)
        jid = self.server.submit(j)
        a = {'job_state': 'Q', 'comment':
             (MATCH_RE, 'can\'t fit in the largest placement set, '
              'and can\'t span psets')}
        self.server.expect(JOB, a, attrop=PTL_AND, id=jid)
        self.scheduler.log_match(jid + ';Chunk: ' + chunk, n=10000)
        a = {'do_not_span_psets': 'False'}
        self.server.manager(MGR_CMD_SET, SCHED, a, id='default')
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        ev = self.server.status(JOB, 'exec_vnode', id=jid)
        used_nodes = j.get_vnodes(ev[0]['exec_vnode'])
        n = self.server.status(NODE, 'resources_available.shape')
        s = [x['resources_available.shape']
             for x in n if x['id'] in used_nodes]
        self.assertGreater(len(set(s)), 1,
                           "Job did not span properly")

    @timeout(450)
    def test_psets_queue(self):
        """
        Test that placement sets work for nodes associated with queues
        """
        a = {'node_group_key': 'shape', 'node_group_enable': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        a = {'queue_type': 'Execution', 'started': 'True', 'enabled': 'True'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, id='workq2')
        # Take the first 14 vnodes.  This means there are two nodes per shape
        nodes = ['vnode[0]', 'vnode[1]', 'vnode[2]', 'vnode[3]', 'vnode[4]',
                 'vnode[5]', 'vnode[6]', 'vnode[7]', 'vnode[8]', 'vnode[9]',
                 'vnode[10]', 'vnode[11]', 'vnode[12]', 'vnode[13]']
        self.server.manager(MGR_CMD_SET, NODE, {'queue': 'workq2'}, id=nodes)
        chunk = '2:ncpus=1'
        a = {'Resource_List.select': chunk, 'queue': 'workq2',
             'Resource_List.place': 'scatter:excl'}
        for _ in range(7):
            j = Job(TEST_USER, a)
            jid = self.server.submit(j)
            self.server.expect(JOB, {'job_state': 'R'}, id=jid)
            self.scheduler.log_match(jid + ';Chunk: ' + chunk, n=10000)
        # Check to see if jobs ran in one placement set
        jobs = self.server.status(JOB)
        for job in jobs:
            ev = self.server.status(JOB, 'exec_vnode', id=job['id'])
            used_nodes = j.get_vnodes(ev[0]['exec_vnode'])
            n = self.server.status(NODE, 'resources_available.shape')
            s = [x['resources_available.shape']
                 for x in n if x['id'] in used_nodes]
            self.assertEqual(len(set(s)), 1,
                             "Job " + job['id'] +
                             " ran in more than one placement set")
        s = self.server.select()
        for jid in s:
            self.server.delete(jid, wait=True)
        # Check to see if jobs span correctly
        chunk = '7:ncpus=1'
        a = {'Resource_List.select': chunk, 'queue': 'workq2',
             'Resource_List.place': 'scatter:excl'}
        j = Job(TEST_USER, a)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.scheduler.log_match(jid + ';Chunk: ' + chunk, n=10000)
        ev = self.server.status(JOB, 'exec_vnode', id=jid)
        used_nodes = j.get_vnodes(ev[0]['exec_vnode'])
        n = self.server.status(NODE, 'resources_available.shape')
        s = [x['resources_available.shape']
             for x in n if x['id'] in used_nodes]
        self.assertGreater(len(set(s)), 1,
                           "Job did not span properly")

    @timeout(450)
    def test_free(self):
        """
        Test that free placement works with the bucket code path
        """
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        chunk = '1430:ncpus=1:color=yellow'
        a = {'Resource_List.select': chunk,
             'Resource_List.place': 'excl'}
        j1 = Job(TEST_USER, attrs=a)
        jid1 = self.server.submit(j1)
        j2 = Job(TEST_USER, attrs=a)
        jid2 = self.server.submit(j2)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid2)
        self.scheduler.log_match(jid1 + ';Chunk: ' + chunk, n=10000)
        self.scheduler.log_match(jid2 + ';Chunk: ' + chunk, n=10000)
        s1 = self.server.status(JOB, 'exec_vnode', id=jid1)
        s2 = self.server.status(JOB, 'exec_vnode', id=jid2)
        n1 = j1.get_vnodes(s1[0]['exec_vnode'])
        n2 = j2.get_vnodes(s2[0]['exec_vnode'])
        msg = 'job did not run on correct number of nodes'
        self.assertEqual(len(n1), 715, msg)
        self.assertEqual(len(n2), 715, msg)
        for node in n1:
            self.assertTrue(node not in n2, 'Jobs share nodes: ' + node)

    @timeout(450)
    def test_queue_nodes(self):
        """
        Test that buckets work with nodes associated to a queue
        """
        v1 = 'vnode[1431]'
        v2 = 'vnode[1435]'
        a = {'queue_type': 'execution', 'started': 'True', 'enabled': 'True'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, id='q2')
        self.server.manager(MGR_CMD_SET, NODE, {'queue': 'q2'}, id=v1)
        self.server.manager(MGR_CMD_SET, NODE, {'queue': 'q2'}, id=v2)
        chunk1 = '1428:ncpus=1:color=orange'
        a = {'Resource_List.select': chunk1,
             'Resource_List.place': 'scatter:excl'}
        j1 = Job(TEST_USER, attrs=a)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.scheduler.log_match(jid1 + ';Chunk: ' + chunk1, n=10000)
        job = self.server.status(JOB, 'exec_vnode', id=jid1)[0]
        ev = j1.get_vnodes(job['exec_vnode'])
        msg = 'Job is using queue\'s nodes'
        self.assertNotIn(v1, ev, msg)
        self.assertNotIn(v2, ev, msg)
        chunk2 = '2:ncpus=1'
        a = {'Resource_List.select': chunk2,
             'Resource_List.place': 'scatter:excl',
             'queue': 'q2'}
        j2 = Job(TEST_USER, attrs=a)
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid2)
        self.scheduler.log_match(jid2 + ';Chunk: ' + chunk2, n=10000)
        job = self.server.status(JOB, 'exec_vnode', id=jid2)[0]
        ev = j2.get_vnodes(job['exec_vnode'])
        msg = 'Job running on nodes not associated with queue'
        self.assertIn(v1, ev, msg)
        self.assertIn(v2, ev, msg)

    @timeout(450)
    def test_booleans(self):
        """
        Test that booleans are correctly handled if not in the sched_config
        resources line.  This means that an unset boolean is considered false
        and that booleans that are True are considered even though they
        aren't on the resources line.
        """
        chunk1 = '2:ncpus=1'
        a = {'Resource_List.select': chunk1,
             'Resource_List.place': 'scatter:excl'}
        j1 = Job(TEST_USER, attrs=a)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.scheduler.log_match(jid1 + ';Chunk: ' + chunk1, n=10000)
        jst = self.server.status(JOB, 'exec_vnode', id=jid1)[0]
        ev = j1.get_vnodes(jst['exec_vnode'])
        for n in ev:
            self.server.expect(
                NODE, {'resources_available.bool': 'True'}, id=n)
        chunk2 = '2:ncpus=1:bool=False'
        a['Resource_List.select'] = chunk2
        j2 = Job(TEST_USER, attrs=a)
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid2)
        self.scheduler.log_match(jid2 + ';Chunk: ' + chunk2, n=10000)
        jst = self.server.status(JOB, 'exec_vnode', id=jid2)[0]
        ev = j2.get_vnodes(jst['exec_vnode'])
        for n in ev:
            self.server.expect(
                NODE, 'resources_available.bool', op=UNSET, id=n)

    @timeout(450)
    def test_last_pset_can_never_run(self):
        """
        Test that the job does not retain the error value of the last
        placement set seen by the node bucketing code.  To check this, make
        sure that the last placement set check results in a 'can never run'
        case because resources do not match, and check that the job is not
        marked as can never run.
        """
        self.server.manager(MGR_CMD_CREATE, RSC,
                            {'type': 'long', 'flag': 'nh'}, id='foo')
        self.server.manager(MGR_CMD_CREATE, RSC,
                            {'type': 'string', 'flag': 'h'}, id='bar')
        self.server.manager(MGR_CMD_SET, SERVER, {'node_group_key': 'bar'})
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'node_group_enable': 'true'})
        self.mom.delete_vnode_defs()
        a = {'resources_available.ncpus': 80,
             'resources_available.bar': 'large'}
        self.server.create_vnodes(name='vnode', attrib=a, num=8,
                                  mom=self.mom, sharednode=False)
        self.scheduler.add_resource('foo')
        a['resources_available.foo'] = 8
        a['resources_available.ncpus'] = 8
        a['resources_available.bar'] = 'small'
        for val in range(0, 5):
            vname = "vnode[" + str(val) + "]"
            self.server.manager(MGR_CMD_SET, NODE, a, id=vname)
        chunk1 = '4:ncpus=5:foo=5'
        a = {'Resource_List.select': chunk1,
             'Resource_List.place': 'scatter:excl'}
        j1 = Job(TEST_USER, attrs=a)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        j2 = Job(TEST_USER, attrs=a)
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid2)
        self.scheduler.log_match(jid2 + ';Job will never run',
                                 existence=False, max_attempts=10)