pbs_basil_support.py 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737
  1. # coding: utf-8
  2. # Copyright (C) 1994-2018 Altair Engineering, Inc.
  3. # For more information, contact Altair at www.altair.com.
  4. #
  5. # This file is part of the PBS Professional ("PBS Pro") software.
  6. #
  7. # Open Source License Information:
  8. #
  9. # PBS Pro is free software. You can redistribute it and/or modify it under the
  10. # terms of the GNU Affero General Public License as published by the Free
  11. # Software Foundation, either version 3 of the License, or (at your option) any
  12. # later version.
  13. #
  14. # PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
  15. # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  16. # FOR A PARTICULAR PURPOSE.
  17. # See the GNU Affero General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Affero General Public License
  20. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. #
  22. # Commercial License Information:
  23. #
  24. # For a copy of the commercial license terms and conditions,
  25. # go to: (http://www.pbspro.com/UserArea/agreement.html)
  26. # or contact the Altair Legal Department.
  27. #
  28. # Altair’s dual-license business model allows companies, individuals, and
  29. # organizations to create proprietary derivative works of PBS Pro and
  30. # distribute them - whether embedded or bundled with other software -
  31. # under a commercial license agreement.
  32. #
  33. # Use of Altair’s trademarks, including but not limited to "PBS™",
  34. # "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
  35. # trademark licensing policies.
  36. from tests.functional import *
  37. from string import Template
  38. import os
  39. import defusedxml.ElementTree as ET
  40. @tags('cray', 'mom')
  41. class TestBasilQuery(TestFunctional):
  42. """
    This test suite covers support for the BASIL 1.7/1.4 query. It tests
    that the query is made with the correct BASIL version, and that vnodes
    are created as per the query response.
  46. """
  47. basil_version = ['1.7', '1.4', '1.3']
  48. available_version = ""
  49. @staticmethod
  50. def init_inventory_node():
  51. node = {}
  52. node['vnode'] = ""
  53. node['arch'] = ""
  54. node['current_aoe'] = ""
  55. node['host'] = ""
  56. node['hbmem'] = ""
  57. node['mem'] = ""
  58. node['ncpus'] = ""
  59. node['PBScrayhost'] = ""
  60. node['PBScraynid'] = ""
  61. node['vntype'] = ""
  62. node['accelerator_memory'] = ""
  63. node['accelerator_model'] = ""
  64. node['naccelerators'] = ""
  65. return node
  66. def reset_nodes(self, hostA):
  67. # Remove all nodes
  68. self.server.manager(MGR_CMD_DELETE, NODE, None, "")
  69. # Restart PBS
  70. self.server.restart()
  71. # Create node
  72. self.server.manager(MGR_CMD_CREATE, NODE, None, hostA)
  73. # Wait for 3 seconds for changes to take effect
  74. time.sleep(3)
  75. def setUp(self):
  76. TestFunctional.setUp(self)
  77. self.server.manager(MGR_CMD_SET, PBS_HOOK,
  78. {'enabled': 'true', 'freq': 10},
  79. id='PBS_alps_inventory_check', expect=True)
  80. momA = self.moms.values()[0]
  81. if not momA.is_cray():
  82. self.skipTest("%s: not a cray mom." % (momA.shortname))
  83. mom_config = momA.parse_config()
  84. if '$alps_client' not in mom_config:
  85. self.skipTest("alps_client not set in mom config.")
  86. if '$vnode_per_numa_node' in mom_config:
  87. momA.unset_mom_config('$vnode_per_numa_node', False)
  88. momA.add_config({'$logevent': '0xffffffff'})
  89. # check if required BASIL version available on the machine.
  90. for ver in self.basil_version:
  91. xml_out = self.query_alps(ver, 'QUERY', 'ENGINE')
  92. xml_tree = ET.parse(xml_out)
  93. os.remove(xml_out)
  94. response = xml_tree.find(".//ResponseData")
  95. status = response.attrib['status']
  96. if status == "SUCCESS":
  97. self.available_version = ver
  98. break
  99. if self.available_version == "":
  100. self.skipTest("No supported basil version found on the platform.")
  101. # Reset nodes
  102. self.reset_nodes(momA.shortname)
  103. def query_alps(self, ver, method, qtype):
  104. """
  105. Send a query to ALPS of a certain type and return the xml output file.
  106. """
  107. basil_protocol = 'protocol="%s"' % (ver)
  108. basil_method = 'method="%s"' % (method)
  109. basil_qtype = 'type="%s"' % (qtype)
  110. queryt = Template('<BasilRequest $ver $method $qtype/>\n')
  111. query = queryt.substitute(ver=basil_protocol,
  112. method=basil_method, qtype=basil_qtype)
  113. mom_config = self.mom.parse_config()
  114. alps_client = mom_config['$alps_client']
  115. fn = self.du.create_temp_file(body=query)
  116. xout = self.du.create_temp_file()
  117. self.du.run_cmd(cmd="%s < %s > %s" % (alps_client, fn, xout),
  118. as_script=True)
  119. os.remove(fn)
  120. return xout
  121. def comp_node(self, vnode):
  122. """
  123. Check if compute node is found in pbsnodes -av output.
  124. If so check if the vnode attribute has the correct values.
  125. """
  126. name = vnode['vnode']
  127. try:
  128. pbs_node = self.server.status(NODE, id=name)[0]
  129. except PbsStatusError:
  130. self.assertFalse(pbs_node is None,
  131. "Cray compute node %s doesn't exist on pbs server"
  132. % (name))
  133. for rsc, xval in vnode.iteritems():
  134. if rsc != 'current_aoe':
  135. resource = 'resources_available.' + rsc
  136. else:
  137. resource = rsc
  138. if xval != "":
  139. if resource in pbs_node:
  140. rval = pbs_node[resource]
  141. if rval == xval:
  142. self.logger.info(
  143. "%s: node has %s=%s" % (name, rsc, rval))
  144. self.assertTrue(True)
  145. else:
  146. self.assertFalse("%s: node has %s=%s but XML %s=%s"
  147. % (name, resource, rval,
  148. rsc, xval))
  149. else:
  150. self.assertFalse(
  151. "%s\t: node has no resource %s" % (name, rsc))
  152. def get_knl_vnodes(self):
  153. xml_out = self.query_alps('1.7', 'QUERY', 'SYSTEM')
  154. tree = ET.parse(xml_out)
  155. os.remove(xml_out)
  156. root = tree.getroot()
  157. knl_vnodes = {}
  158. knl_info = {}
  159. # If node has the KNL processor then add them
  160. # to knl_vnodes dictionary
  161. for node in root.getiterator('Nodes'):
  162. # XML values
  163. role = node.attrib["role"]
  164. state = node.attrib["state"]
  165. numa_cfg = node.attrib["numa_cfg"]
  166. hbm_size_mb = node.attrib["hbm_size_mb"]
  167. hbm_cache_pct = node.attrib["hbm_cache_pct"]
  168. if role == 'batch' and state == 'up' and numa_cfg is not ""\
  169. and hbm_size_mb is not "" and hbm_cache_pct is not "":
  170. # derived values from XML
  171. knl_info['current_aoe'] = numa_cfg + '_' + hbm_cache_pct
  172. knl_info['hbmem'] = hbm_size_mb + 'mb'
  173. nid_ranges = node.text.strip()
  174. nid_range_list = list(nid_ranges.split(','))
  175. while len(nid_range_list) > 0:
  176. nid_range = nid_range_list.pop()
  177. nid1 = nid_range.split('-')
  178. if len(nid1) == 2:
  179. # range of nodes
  180. r1 = int(nid1[0])
  181. r2 = int(nid1[1]) + 1
  182. for node_id in range(r1, r2):
  183. # associate each nid with it's knl information
  184. knl_vnodes['%d' % node_id] = knl_info
  185. else:
  186. # single node
  187. node_id = int(nid1[0])
  188. knl_vnodes['%d' % node_id] = knl_info
  189. return knl_vnodes
  190. def retklist(self):
  191. """
  192. Return a list of KNL vnodes, empty list if there are no KNL vnodes.
  193. """
  194. klist = []
  195. # Find the list of KNL vnodes
  196. kvnl = self.server.filter(VNODE, {'current_aoe': (NE, "")})
  197. if len(kvnl) == 0:
  198. self.skipTest(reason='No KNL vnodes present')
  199. else:
  200. klist = kvnl.values()[0]
  201. self.logger.info("KNL vnode list: %s" % (klist))
  202. return klist
  203. def set_provisioning(self):
  204. """
  205. Set provisioning enabled and aoe resource on Xeon Phi nodes.
  206. """
  207. # Check for provisioning setup
  208. momA = self.moms.values()[0].shortname
  209. serverA = self.servers.values()[0].shortname
  210. msg = ("Provide a mom not present on server host while invoking"
  211. " the test: -p moms=<m1>")
  212. if momA == serverA:
  213. self.skipTest(reason=msg)
  214. nodelist = self.server.status(NODE, 'current_aoe')
  215. for node in nodelist:
  216. a = {'provision_enable': 'true',
  217. 'resources_available.aoe': '%s' % node['current_aoe']}
  218. self.server.manager(MGR_CMD_SET, NODE,
  219. a, id=node['id'], expect=True)
  220. def unset_provisioning(self):
  221. """
  222. Unset provisioning attribute and aoe resource on Xeon Phi nodes.
  223. """
  224. nodelist = self.server.status(NODE, 'current_aoe')
  225. for node in nodelist:
  226. a = ['provision_enable',
  227. 'resources_available.aoe']
  228. self.server.manager(MGR_CMD_UNSET, NODE,
  229. a, id=node['id'], expect=True)
  230. def request_current_aoe(self):
  231. """
  232. Get the value of current_aoe set on the XeonPhi vnodes
  233. """
  234. aoe_val = self.server.status(NODE, 'current_aoe')
  235. req_aoe = aoe_val[0]['current_aoe']
  236. return req_aoe
  237. def test_InventoryQueryVersion(self):
  238. """
  239. Test if BASIL version is set to required BASIL version
  240. on cray/simulator platform.
  241. """
  242. self.mom.signal('-HUP')
  243. engine_query_log = "<BasilRequest protocol=\"%s\" method=\"QUERY\" \
  244. type=\"ENGINE\"/>" % (self.basil_version[1])
  245. self.mom.log_match(engine_query_log, n='ALL', max_attempts=3)
  246. if self.available_version == '1.7':
  247. msg = 'This Cray system supports the BASIL 1.7 protocol'
  248. self.mom.log_match(msg, n='ALL', max_attempts=3)
  249. basil_version_log = 'alps_engine_query;The basilversion is' \
  250. ' set to 1.4'
  251. else:
  252. basil_version_log = 'alps_engine_query;The basilversion is' \
  253. ' set to ' + self.available_version
  254. self.mom.log_match(basil_version_log, max_attempts=3)
  255. def test_InventoryVnodes(self):
  256. """
  257. This test validates the vnode created using alps BASIL 1.4 & 1.7
  258. inventory query response.
  259. """
  260. knl_vnodes = {}
  261. # Parse inventory query response and fetch node information.
  262. xml_out = self.query_alps('1.4', 'QUERY', 'INVENTORY')
  263. xml_tree = ET.parse(xml_out)
  264. os.remove(xml_out)
  265. inventory_1_4_el = xml_tree.find(".//Inventory")
  266. hn = inventory_1_4_el.attrib["mpp_host"]
  267. if self.available_version == '1.7':
  268. knl_vnodes = self.get_knl_vnodes()
  269. # Fill vnode structure using BASIL response
  270. for node in inventory_1_4_el.getiterator('Node'):
  271. role = node.attrib["role"]
  272. if role == 'BATCH':
  273. # XML values
  274. node_id = node.attrib["node_id"]
  275. cu_el = node.findall('.//ComputeUnit')
  276. mem_el = node.findall('.//Memory')
  277. ac_el = node.findall('.//Accelerator')
  278. page_size_kb = mem_el[0].attrib["page_size_kb"]
  279. page_count = mem_el[0].attrib["page_count"]
  280. vnode = self.init_inventory_node()
  281. vnode['arch'] = node.attrib['architecture']
  282. vnode['vnode'] = hn + '_' + node_id
  283. vnode['vntype'] = "cray_compute"
  284. vnode['mem'] = str(int(page_size_kb) *
  285. int(page_count) * len(mem_el)) + "kb"
  286. vnode['host'] = vnode['vnode']
  287. vnode['PBScraynid'] = node_id
  288. vnode['PBScrayhost'] = hn
  289. vnode['ncpus'] = str(len(cu_el))
  290. if ac_el:
  291. vnode['naccelerators'] = str(len(ac_el))
  292. vnode['accelerator_memory'] = str(
  293. ac_el[0].attrib['memory_mb']) + "mb"
  294. vnode['accelerator_model'] = ac_el[0].attrib['family']
  295. if node_id in knl_vnodes:
  296. vnode['hbmem'] = knl_vnodes[node_id]['hbmem']
  297. vnode['current_aoe'] = knl_vnodes[node_id]['current_aoe']
  298. vnode['vnode'] = hn + '_' + node_id
  299. # Compare xml vnode with pbs node.
  300. self.logger.info("Validating vnode:%s" % (vnode['vnode']))
  301. self.comp_node(vnode)
  302. def test_cray_login_node(self):
  303. """
  304. This test validates that cray mom node resources value remain
  305. unchanged before and after adding $alps_client in mom config.
  306. """
  307. mom_id = self.mom.shortname
  308. try:
  309. cray_login_node = self.server.status(NODE, id=mom_id)[0]
  310. self.mom.unset_mom_config('$alps_client', False)
  311. self.reset_nodes(mom_id)
  312. pbs_node = self.server.status(NODE, id=mom_id)[0]
  313. except PbsStatusError:
  314. self.assertFalse(True,
  315. "Mom node %s doesn't exist on pbs server"
  316. % (mom_id))
  317. # List of resources to be ignored while comparing.
  318. ignr_rsc = ['license', 'last_state_change_time']
  319. for rsc, val in pbs_node.iteritems():
  320. if rsc in ignr_rsc:
  321. continue
  322. self.assertTrue(rsc in cray_login_node,
  323. ("%s\t: login node has no rsc %s") %
  324. (mom_id, rsc))
  325. rval = cray_login_node[rsc]
  326. self.assertEqual(rval, val,
  327. ("%s\t: pbs node has %s=%s but login "
  328. "node has %s=%s") %
  329. (mom_id, rsc, val, rsc, rval))
  330. def test_hbmemm_rsc(self):
  331. """
  332. Create a job that requests enough HBMEM. Submit the job to
  333. the Server. Check if the job is in the 'R' state and if the
  334. job runs on a KNL vnode. Delete the job.
  335. """
  336. knl_vnodes = self.get_knl_vnodes()
  337. if len(knl_vnodes) == 0:
  338. self.skipTest(reason='No KNL vnodes present')
  339. else:
  340. self.logger.info("KNL vnode list: %s" % (knl_vnodes))
  341. hbm_req = 4192
  342. a = {'Resource_List.select': '1:hbmem=%dmb' % hbm_req}
  343. job = Job(TEST_USER, attrs=a)
  344. job_id = self.server.submit(job)
  345. self.server.expect(JOB, {'job_state': 'R'}, id=job_id)
  346. # Check that exec_vnode is a KNL vnode.`
  347. self.server.status(JOB, 'exec_vnode', id=job_id)
  348. evnode = job.execvnode()[0].keys()[0]
  349. nid = evnode.split('_')[1]
  350. if nid in knl_vnodes.keys():
  351. self.logger.info("exec_vnode %s is a KNL vnode." % (evnode))
  352. rv = 1
  353. else:
  354. self.logger.info("exec_vnode %s is not a KNL vnode." % (evnode))
  355. rv = 0
  356. self.assertTrue(rv == 1)
  357. nodes = self.server.status(NODE)
  358. for n in nodes:
  359. v_name = n['id']
  360. if v_name == evnode:
  361. hbm_assig = n['resources_assigned.hbmem']
  362. hbm_int = int(re.search(r'\d+', hbm_assig).group())
  363. hbm_in_kb = hbm_req * 1024
  364. self.logger.info(
  365. "vnode name=%s -- hbm assigned=%s -- hbm requested=%dkb"
  366. % (v_name, hbm_assig, hbm_in_kb))
  367. if hbm_int == hbm_in_kb:
  368. self.logger.info(
  369. "The requested hbmem of %s mb has been assigned." %
  370. (str(hbm_req)))
  371. self.assertTrue(True)
  372. else:
  373. self.logger.info(
  374. "The assigned hbmem of %s, on %s, does not match "
  375. "requested hbmem of %d mb" %
  376. (hbm_assig, v_name, hbm_req))
  377. self.assertTrue(False)
  378. def test_job_request_insufficent_hbmemm_rsc(self):
  379. """
  380. Submit a job request that requests more than available HBMEM.
  381. Check if the job is in the 'Q' state with valid comment.
  382. Delete the job
  383. """
  384. # Find the list of KNL vnodes
  385. knl_vnodes = self.get_knl_vnodes()
  386. if len(knl_vnodes) == 0:
  387. self.skipTest(reason='No KNL vnodes present')
  388. else:
  389. self.logger.info("KNL vnode list: %s" % (knl_vnodes))
  390. hbm_req = 18000
  391. a = {'Resource_List.select': '1:hbmem=%dmb' % hbm_req}
  392. job = Job(TEST_USER, attrs=a)
  393. job_id = self.server.submit(job)
  394. # Check that job is in Q state with valid comment
  395. job_comment = "Not Running: Insufficient amount of resource: hbmem"
  396. self.server.expect(JOB, {'job_state': 'Q', 'comment':
  397. (MATCH_RE, job_comment)}, attrop=PTL_AND,
  398. id=job_id)
  399. def test_job_request_knl(self):
  400. """
  401. Create a job that requests aoe should run on a KNL vnode.
  402. Submit the job to the Server. Check if the job runs on a KNL vnode
  403. and if the job is in the 'R' state.
  404. """
  405. if self.du.platform == 'craysim':
  406. self.skipTest(reason='Test is not applicable for Craysim')
  407. # Find the list of KNL vnodes
  408. klist = self.retklist()
  409. # Set provisioning attributes on KNL vnode.
  410. self.set_provisioning()
  411. # Submit job that request aoe
  412. req_aoe = self.request_current_aoe()
  413. job = Job(TEST_USER)
  414. job.create_script(
  415. "#PBS -joe -o localhost:/tmp -lselect=1:ncpus=1:aoe=%s\n"
  416. % req_aoe +
  417. " cd /tmp\n"
  418. "aprun -B sleep 10\n"
  419. "sleep 10")
  420. job_id = self.server.submit(job)
  421. self.server.expect(JOB, {'job_state': 'R'}, id=job_id)
  422. # Check that exec_vnode is a KNL vnode.
  423. self.server.status(JOB, 'exec_vnode', id=job_id)
  424. evnode = job.get_vnodes()[0]
  425. self.assertIn(evnode, klist, "exec_vnode %s is not a KNL vnode."
  426. % (evnode))
  427. self.logger.info("exec_vnode %s is a KNL vnode." % (evnode))
  428. # Unset provisioning attributes.
  429. self.unset_provisioning()
  430. def test_job_request_subchunk(self):
  431. """
  432. Test job request consist of subchunks with and without aoe resource.
  433. """
  434. if self.du.platform == 'craysim':
  435. self.skipTest(reason='Test is not applicable for craysim')
  436. # Find the list of KNL vnodes
  437. klist = self.retklist()
  438. # Set provisioning attributes.
  439. self.set_provisioning()
  440. # Submit job that request sub-chunk with and without aoe resources
  441. req_aoe = self.request_current_aoe()
  442. job = Job(TEST_USER)
  443. job.create_script(
  444. "#PBS -joe -o localhost:/tmp -lplace=scatter "
  445. "-lselect=1:ncpus=1:aoe=%s+1:ncpus=1\n" % req_aoe +
  446. " cd /tmp\n"
  447. "aprun -B sleep 10\n"
  448. "sleep 10")
  449. job_id = self.server.submit(job)
  450. self.server.expect(JOB, {'job_state': 'R'}, id=job_id)
  451. # Check that exec_vnode is a KNL vnode.
  452. self.server.status(JOB, 'exec_vnode', id=job_id)
  453. evnode = job.get_vnodes()
  454. self.assertIn(evnode[0], klist, "exec_vnode %s is not a KNL vnode."
  455. % (evnode[0]))
  456. self.logger.info("exec_vnode %s is a KNL vnode." % (evnode[0]))
  457. self.assertNotIn(evnode[1], klist, "exec_vnode %s is a KNL"
  458. " vnode." % (evnode[1]))
  459. self.logger.info("exec_vnode %s is not a KNL vnode." % (evnode[1]))
  460. # Unset provisioning attributes.
  461. self.unset_provisioning()
  462. def test_pbs_alps_in_sync(self):
  463. """
  464. Check for the presence of message indicating PBS and ALPS are
  465. in sync.
  466. """
  467. # Determine if BASIL 1.7 is supported.
  468. try:
  469. rv = self.mom.log_match(
  470. "This Cray system supports the BASIL 1.7 protocol.",
  471. n='ALL', max_attempts=10)
  472. except PtlLogMatchError:
  473. self.skipTest(
  474. reason='Test not applicable for system not having BASIL 1.7')
  475. # Determine if KNL vnodes are present.
  476. knl_vnodes = self.get_knl_vnodes()
  477. if len(knl_vnodes) == 0:
  478. self.skipTest(reason='No KNL vnodes present')
  479. else:
  480. self.logger.info("KNL vnode list: %s" % (knl_vnodes))
  481. # Check for PBS ALPS Inventory Hook message.
  482. now = int(time.time())
  483. rv = self.mom.log_match("ALPS Inventory Check: PBS and ALPS"
  484. " are in sync",
  485. starttime=now, interval=5)
  486. self.assertTrue(rv)
  487. def test_knl_batch_to_interactive(self):
  488. """
  489. Change the mode of any two KNL nodes to interactive. Then check if the
  490. PBS_alps_inventory_check hook picks up on the change and nodes are
  491. marked as stale. Restore changes to hook and mode of KNL nodes.
  492. """
  493. if self.du.platform == 'craysim':
  494. self.skipTest(reason='xtprocadmin cmd is not on cray simulator')
  495. # Find the list of KNL vnodes
  496. klist = self.retklist()
  497. # Change mode of two KNL nodes to interactive
  498. if len(klist) >= 2:
  499. k1 = klist[0]
  500. k2 = klist[len(klist) - 1]
  501. knl1 = re.search(r'\d+', k1).group()
  502. knl2 = re.search(r'\d+', k2).group()
  503. cmd = ['xtprocadmin', '-k', 'm', 'interactive', '-n', knl1]
  504. ret = self.server.du.run_cmd(self.server.hostname,
  505. cmd, logerr=True)
  506. self.assertEqual(ret['rc'], 0)
  507. cmd = ['xtprocadmin', '-k', 'm', 'interactive', '-n', knl2]
  508. ret = self.server.du.run_cmd(self.server.hostname,
  509. cmd, logerr=True)
  510. self.assertEqual(ret['rc'], 0)
  511. # Do Mom HUP
  512. self.mom.signal('-HUP')
  513. # Check that the nodes are now stale.
  514. self.server.expect(VNODE, {'state': 'Stale'}, id=k1,
  515. max_attempts=10, interval=5)
  516. self.server.expect(VNODE, {'state': 'Stale'}, id=k2)
  517. # Change nodes back to batch mode
  518. cmd = ['xtprocadmin', '-k', 'm', 'batch']
  519. ret = self.server.du.run_cmd(self.server.hostname,
  520. cmd, logerr=True)
  521. self.assertEqual(ret['rc'], 0)
  522. # Do Mom HUP
  523. self.mom.signal('-HUP')
  524. # Check that the nodes are now free.
  525. self.server.expect(VNODE, {'state': 'free'}, id=k1,
  526. max_attempts=10, interval=5)
  527. self.server.expect(VNODE, {'state': 'free'}, id=k2)
  528. def test_job_run_on_knl_node(self):
  529. """
  530. Change the mode of KNL nodes to batch.
  531. Then check if the PBS_alps_inventory_check hook picks up on the change.
  532. Submit job and confirm job should be in R state
  533. """
  534. if self.du.platform == 'craysim':
  535. self.skipTest(reason='xtprocadmin cmd is not on cray simulator')
  536. # Find the list of KNL vnodes
  537. klist = self.retklist()
  538. # Change mode of all nodes to interactive
  539. cmd = ['xtprocadmin', '-k', 'm', 'interactive']
  540. ret = self.server.du.run_cmd(self.server.hostname,
  541. cmd, logerr=True)
  542. self.assertEqual(ret['rc'], 0)
  543. # Change mode of two KNL nodes to batch
  544. if len(klist) >= 2:
  545. k1 = klist[0]
  546. k2 = klist[len(klist) - 1]
  547. knl1 = re.search(r'\d+', k1).group()
  548. knl2 = re.search(r'\d+', k2).group()
  549. cmd = ['xtprocadmin', '-k', 'm', 'batch', '-n', knl1]
  550. ret = self.server.du.run_cmd(self.server.hostname, cmd, logerr=True)
  551. self.assertEqual(ret['rc'], 0)
  552. cmd = ['xtprocadmin', '-k', 'm', 'batch', '-n', knl2]
  553. ret = self.server.du.run_cmd(self.server.hostname, cmd, logerr=True)
  554. self.assertEqual(ret['rc'], 0)
  555. # Do Mom HUP
  556. self.mom.signal('-HUP')
  557. # Check that the nodes are Free.
  558. self.server.expect(VNODE, {'state': 'free'}, id=k1, max_attempts=10,
  559. interval=5)
  560. self.server.expect(VNODE, {'state': 'free'}, id=k2)
  561. # Submit few jobs
  562. a = {'Resource_List.select': '1:vntype=cray_compute'}
  563. job = Job(TEST_USER, attrs=a)
  564. job_id = self.server.submit(job)
  565. self.server.expect(JOB, {'job_state': 'R'}, id=job_id)
  566. # Check that exec_vnode is a KNL vnode.
  567. self.server.status(JOB, 'exec_vnode', id=job_id)
  568. evnode = job.get_vnodes()[0]
  569. self.assertIn(evnode, klist, "exec_vnode %s is not a KNL vnode."
  570. % (evnode))
  571. self.logger.info("exec_vnode %s is a KNL vnode." % (evnode))
  572. job2 = Job(TEST_USER, attrs=a)
  573. job_id2 = self.server.submit(job2)
  574. self.server.expect(JOB, {'job_state': 'R'}, id=job_id2)
  575. # Check that exec_vnode is a KNL vnode.
  576. self.server.status(JOB, 'exec_vnode', id=job_id2)
  577. evnode = job2.get_vnodes()[0]
  578. self.assertIn(evnode, klist, "exec_vnode %s is not a KNL vnode."
  579. % (evnode))
  580. self.logger.info("exec_vnode %s is a KNL vnode." % (evnode))
  581. job3 = Job(TEST_USER, attrs=a)
  582. job_id3 = self.server.submit(job3)
  583. self.server.expect(JOB, {'job_state': 'Q'}, id=job_id3)
  584. # Delete the Job1.
  585. self.server.delete(job_id, wait=True)
  586. # Verify Job3 should start running
  587. self.server.expect(JOB, {'job_state': 'R'}, id=job_id3)
  588. # Check that exec_vnode is a KNL vnode.
  589. self.server.status(JOB, 'exec_vnode', id=job_id3)
  590. evnode = job3.get_vnodes()[0]
  591. self.assertIn(evnode, klist, "exec_vnode %s is not a KNL vnode."
  592. % (evnode))
  593. self.logger.info("exec_vnode %s is a KNL vnode." % (evnode))
  594. def test_validate_pbs_xeon_phi_provision_hook(self):
  595. """
  596. Verify the default attribute of pbs_hook PBS_xeon_phi_provision hook.
  597. """
  598. if self.du.platform != 'cray':
  599. self.skipTest(reason='pbs_hook PBS_xeon_phi_provision is not'
  600. ' available on non-cray machine')
  601. attr = {'type': 'pbs', 'enabled': 'false', 'event': 'provision',
  602. 'alarm': 1800, 'order': 1, 'debug': 'false',
  603. 'user': 'pbsadmin', 'fail_action': 'none'}
  604. self.server.manager(MGR_CMD_LIST, PBS_HOOK,
  605. attr, id='PBS_xeon_phi_provision', expect=True)
  606. self.server.manager(MGR_CMD_SET, PBS_HOOK, {'enabled': 'true',
  607. 'alarm': 1000},
  608. id='PBS_xeon_phi_provision',
  609. expect=True)
  610. self.server.manager(MGR_CMD_LIST, PBS_HOOK, {'enabled': 'true',
  611. 'alarm': 1000},
  612. id='PBS_xeon_phi_provision',
  613. expect=True)
  614. # Reset pbs_hook value to default PBS_xeon_phi_provision hook
  615. self.server.manager(MGR_CMD_SET, PBS_HOOK, {'enabled': 'false',
  616. 'alarm': 1800},
  617. id='PBS_xeon_phi_provision',
  618. expect=True)
  619. self.server.manager(MGR_CMD_LIST, PBS_HOOK,
  620. attr, id='PBS_xeon_phi_provision',
  621. expect=True)
  622. def tearDown(self):
  623. TestFunctional.tearDown(self)
  624. if self.du.platform == 'cray':
  625. # Change all nodes back to batch mode and restart PBS
  626. cmd = ['xtprocadmin', '-k', 'm', 'batch']
  627. self.logger.info(cmd)
  628. ret = self.server.du.run_cmd(self.server.hostname,
  629. cmd, logerr=True)
  630. self.assertEqual(ret['rc'], 0)
  631. # Restore hook freq to 300
  632. self.server.manager(MGR_CMD_SET, PBS_HOOK,
  633. {'enabled': 'true', 'freq': 300},
  634. id='PBS_alps_inventory_check', expect=True)
  635. # Do Mom HUP
  636. self.mom.signal('-HUP')