# coding: utf-8
# Copyright (C) 1994-2018 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.
from tests.functional import *
import json
import os
import time
from subprocess import Popen, PIPE


class Test_power_provisioning_cray(TestFunctional):
    """
    Test power provisioning feature for the CRAY platform.
    """

    def setUp(self):
        """
        Use the MoMs that are already set up, or define the ones passed in.
        """
        TestFunctional.setUp(self)
        platform = self.du.get_platform()
        if platform != 'cray':
            self.skipTest("%s: not a cray" % platform)
        self.mom.add_config({"logevent": "0xfffffff"})
        a = {'log_events': '2047'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.nids = []
        self.names = []
        for n in self.server.status(NODE):
            if 'resources_available.PBScraynid' in n:
                self.names.append(n['id'])
                craynid = n['resources_available.PBScraynid']
                self.nids.append(craynid)
        self.enable_power()  # enable hooks

    def modify_hook_config(self, attrs, hook_id):
        """
        Modify the hook config file contents.
        """
        conf_file = str(hook_id) + '.CF'
        conf_file_path = os.path.join(self.server.pbs_conf['PBS_HOME'],
                                      'server_priv', 'hooks', conf_file)
        with open(conf_file_path) as data_file:
            data = json.load(data_file)
        for key, value in attrs.items():
            data[key] = value
        with open(conf_file_path, 'w') as fp:
            json.dump(data, fp)
        a = {'enabled': 'True'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id=hook_id, sudo=True)

    def setup_cray_eoe(self):
        """
        Set up an eoe list for all the nodes.
        Get possible pcap values using the capmc command.
        """
        for n in self.server.status(NODE):
            if 'resources_available.PBScraynid' in n:
                self.server.manager(MGR_CMD_SET, NODE,
                                    {"power_provisioning": True}, n['id'])
        # Divide the nodes into three parts and assign each part a different
        # power profile; jobs are later submitted with chunk counts matching
        # the number of nodes set to each profile.
        self.npp = len(self.names) // 3
        for i in range(len(self.names)):
            if i in range(0, self.npp):
                self.server.manager(MGR_CMD_SET, NODE,
                                    {"resources_available.eoe": 'low'},
                                    self.names[i])
            if i in range(self.npp, self.npp * 2):
                self.server.manager(MGR_CMD_SET, NODE,
                                    {"resources_available.eoe": 'med'},
                                    self.names[i])
            if i in range(self.npp * 2, self.npp * 3):
                self.server.manager(MGR_CMD_SET, NODE,
                                    {"resources_available.eoe": 'high'},
                                    self.names[i])
        # Query the power cap capabilities of all the nids via capmc
        cmd = "/opt/cray/capmc/default/bin/capmc " \
              "get_power_cap_capabilities --nids " + ','.join(self.nids)
        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
        (o, e) = p.communicate()
        out = json.loads(o)
        low = 0
        med = 0
        high = 0
        rv = 'groups' in out
        msg = "Error while creating hook content from capmc output: " + cmd
        self.assertTrue(rv, msg)
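        # Illustrative (assumed) shape of the capmc JSON parsed below:
        #   {"groups": [{"controls": [{"name": "node",
        #                              "min": <watts>, "max": <watts>}],
        #                "nids": [<nid>, ...]}, ...]}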
        for group in out['groups']:
            for control in group['controls']:
                if control['name'] == 'node':
                    min_cap = control['min']
                    max_cap = control['max']
            pcap_list = {}
            for nid in group['nids']:
                pcap_list[nid] = {}
                pcap_list[nid]['min'] = min_cap
                pcap_list[nid]['max'] = max_cap
                if low == 0 or low < min_cap:
                    low = min_cap
                if high == 0 or high > max_cap:
                    high = max_cap
        # Compute med as the mean of low and high
        med = (low + high) // 2
        # Now create the map_eoe hook file
        hook_content = """
import pbs
e = pbs.event()
j = e.job
profile = j.Resource_List['eoe']
if profile is None:
    res = j.Resource_List['select']
    if res is not None:
        for s in str(res).split('+')[0].split(':'):
            if s[:4] == 'eoe=':
                profile = s.partition('=')[2]
                break
pbs.logmsg(pbs.LOG_DEBUG, "got profile '%s'" % str(profile))
if profile == "low":
    j.Resource_List["pcap_node"] = LOW_PCAP
    pbs.logmsg(pbs.LOG_DEBUG, "set low")
elif profile == "med":
    j.Resource_List["pcap_node"] = MED_PCAP
    pbs.logmsg(pbs.LOG_DEBUG, "set med")
elif profile == "high":
    j.Resource_List["pcap_node"] = HIGH_PCAP
    pbs.logmsg(pbs.LOG_DEBUG, "set high")
else:
    pbs.logmsg(pbs.LOG_DEBUG, "unhandled profile '%s'" % str(profile))
e.accept()
"""
        hook_content = hook_content.replace('LOW_PCAP', str(low))
        hook_content = hook_content.replace('MED_PCAP', str(med))
        hook_content = hook_content.replace('HIGH_PCAP', str(high))
        hook_name = "map_eoe"
        a = {'event': 'queuejob', 'enabled': 'true'}
        rv = self.server.create_import_hook(hook_name, a, hook_content)
        msg = "Error while creating and importing hook contents"
        self.assertTrue(rv, msg)
        msg = "Hook %s created and " % hook_name
        msg += "hook script is imported successfully"
        self.logger.info(msg)

    def enable_power(self):
        """
        Enable power_provisioning on the server.
        """
        a = {'enabled': 'True'}
        hook_name = "PBS_power"
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id=hook_name,
                            sudo=True)
        # check that the hook becomes active
        nodes = self.server.status(NODE)
        n = nodes[0]
        host = n['Mom']
        self.assertTrue(host is not None)
        mom = self.moms[host]
        mom.log_match(
            "Hook;PBS_power.HK;copy hook-related file request received",
            starttime=self.server.ctime)

    def disable_power(self):
        """
        Disable power_provisioning on the server.
        """
        a = {'enabled': 'False'}
        hook_name = "PBS_power"
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id=hook_name,
                            sudo=True)

    def submit_job(self, secs=10, a=None):
        """
        secs: sleep time for the job
        a: any job attributes
        """
        if a is None:
            a = {}
        a['Keep_Files'] = 'oe'
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(secs)
        self.logger.info(str(j))
        jid = self.server.submit(j)
        self.job = j
        return jid

    def energy_check(self, jid):
        s = self.server.accounting_match("E;%s;.*" % jid,
                                         regexp=True)
        self.assertTrue(s is not None)
        # got the accounting record, hack it apart
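        # Assumed layout of the matched end-of-job ('E') record:
        #   <timestamp>;E;<jobid>;<attr>=<val> <attr>=<val> ...
        # so the fourth ';'-separated field is the space-separated resource
        # list that may contain resources_used.energy.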
        for resc in s[1].split(';')[3].split():
            if resc.partition('=')[0] == "resources_used.energy":
                return True
        return False

    def mom_logcheck(self, msg, jid=None):
        mom = self.moms[self.host]  # top mom
        if jid is not None:
            mom.log_match(msg % jid,
                          regexp=True, starttime=self.server.ctime,
                          max_attempts=10)
        else:
            mom.log_match(msg,
                          regexp=True, starttime=self.server.ctime,
                          max_attempts=10)

    def eoe_check(self, jid, eoe, secs):
        # check that the job is running and that the vnodes have
        # current_eoe set
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        qstat = self.server.status(JOB, id=jid)
        nodes = self.job.get_vnodes(self.job.exec_vnode)
        for vname in nodes:
            self.server.expect(VNODE, {'current_eoe': eoe}, id=vname)
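        # After the job's sleep time (secs) has elapsed, the job is expected
        # to have finished and disappeared from the queue; the MoM log can
        # then be checked for the Cray power-related messages.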
        self.server.expect(JOB, 'queue', op=UNSET, id=jid, offset=secs)
        self.host = qstat[0]['exec_host'].partition('/')[0]
        # check for the appropriate log messages for cray
        self.mom_logcheck("capmc get_node_energy_counter --nids")
        self.mom_logcheck(";Job;%s;energy usage", jid)
        self.mom_logcheck(";Job;%s;Cray: pcap node", jid)
        self.mom_logcheck("capmc set_power_cap --nids")
        self.mom_logcheck(";Job;%s;PMI: reset current_eoe", jid)
        self.mom_logcheck(";Job;%s;Cray: remove pcap node", jid)
        for vname in nodes:
            self.server.expect(VNODE, {'current_eoe': eoe},
                               id=vname, op=UNSET)

    def eoe_job(self, num, eoe):
        """
        Helper function to submit a job with an eoe value.
        Parameters:
        num: number of chunks
        eoe: profile name
        """
        secs = 10
        jid = self.submit_job(secs,
                              {'Resource_List.place': 'scatter',
                               'Resource_List.select': '%d:eoe=%s' % (num,
                                                                      eoe)})
        self.eoe_check(jid, eoe, secs)
        return jid

    def cleanup_power_on(self):
        """
        Clean up by powering all the nodes back on.
        """
        capmc_cmd = os.path.join(
            os.sep, 'opt', 'cray', 'capmc', 'default', 'bin', 'capmc')
        self.du.run_cmd(self.server.hostname, [
            capmc_cmd, 'node_on', '--nids',
            ','.join(self.nids)], sudo=True)
        self.logger.info("Waiting for 15 mins to power on all the nodes")
        time.sleep(900)

    def cleanup_power_ramp_rate(self):
        """
        Clean up by ramping all the nodes back up.
        """
        for nid in self.nids:
            capmc_cmd = os.path.join(
                os.sep, 'opt', 'cray', 'capmc', 'default', 'bin', 'capmc')
            self.du.run_cmd(self.server.hostname, [
                capmc_cmd, 'set_sleep_state_limit', '--nids',
                str(nid), '--limit', '1'], sudo=True)
            self.logger.info("Ramping up the node with nid " + str(nid))

    def setup_power_ramp_rate(self):
        """
        Offline the nodes that do not have sleep state capabilities.
        """
        self.offnodes = 0
        for n in self.server.status(NODE):
            if 'resources_available.PBScraynid' in n:
                nid = n['resources_available.PBScraynid']
                cmd = os.path.join(os.sep, 'opt', 'cray',
                                   'capmc', 'default', 'bin', 'capmc')
                ret = self.du.run_cmd(self.server.hostname,
                                      [cmd,
                                       'get_sleep_state_limit_capabilities',
                                       '--nids', str(nid)], sudo=True)
                try:
                    out = json.loads(ret['out'][0])
                except Exception:
                    out = None
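                # capmc is assumed to report an error such as
                # {"e": 52, "err_msg": "Invalid exchange"} for nodes without
                # sleep state support; those nodes are offlined below.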
                if out is not None:
                    errno = out["e"]
                    msg = out["err_msg"]
                    if errno == 52 and msg == "Invalid exchange":
                        self.offnodes = self.offnodes + 1
                        a = {'state': 'offline'}
                        self.server.manager(MGR_CMD_SET, NODE, a, id=n['id'])

    @timeout(700)
    def test_cray_eoe_job(self):
        """
        Submit jobs with an eoe value and check that messages are logged
        indicating PMI activity, and that current_eoe and
        resources_used.energy get set.
        """
        self.setup_cray_eoe()
        eoes = ['low', 'med', 'high']
        for profile in eoes:
            jid = self.eoe_job(self.npp, profile)
            self.assertTrue(self.energy_check(jid),
                            "resources_used.energy not set for %s" % jid)

    @timeout(700)
    def test_cray_request_more_eoe(self):
        """
        Submit a job requesting one more eoe chunk than is available and
        verify the job comment.
        """
        self.setup_cray_eoe()
        x = self.npp + 1
        jid = self.submit_job(10,
                              {'Resource_List.place': 'scatter',
                               'Resource_List.select': '%d:eoe=%s' % (x,
                                                                      'high')})
        self.server.expect(JOB, {
            'job_state': 'Q',
            'comment': 'Not Running: No available resources on nodes'},
            id=jid)

    @timeout(700)
    def test_cray_eoe_job_multiple_eoe(self):
        """
        Submit a job requesting multiple eoe values; the job should be
        rejected by qsub.
        """
        self.setup_cray_eoe()
        a = {'Resource_List.place': 'scatter',
             'Resource_List.select': '10:eoe=low+10:eoe=high'}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(10)
        jid = None
        try:
            jid = self.server.submit(j)
        except PbsSubmitError as e:
            self.assertTrue(
                'Invalid provisioning request in chunk' in e.msg[0])
        self.assertFalse(jid)

    @timeout(700)
    def test_cray_server_prov_off(self):
        """
        Submit jobs requesting eoe when power provisioning is disabled on the
        server and verify that the jobs won't run.
        """
        self.setup_cray_eoe()
        eoes = ['low', 'med', 'high']
        a = {'enabled': 'False'}
        hook_name = "PBS_power"
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id=hook_name,
                            sudo=True)
        self.server.expect(SERVER, {'power_provisioning': 'False'})
        for profile in eoes:
            jid = self.submit_job(10,
                                  {'Resource_List.place': 'scatter',
                                   'Resource_List.select': '%d:eoe=%s'
                                   % (self.npp, profile)})
            self.server.expect(JOB, {
                'job_state': 'Q',
                'comment': 'Not Running: No available resources on nodes'},
                id=jid)

    @timeout(700)
    def test_cray_node_prov_off(self):
        """
        Submit jobs requesting eoe and verify that the jobs won't run on
        nodes where power provisioning is set to false.
        """
        self.setup_cray_eoe()
        eoes = ['med', 'high']
        # turn power_provisioning off on the nodes whose eoe is set to 'low'
        for i in range(0, self.npp):
            a = {'power_provisioning': 'False'}
            self.server.manager(MGR_CMD_SET, NODE, a, id=self.names[i])
        for profile in eoes:
            jid = self.submit_job(10,
                                  {'Resource_List.place': 'scatter',
                                   'Resource_List.select': '%d:eoe=%s'
                                   % (self.npp, profile)})
            self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        jid_low = self.submit_job(10,
                                  {'Resource_List.place': 'scatter',
                                   'Resource_List.select': '%d:eoe=%s'
                                   % (self.npp, 'low')})
        exp_comm = 'Not Running: Insufficient amount of resource: '
        exp_comm += 'vntype (cray_compute != cray_login)'
        self.server.expect(JOB, {
            'job_state': 'Q',
            'comment': exp_comm}, attrop=PTL_AND, id=jid_low)

    @timeout(700)
    def test_cray_job_preemption(self):
        """
        Submit a job to a high priority queue and verify
        that the running job is preempted by requeueing.
        """
        self.setup_cray_eoe()
        self.server.manager(MGR_CMD_CREATE, QUEUE,
                            {'queue_type': 'execution', 'started': 'True',
                             'enabled': 'True', 'priority': 150}, id='workq2')
        jid = self.submit_job(10,
                              {'Resource_List.place': 'scatter',
                               'Resource_List.select': '%d:eoe=%s'
                               % (self.npp, 'low')})
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        t = int(time.time())
        jid_hp = self.submit_job(10, {ATTR_queue: 'workq2',
                                      'Resource_List.place': 'scatter',
                                      'Resource_List.select': '%d:eoe=%s' %
                                      (self.npp, 'low')})
        self.server.expect(JOB, {'job_state': 'R'}, id=jid_hp)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid)
        self.scheduler.log_match("Job preempted by requeuing", starttime=t)

    def test_power_provisioning_attribute(self):
        """
        Test that power_provisioning on the server is set to true when the
        hook is enabled and to false when it is disabled.
        """
        self.enable_power()
        a = {'power_provisioning': 'True'}
        self.server.expect(SERVER, a)
        self.disable_power()
        a = {'power_provisioning': 'False'}
        self.server.expect(SERVER, a)

    def test_poweroff_eligible_attribute(self):
        """
        Test that we can set poweroff_eligible for nodes to true/false.
        """
        nodes = self.server.status(NODE)
        host = nodes[0]['id']
        self.server.manager(MGR_CMD_SET, NODE, {'poweroff_eligible': 'True'},
                            expect=True, id=host)
        self.server.manager(MGR_CMD_SET, NODE, {'poweroff_eligible': 'False'},
                            expect=True, id=host)

    def test_last_state_change_time(self):
        """
        Test that last_state_change_time is updated when a job runs and exits.
        """
        pattern = '%a %b %d %H:%M:%S %Y'
        self.server.manager(MGR_CMD_SET, SERVER, {
            'job_history_enable': 'True'})
        nodes = self.server.status(NODE)
        vnode = nodes[0]['resources_available.vnode']
        ncpus = nodes[0]['resources_available.ncpus']
        vntype = nodes[0]['resources_available.vntype']
        jid = self.submit_job(5, {'Resource_List.vnode': vnode,
                                  'Resource_List.ncpus': ncpus,
                                  'Resource_List.vntype': vntype})
        self.server.expect(JOB, {'job_state': 'F'}, id=jid, extend='x')
        status = self.server.status(NODE, id=vnode)
        fmttime = status[0][ATTR_NODE_last_state_change_time]
        sts_time1 = int(time.mktime(time.strptime(fmttime, pattern)))
        jid = self.submit_job(5, {'Resource_List.vnode': vnode,
                                  'Resource_List.ncpus': ncpus,
                                  'Resource_List.vntype': vntype})
        self.server.expect(JOB, {'job_state': 'F'}, id=jid, extend='x')
        status = self.server.status(NODE, id=vnode)
        fmttime = status[0][ATTR_NODE_last_state_change_time]
        sts_time2 = int(time.mktime(time.strptime(fmttime, pattern)))
        self.assertGreater(sts_time2, sts_time1)

    def test_last_used_time(self):
        """
        Test that last_used_time is updated when a job runs and exits.
        """
        pattern = '%a %b %d %H:%M:%S %Y'
        self.server.manager(MGR_CMD_SET, SERVER, {
            'job_history_enable': 'True'})
        nodes = self.server.status(NODE)
        vnode = nodes[0]['resources_available.vnode']
        vntype = nodes[0]['resources_available.vntype']
        jid = self.submit_job(5, {'Resource_List.vnode': vnode,
                                  'Resource_List.vntype': vntype})
        self.server.expect(JOB, {'job_state': 'F'}, id=jid, extend='x')
        status = self.server.status(NODE, id=vnode)
        fmttime = status[0][ATTR_NODE_last_used_time]
        sts_time1 = int(time.mktime(time.strptime(fmttime, pattern)))
        jid = self.submit_job(5, {'Resource_List.vnode': vnode,
                                  'Resource_List.vntype': vntype})
        self.server.expect(JOB, {'job_state': 'F'}, id=jid, extend='x')
        status = self.server.status(NODE, id=vnode)
        fmttime = status[0][ATTR_NODE_last_used_time]
        sts_time2 = int(time.mktime(time.strptime(fmttime, pattern)))
        self.assertGreater(sts_time2, sts_time1)

    @timeout(1200)
    def test_power_off_nodes(self):
        """
        Test that the power hook powers off the nodes when
        power_on_off_enable is set and poweroff_eligible is true on the nodes.
        """
        for n in self.server.status(NODE):
            if 'resources_available.PBScraynid' in n:
                self.server.manager(MGR_CMD_SET, NODE,
                                    {"poweroff_eligible": True}, n['id'])
        a = {"power_on_off_enable": True,
             "max_concurrent_nodes": "30", 'node_idle_limit': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        a = {'freq': 30}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        a = {'enabled': 'True'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        t = int(time.time())
        self.logger.info("Waiting for 4 mins to power off all the nodes")
        time.sleep(240)
        self.server.log_match(
            "/opt/cray/capmc/default/bin/capmc node_off", starttime=t)
        # Expect sleep state on all nodes except the login node
        self.server.expect(
            NODE, {'state=sleep': len(self.server.status(NODE)) - 1})
        self.cleanup_power_on()

    @timeout(1200)
    def test_power_on_off_max_concurrent_nodes(self):
        """
        Test that per hook run the power hook powers off only the number of
        nodes given by max_concurrent_nodes in the config file, even when
        poweroff_eligible is set to true on all the nodes.
        """
        for n in self.server.status(NODE):
            if 'resources_available.PBScraynid' in n:
                self.server.manager(MGR_CMD_SET, NODE,
                                    {"poweroff_eligible": True}, n['id'])
        a = {"power_on_off_enable": True, 'node_idle_limit': '10',
             'max_concurrent_nodes': '2'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        a = {'freq': 30}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        a = {'enabled': 'True'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        self.logger.info("Waiting for 40 secs to power off 2 nodes")
        time.sleep(40)
        a = {'enabled': 'False'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        self.server.expect(NODE, {'state=sleep': 2})
        self.cleanup_power_on()

    def test_poweroff_eligible_false(self):
        """
        Test that the hook won't power off the nodes where
        poweroff_eligible is set to false.
        """
        for n in self.server.status(NODE):
            if 'resources_available.PBScraynid' in n:
                self.server.manager(MGR_CMD_SET, NODE,
                                    {"poweroff_eligible": False}, n['id'])
        a = {"power_on_off_enable": True,
             "max_concurrent_nodes": "30", 'node_idle_limit': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        a = {'freq': 30}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        a = {'enabled': 'True'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        self.logger.info(
            "Waiting for 100 secs to make sure no nodes are powered off")
        time.sleep(100)
        self.server.expect(NODE,
                           {'state=free': len(self.server.status(NODE))})

    @timeout(900)
    def test_power_on_nodes(self):
        """
        Test that when a job is calendared on a vnode that is in sleep state,
        the node is powered on and the job runs.
        """
        self.scheduler.set_sched_config({'strict_ordering': 'True ALL'})
        self.server.manager(MGR_CMD_SET, NODE, {
            "poweroff_eligible": True}, self.names[0])
        a = {"power_on_off_enable": True, 'node_idle_limit': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        a = {'freq': 30}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        a = {'enabled': 'True'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        t = int(time.time())
        self.logger.info("Waiting for 1 min to power off the 1st node")
        time.sleep(60)
        self.server.log_match(
            "/opt/cray/capmc/default/bin/capmc node_off", starttime=t)
        self.server.expect(NODE, {'state': 'sleep'}, id=self.names[0])
        a = {"node_idle_limit": "1800", 'min_node_down_delay': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        t = int(time.time())
        jid = self.submit_job(1000, {'Resource_List.vnode': self.names[0]})
        self.scheduler.log_match(
            jid + ';Job is a top job and will run at',
            max_attempts=10, starttime=t)
        t = int(time.time())
        self.logger.info("Waiting for 10 mins to power on the 1st node")
        time.sleep(600)
        self.server.log_match(
            "/opt/cray/capmc/default/bin/capmc node_on", starttime=t)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=self.names[0])

    @timeout(900)
    def test_power_on_ramp_rate_nodes(self):
        """
        Test that when both ramp rate and power on/off are enabled,
        power_on_off_enable overrides and nodes are powered off
        and powered back on.
        """
        self.scheduler.set_sched_config({'strict_ordering': 'True ALL'})
        self.server.manager(MGR_CMD_SET, NODE, {
            "poweroff_eligible": True}, self.names[0])
        a = {"power_on_off_enable": True,
             "power_ramp_rate_enable": True, 'node_idle_limit': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        a = {'freq': 30}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        a = {'enabled': 'True'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        t = int(time.time())
        self.logger.info("Waiting for 1 min to power off the 1st node")
        time.sleep(60)
        self.server.log_match(
            "power_on_off_enable is over-riding power_ramp_rate_enable",
            starttime=t)
        self.server.log_match(
            "/opt/cray/capmc/default/bin/capmc node_off", starttime=t)
        self.server.expect(NODE, {'state': 'sleep'}, id=self.names[0])
        a = {"node_idle_limit": "1800", 'min_node_down_delay': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        t = int(time.time())
        jid = self.submit_job(1000, {'Resource_List.vnode': self.names[0]})
        self.scheduler.log_match(
            jid + ';Job is a top job and will run at',
            max_attempts=10, starttime=t)
        t = int(time.time())
        self.logger.info("Waiting for 10 mins to power on the 1st node")
        time.sleep(600)
        self.server.log_match(
            "/opt/cray/capmc/default/bin/capmc node_on", starttime=t)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=self.names[0])

    @timeout(1200)
    def test_power_on_min_node_down_delay(self):
        """
        Test that when a job is calendared on a vnode that is in sleep state,
        the node will not be powered on until min_node_down_delay has elapsed.
        """
        self.scheduler.set_sched_config({'strict_ordering': 'True ALL'})
        self.server.manager(MGR_CMD_SET, NODE, {
            "poweroff_eligible": True}, self.names[0])
        a = {"power_on_off_enable": True, 'min_node_down_delay': '3000'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        a = {'freq': 30}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        a = {'enabled': 'True'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        self.logger.info("Waiting for 1 min to power off the 1st node")
        time.sleep(60)
        self.server.expect(NODE, {'state': 'sleep'}, id=self.names[0])
        jid = self.submit_job(1000, {'Resource_List.vnode': self.names[0]})
        t = int(time.time())
        self.scheduler.log_match(
            jid + ';Job is a top job and will run at',
            max_attempts=10, starttime=t)
        self.logger.info(
            "Waiting for 2 mins to make sure the node is not powered on")
        time.sleep(120)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid)
        self.server.expect(NODE, {'state': 'sleep'}, id=self.names[0])
        self.cleanup_power_on()

    @timeout(1800)
    def test_max_jobs_analyze_limit(self):
        """
        Test that even when 4 jobs are calendared, only the nodes assigned
        to the first max_jobs_analyze_limit jobs are considered
        for powering on.
        """
        self.scheduler.set_sched_config({'strict_ordering': 'True ALL'})
        self.server.manager(MGR_CMD_SET, SERVER, {'backfill_depth': '4'})
        for n in self.server.status(NODE):
            if 'resources_available.PBScraynid' in n:
                self.server.manager(MGR_CMD_SET, NODE, {
                    "poweroff_eligible": True}, n['id'])
        a = {"power_on_off_enable": True, 'max_jobs_analyze_limit': '2',
             'node_idle_limit': '30', 'max_concurrent_nodes': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        a = {'freq': 30}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        a = {'enabled': 'True'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        self.logger.info("Waiting for 2 mins to power off all the nodes")
        time.sleep(120)
        # Expect sleep state on all nodes except the login node
        self.server.expect(
            NODE, {'state=sleep': len(self.server.status(NODE)) - 1})
        a = {"node_idle_limit": "1800", 'min_node_down_delay': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        j1id = self.submit_job(1000, {'Resource_List.vnode': self.names[0]})
        j2id = self.submit_job(1000, {'Resource_List.vnode': self.names[1]})
        j3id = self.submit_job(1000, {'Resource_List.vnode': self.names[2]})
        j4id = self.submit_job(1000, {'Resource_List.vnode': self.names[3]})
        self.logger.info(
            "Waiting for 10 mins to power on the calendared nodes")
        time.sleep(600)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state': 'R'}, id=j1id)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=self.names[0])
        self.server.expect(JOB, {'job_state': 'R'}, id=j2id)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=self.names[1])
        self.server.expect(JOB, {'job_state': 'Q'}, id=j3id)
        self.server.expect(NODE, {'state': 'sleep'}, id=self.names[2])
        self.server.expect(JOB, {'job_state': 'Q'}, id=j4id)
        self.server.expect(NODE, {'state': 'sleep'}, id=self.names[3])
        self.cleanup_power_on()

    def test_last_used_time_node_sort_key(self):
        """
        Test last_used_time as the node sort key.
        """
        self.server.manager(MGR_CMD_SET, SERVER, {
            'job_history_enable': 'True'})
        i = 0
        for n in self.server.status(NODE):
            if 'resources_available.PBScraynid' in n:
                if i > 1:
                    self.server.manager(MGR_CMD_SET, NODE, {
                        'state': 'offline'}, id=n['id'])
                i += 1
        a = {'node_sort_key': '"last_used_time LOW" ALL'}
        self.scheduler.set_sched_config(a)
        jid = self.submit_job(
            1, {'Resource_List.select': '1:ncpus=1',
                'Resource_List.place': 'excl'})
        self.server.expect(JOB, {'job_state': 'F'}, id=jid, extend='x')
        status = self.server.status(JOB, 'exec_vnode', id=jid, extend='x')
        exec_vnode = status[0]['exec_vnode']
        node1 = exec_vnode.split(':')[0][1:]
        jid = self.submit_job(
            1, {'Resource_List.select': '1:ncpus=1',
                'Resource_List.place': 'excl'})
        self.server.expect(JOB, {'job_state': 'F'}, id=jid, extend='x')
        jid = self.submit_job(
            1, {'Resource_List.select': '1:ncpus=1',
                'Resource_List.place': 'excl'})
        self.server.expect(JOB, {'job_state': 'F'}, id=jid, extend='x')
        status = self.server.status(JOB, 'exec_vnode', id=jid, extend='x')
        exec_vnode = status[0]['exec_vnode']
        node2 = exec_vnode.split(':')[0][1:]
        # Check that the 3rd job lands on the same node as the 1st job as per
        # node_sort_key: the node on which the 1st job ran has a lower
        # last_used_time than the node on which the 2nd job ran.
        self.assertEqual(node1, node2)

    @timeout(1200)
    def test_power_ramp_down_nodes(self):
        """
        Test that the power hook ramps down the nodes when
        power_ramp_rate_enable is set and node_idle_limit is reached.
        """
        self.setup_power_ramp_rate()
        a = {"power_ramp_rate_enable": True, 'node_idle_limit': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        a = {'freq': 60}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        a = {'enabled': 'True'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        self.logger.info("Waiting for 15 mins to ramp down all the nodes")
        time.sleep(900)
        # Do not count the offlined nodes and the 1 login node
        nn = self.offnodes + 1
        self.server.expect(
            NODE, {'state=sleep': len(self.server.status(NODE)) - nn})
        self.cleanup_power_ramp_rate()

    @timeout(1000)
    def test_power_ramp_down_max_concurrent_nodes(self):
        """
        Test that per hook run the power hook ramps down only the number of
        nodes given by max_concurrent_nodes in the config file.
        """
        self.setup_power_ramp_rate()
        a = {"power_ramp_rate_enable": True, 'node_idle_limit': '10',
             'max_concurrent_nodes': '2'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        a = {'freq': 60}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        a = {'enabled': 'True'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        self.logger.info("Waiting for 90 secs to ramp down 2 nodes")
        time.sleep(90)
        a = {'enabled': 'False'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        self.server.expect(NODE, {'state=sleep': 2})
        self.cleanup_power_ramp_rate()

    @timeout(1500)
    def test_power_ramp_up_nodes(self):
        """
        Test that when a job is calendared on a vnode that is in sleep state,
        the node is ramped up and the job runs.
        """
        self.setup_power_ramp_rate()
        self.scheduler.set_sched_config({'strict_ordering': 'True ALL'})
        a = {"power_ramp_rate_enable": True, 'node_idle_limit': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        a = {'freq': 60}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        a = {'enabled': 'True'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        self.logger.info("Waiting for 15 mins to ramp down all the nodes")
        time.sleep(900)
        self.server.expect(NODE, {'state': 'sleep'}, id=self.names[0])
        a = {"node_idle_limit": "1800", 'min_node_down_delay': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        t = int(time.time())
        jid = self.submit_job(1000, {'Resource_List.vnode': self.names[0]})
        self.scheduler.log_match(
            jid + ';Job is a top job and will run at',
            max_attempts=10, starttime=t)
        self.logger.info("Waiting for 90 secs to ramp up the calendared node")
        time.sleep(90)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=self.names[0])
        self.cleanup_power_ramp_rate()

    @timeout(1200)
    def test_max_jobs_analyze_limit_ramp_up(self):
        """
        Test that even when 4 jobs are calendared, only the nodes assigned
        to the first max_jobs_analyze_limit jobs are considered
        for ramping up.
        """
        self.setup_power_ramp_rate()
        self.scheduler.set_sched_config({'strict_ordering': 'True ALL'})
        self.server.manager(MGR_CMD_SET, SERVER, {'backfill_depth': '4'})
        a = {"power_ramp_rate_enable": True,
             'max_jobs_analyze_limit': '2', 'node_idle_limit': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        a = {'freq': 60}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        a = {'enabled': 'True'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        self.logger.info("Waiting for 15 mins to ramp down all the nodes")
        time.sleep(900)
        # Do not count the offlined nodes and the 1 login node
        nn = self.offnodes + 1
        self.server.expect(
            NODE, {'state=sleep': len(self.server.status(NODE)) - nn})
        a = {"node_idle_limit": "1800", 'min_node_down_delay': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        j1id = self.submit_job(1000, {'Resource_List.vnode': self.names[0]})
        j2id = self.submit_job(1000, {'Resource_List.vnode': self.names[1]})
        j3id = self.submit_job(1000, {'Resource_List.vnode': self.names[2]})
        j4id = self.submit_job(1000, {'Resource_List.vnode': self.names[3]})
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.logger.info(
            "Waiting for 90 secs to ramp up the calendared nodes")
        time.sleep(90)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state': 'R'}, id=j1id)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=self.names[0])
        self.server.expect(JOB, {'job_state': 'R'}, id=j2id)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=self.names[1])
        self.server.expect(JOB, {'job_state': 'Q'}, id=j3id)
        self.server.expect(NODE, {'state': 'sleep'}, id=self.names[2])
        self.server.expect(JOB, {'job_state': 'Q'}, id=j4id)
        self.server.expect(NODE, {'state': 'sleep'}, id=self.names[3])
        self.cleanup_power_ramp_rate()

    @timeout(1200)
    def test_power_ramp_up_poweroff_eligible(self):
        """
        Test that nodes are considered for ramp down and ramp up
        even when poweroff_eligible is set to false.
        """
        self.setup_power_ramp_rate()
        self.scheduler.set_sched_config({'strict_ordering': 'True ALL'})
        self.server.manager(MGR_CMD_SET, NODE, {'poweroff_eligible': 'False'},
                            expect=True, id=self.names[0])
        a = {"power_ramp_rate_enable": True, 'node_idle_limit': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        a = {'freq': 60}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        a = {'enabled': 'True'}
        self.server.manager(MGR_CMD_SET, PBS_HOOK, a, id='PBS_power',
                            sudo=True)
        self.logger.info("Waiting for 15 mins to ramp down all the nodes")
        time.sleep(900)
        self.server.expect(NODE, {'state': 'sleep'}, id=self.names[0])
        a = {"node_idle_limit": "1800", 'min_node_down_delay': '30'}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        t = int(time.time())
        jid = self.submit_job(1000, {'Resource_List.vnode': self.names[0]})
        self.scheduler.log_match(
            jid + ';Job is a top job and will run at',
            max_attempts=10, starttime=t)
        self.logger.info("Waiting for 90 secs to ramp up the calendared node")
        time.sleep(90)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=self.names[0])
        self.cleanup_power_ramp_rate()

    def tearDown(self):
        a = {"power_ramp_rate_enable": False,
             "power_on_off_enable": False,
             'node_idle_limit': '1800',
             'min_node_down_delay': '1800',
             "max_jobs_analyze_limit": "100",
             "max_concurrent_nodes": "5"}
        self.modify_hook_config(attrs=a, hook_id='PBS_power')
        self.disable_power()
        TestFunctional.tearDown(self)