pbs_cpuset.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. # coding: utf-8
  2. # Copyright (C) 1994-2018 Altair Engineering, Inc.
  3. # For more information, contact Altair at www.altair.com.
  4. #
  5. # This file is part of the PBS Professional ("PBS Pro") software.
  6. #
  7. # Open Source License Information:
  8. #
  9. # PBS Pro is free software. You can redistribute it and/or modify it under the
  10. # terms of the GNU Affero General Public License as published by the Free
  11. # Software Foundation, either version 3 of the License, or (at your option) any
  12. # later version.
  13. #
  14. # PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
  15. # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  16. # FOR A PARTICULAR PURPOSE.
  17. # See the GNU Affero General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Affero General Public License
  20. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. #
  22. # Commercial License Information:
  23. #
  24. # For a copy of the commercial license terms and conditions,
  25. # go to: (http://www.pbspro.com/UserArea/agreement.html)
  26. # or contact the Altair Legal Department.
  27. #
  28. # Altair’s dual-license business model allows companies, individuals, and
  29. # organizations to create proprietary derivative works of PBS Pro and
  30. # distribute them - whether embedded or bundled with other software -
  31. # under a commercial license agreement.
  32. #
  33. # Use of Altair’s trademarks, including but not limited to "PBS™",
  34. # "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
  35. # trademark licensing policies.
  36. import time
  37. from tests.functional import *
  38. class TestPbsCpuset(TestFunctional):
  39. """
  40. This tests the Reliable Job Startup Feature with cpuset mom.
  41. A job can be started with extra nodes with node failures tolerated
  42. during job start. Released cpuset resources can be reused by another job.
  43. """
  44. def setUp(self):
  45. if not self.mom.is_cpuset_mom():
  46. self.skipTest("Test suite only meant to run with cpuset mom.")
  47. TestFunctional.setUp(self)
  48. # launch hook
  49. self.launch_hook_body = """
  50. import pbs
  51. import time
  52. e=pbs.event()
  53. pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")
  54. # print out the vnode_list[] values
  55. for vn in e.vnode_list:
  56. v = e.vnode_list[vn]
  57. pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")
  58. # print out the vnode_list_fail[] values:
  59. for vn in e.vnode_list_fail:
  60. v = e.vnode_list_fail[vn]
  61. pbs.logjobmsg(e.job.id, "launch: found vnode_list_fail[" + v.name + "]")
  62. if e.job.in_ms_mom():
  63. pj = e.job.release_nodes(keep_select="ncpus=1:mem=2gb")
  64. if pj is None:
  65. e.job.Hold_Types = pbs.hold_types("s")
  66. e.job.rerun()
  67. e.reject("unsuccessful at LAUNCH")
  68. pbs.logmsg(pbs.LOG_DEBUG, "Sleeping for 20sec")
  69. time.sleep(20)
  70. """
  71. def test_reliable_job_startup_on_cpuset(self):
  72. """
  73. A job is started with two numa nodes and goes in R state.
  74. An execjob_launch hook will force job to have only one numa node.
  75. The released numa node can be used in another job.
  76. """
  77. # instantiate execjob_launch hook
  78. hook_event = "execjob_launch"
  79. hook_name = "launch"
  80. a = {'event': hook_event, 'enabled': 'true'}
  81. stime = int(time.time())
  82. self.server.create_import_hook(hook_name, a, self.launch_hook_body)
  83. # Check mom logs that the launch hook got propagated
  84. msg = "Hook;launch.PY;copy hook-related file request received"
  85. self.mom.log_match(msg, starttime=stime, interval=2, max_attempts=60)
  86. # Submit job1
  87. j = Job(TEST_USER, {
  88. ATTR_l + '.select': '2:ncpus=1:mem=2gb',
  89. ATTR_l + '.place': 'vscatter',
  90. ATTR_W: 'tolerate_node_failures=job_start'})
  91. stime = int(time.time())
  92. jid = self.server.submit(j)
  93. # Check the exec_vnode while in substate 41
  94. self.server.expect(JOB, {ATTR_substate: '41'}, id=jid)
  95. self.server.expect(JOB, 'exec_vnode', id=jid, op=SET)
  96. job_stat = self.server.status(JOB, id=jid)
  97. execvnode1 = job_stat[0]['exec_vnode']
  98. self.logger.info("initial exec_vnode: %s" % execvnode1)
  99. initial_vnodes = execvnode1.split('+')
  100. # Check the exec_vnode after job is in substate 42
  101. self.server.expect(JOB, {ATTR_substate: '42'}, offset=20, id=jid)
  102. self.server.expect(JOB, 'exec_vnode', id=jid, op=SET)
  103. job_stat = self.server.status(JOB, id=jid)
  104. execvnode2 = job_stat[0]['exec_vnode']
  105. self.logger.info("pruned exec_vnode: %s" % execvnode2)
  106. # Check for msg in mom logs indicating the job has cpuset
  107. msg = "new_cpuset: setting altid to CPU set named /PBSPro/%s" % jid
  108. self.mom.log_match(msg, starttime=stime)
  109. # Check mom logs for pruned from and pruned to messages
  110. self.mom.log_match("Job;%s;pruned from exec_vnode=%s" % (
  111. jid, execvnode1), starttime=stime)
  112. self.mom.log_match("Job;%s;pruned to exec_vnode=%s" % (
  113. jid, execvnode2), starttime=stime)
  114. # Find out the released vnode
  115. if initial_vnodes[0] == execvnode2:
  116. execvnodeB = initial_vnodes[1]
  117. else:
  118. execvnodeB = initial_vnodes[0]
  119. vnodeB = execvnodeB.split(':')[0].split('(')[1]
  120. self.logger.info("released vnode: %s" % vnodeB)
  121. # Submit job2 requesting the released vnode, job runs
  122. j2 = Job(TEST_USER, {
  123. ATTR_l + '.select': '1:ncpus=1:mem=2gb:vnode=%s' % vnodeB})
  124. stime = int(time.time())
  125. jid2 = self.server.submit(j2)
  126. self.server.expect(JOB, {ATTR_state: 'R'}, offset=20, id=jid2)
  127. # Check for msg in mom logs indicating job has cpuset
  128. msg2 = "new_cpuset: setting altid to CPU set named /PBSPro/%s" % jid2
  129. self.mom.log_match(msg2, starttime=stime)
  130. # Check if exec_vnode for job2 matches released vnode from job1
  131. self.server.expect(JOB, 'exec_vnode', id=jid2, op=SET)
  132. job_stat = self.server.status(JOB, id=jid2)
  133. execvnode3 = job_stat[0]['exec_vnode']
  134. self.assertEqual(execvnode3, execvnodeB)
  135. self.logger.info("job2 exec_vnode %s is the released vnode %s" % (
  136. execvnode3, execvnodeB))