# coding: utf-8
# Copyright (C) 1994-2018 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.
import fnmatch
import os
import time

from tests.functional import *
from ptl.utils.pbs_logutils import PBSLogUtils
  40. @tags('cray')
  41. class TestPbsReliableJobStartupOnCray(TestFunctional):
  42. """
  43. This tests the Reliable Job Startup Feature on Cray.
  44. A job can be started with extra nodes with node failures tolerated
  45. during job start but setting is not supported and ignored on Cray.
  46. """
  47. def setUp(self):
  48. if not self.du.get_platform().startswith('cray'):
  49. self.skipTest("Test suite only meant to run on a Cray")
  50. TestFunctional.setUp(self)
  51. # queuejob hook
  52. self.qjob_hook_body = """
  53. import pbs
  54. e=pbs.event()
  55. pbs.logmsg(pbs.LOG_DEBUG, "queuejob hook executed")
  56. # Save current select spec in resource 'site'
  57. e.job.Resource_List["site"] = str(e.job.Resource_List["select"])
  58. new_select = e.job.Resource_List["select"].increment_chunks(1)
  59. e.job.Resource_List["select"] = new_select
  60. e.job.tolerate_node_failures = "job_start"
  61. """
  62. # prologue hook
  63. self.prolo_hook_body = """
  64. import pbs
  65. e=pbs.event()
  66. pbs.logmsg(pbs.LOG_DEBUG, "Executing prologue")
  67. # print out the vnode_list[] values
  68. for vn in e.vnode_list:
  69. v = e.vnode_list[vn]
  70. pbs.logjobmsg(e.job.id, "prologue: found vnode_list[" + v.name + "]")
  71. # print out the vnode_list_fail[] values
  72. for vn in e.vnode_list_fail:
  73. v = e.vnode_list_fail[vn]
  74. pbs.logjobmsg(e.job.id, "prologue: found vnode_list_fail[" + v.name + "]")
  75. if e.job.in_ms_mom():
  76. pj = e.job.release_nodes(keep_select=e.job.Resource_List["site"])
  77. if pj is None:
  78. e.job.Hold_Types = pbs.hold_types("s")
  79. e.job.rerun()
  80. e.reject("unsuccessful at PROLOGUE")
  81. """
  82. # launch hook
  83. self.launch_hook_body = """
  84. import pbs
  85. e=pbs.event()
  86. if 'PBS_NODEFILE' not in e.env:
  87. e.accept()
  88. pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")
  89. # print out the vnode_list[] values
  90. for vn in e.vnode_list:
  91. v = e.vnode_list[vn]
  92. pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")
  93. # print out the vnode_list_fail[] values:
  94. for vn in e.vnode_list_fail:
  95. v = e.vnode_list_fail[vn]
  96. pbs.logjobmsg(e.job.id, "launch: found vnode_list_fail[" + v.name + "]")
  97. v.state = pbs.ND_OFFLINE
  98. if e.job.in_ms_mom():
  99. pj = e.job.release_nodes(keep_select=e.job.Resource_List["site"])
  100. if pj is None:
  101. e.job.Hold_Types = pbs.hold_types("s")
  102. e.job.rerun()
  103. e.reject("unsuccessful at LAUNCH")
  104. """
  105. def match_str_in_input_file(self, file_path, file_pattern, search_str):
  106. """
  107. Assert that search string appears in the input file
  108. that matches file_pattern
  109. """
  110. input_file = None
  111. for item in self.du.listdir(path=file_path, sudo=True):
  112. if fnmatch.fnmatch(item, file_pattern):
  113. input_file = item
  114. break
  115. self.assertTrue(input_file is not None)
  116. with PBSLogUtils().open_log(input_file, sudo=True) as f:
  117. self.assertTrue(search_str in f.read())
  118. self.logger.info("Found \"%s\" in %s" % (search_str, input_file))
  119. @tags('cray')
  120. def test_reliable_job_startup_not_supported_on_cray(self):
  121. """
  122. A job is started with extra nodes. Mom superior will show no sign
  123. of tolerating node failure. Accounting logs won't have 's' record.
  124. Input files to prologue and launch hooks will show the
  125. tolerate_node_failures=none value.
  126. """
  127. # instantiate queuejob hook
  128. hook_event = 'queuejob'
  129. hook_name = 'qjob'
  130. a = {'event': hook_event, 'enabled': 'true'}
  131. self.server.create_import_hook(hook_name, a, self.qjob_hook_body)
  132. # instantiate execjob_prologue hook
  133. hook_event = 'execjob_prologue'
  134. hook_name = 'prolo'
  135. a = {'event': hook_event, 'enabled': 'true'}
  136. self.server.create_import_hook(hook_name, a, self.prolo_hook_body)
  137. # instantiate execjob_launch hook
  138. hook_event = 'execjob_launch'
  139. hook_name = 'launch'
  140. a = {'event': hook_event, 'enabled': 'true'}
  141. self.server.create_import_hook(hook_name, a, self.launch_hook_body)
  142. # Submit a job
  143. j = Job(TEST_USER, {ATTR_l + '.select': '1:ncpus=3:mem=2gb:vntype=' +
  144. 'cray_compute+1:ncpus=3:mem=2gb:vntype=' +
  145. 'cray_compute',
  146. ATTR_l + '.place': 'scatter'})
  147. start_time = int(time.time())
  148. jid = self.server.submit(j)
  149. self.server.expect(JOB, {ATTR_state: 'R'}, id=jid)
  150. # Check for msg in mom superior logs
  151. msg = "no nodes released as job does not tolerate node failures"
  152. self.server.expect(JOB, 'exec_host', id=jid, op=SET)
  153. job_stat = self.server.status(JOB, id=jid)
  154. exechost = job_stat[0]['exec_host'].partition('/')[0]
  155. mom_superior = self.moms[exechost]
  156. mom_superior.log_match(msg, starttime=start_time)
  157. # Check that 's' record is absent since release_nodes() was not called
  158. self.server.accounting_match(
  159. msg=".*%s;%s;.*" % ('s', jid),
  160. regexp=True, n=50, max_attempts=10, existence=False)
  161. self.logger.info(
  162. "There was no 's' record found for job %s, test passes" % jid)
  163. # On mom superior check the input files to prologue and launch hooks
  164. # showed the tolerate_node_failures=none value
  165. search_str = 'pbs.event().job.tolerate_node_failures=none'
  166. self.mom_hooks_tmp_dir = os.path.join(
  167. self.server.pbs_conf['PBS_HOME'], 'mom_priv', 'hooks', 'tmp')
  168. hook_name = 'prolo'
  169. input_file_pattern = os.path.join(
  170. self.mom_hooks_tmp_dir, 'hook_execjob_prologue_%s*.in' % hook_name)
  171. self.match_str_in_input_file(
  172. self.mom_hooks_tmp_dir, input_file_pattern, search_str)
  173. hook_name = 'launch'
  174. input_file_pattern = os.path.join(
  175. self.mom_hooks_tmp_dir, 'hook_execjob_launch_%s*.in' % hook_name)
  176. self.match_str_in_input_file(
  177. self.mom_hooks_tmp_dir, input_file_pattern, search_str)