# coding: utf-8
# Copyright (C) 1994-2018 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.

from tests.resilience import *
import time
from time import sleep


class TestPbsHookAlarmLargeMultinodeJob(TestResilience):

    """
    This test suite contains hook tests to verify that a large
    multi-node job does not slow down hook execution and cause an alarm.
    """
    def setUp(self):
        TestResilience.setUp(self)
        # Increase daemon log verbosity for debugging
        self.server.manager(MGR_CMD_SET, SERVER, {"log_events": '2047'})
        self.mom.add_config({"$logevent": "0xfffffff"})
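        # Create 5000 vnodes (1 cpu, 1gb each) on the single MoM so the tests
        # can run a 5000-chunk multi-node job without extra hardware.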
        a = {'resources_available.mem': '1gb',
             'resources_available.ncpus': '1'}
        self.server.create_vnodes(self.mom.shortname, a, 5000, self.mom)
        # Restart mom explicitly due to PP-993
        self.mom.restart()

    @timeout(400)
    def test_begin_hook(self):
        """
        Create an execjob_begin hook with a small alarm value, import its
        content, and test it against a large multi-node job.
        """
        hook_name = "beginhook"
        hook_event = "execjob_begin"
        hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "executing begin hook %s" % (e.hook_name,))
"""
        a = {'event': hook_event, 'enabled': 'True',
             'alarm': '15'}
        self.server.create_import_hook(hook_name, a, hook_body)
        j = Job(TEST_USER)
        a = {'Resource_List.select': '5000:ncpus=1:mem=1gb',
             'Resource_List.walltime': 10}
        j.set_attributes(a)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'},
                           jid, max_attempts=15, interval=2)
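        # The hook should log its message and the job should start
        # without any "alarm call" message appearing in the MoM log.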
        self.mom.log_match(
            "pbs_python;executing begin hook %s" % (hook_name,), n=100,
            max_attempts=5, interval=5, regexp=True)
        self.mom.log_match(
            "Job;%s;alarm call while running %s hook" % (jid, hook_event),
            n=100, max_attempts=5, interval=5, regexp=True, existence=False)
        self.mom.log_match("Job;%s;Started, pid" % (jid,), n=100,
                           max_attempts=5, interval=5, regexp=True)

    @timeout(400)
    def test_prolo_hook(self):
        """
        Create an execjob_prologue hook with a small alarm value, import
        its content, and test it against a large multi-node job.
        """
        hook_name = "prolohook"
        hook_event = "execjob_prologue"
        hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "executing prologue hook %s" % (e.hook_name,))
"""
        a = {'event': hook_event, 'enabled': 'True',
             'alarm': '15'}
        self.server.create_import_hook(hook_name, a, hook_body)
        j = Job(TEST_USER)
        a = {'Resource_List.select': '5000:ncpus=1:mem=1gb',
             'Resource_List.walltime': 10}
        j.set_attributes(a)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'},
                           jid, max_attempts=15, interval=2)
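        # Confirm the prologue hook ran and that no hook alarm was logged.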
        self.mom.log_match(
            "pbs_python;executing prologue hook %s" % (hook_name,), n=100,
            max_attempts=5, interval=5, regexp=True)
        self.mom.log_match(
            "Job;%s;alarm call while running %s hook" % (jid, hook_event),
            n=100, max_attempts=5, interval=5, regexp=True, existence=False)

    @timeout(400)
    def test_epi_hook(self):
        """
        Create an execjob_epilogue hook with a small alarm value, import
        its content, and test it against a large multi-node job.
        """
        hook_name = "epihook"
        hook_event = "execjob_epilogue"
        hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "executing epilogue hook %s" % (e.hook_name,))
"""
        search_after = int(time.time())
        a = {'event': hook_event, 'enabled': 'True',
             'alarm': '15'}
        self.server.create_import_hook(hook_name, a, hook_body)
        j = Job(TEST_USER)
        a = {'Resource_List.select': '5000:ncpus=1:mem=1gb'}
        j.set_attributes(a)
        j.set_sleep_time(10)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'},
                           jid, max_attempts=15, interval=2)
        self.logger.info("Wait 10s for job to finish")
        sleep(10)
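        # Confirm the server has dequeued the finished job before checking
        # the MoM log for the epilogue hook messages.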
        self.server.log_match("dequeuing from", starttime=search_after)
        self.mom.log_match(
            "pbs_python;executing epilogue hook %s" % (hook_name,), n=100,
            max_attempts=5, interval=5, regexp=True)
        self.mom.log_match(
            "Job;%s;alarm call while running %s hook" % (jid, hook_event),
            n=100, max_attempts=5, interval=5, regexp=True, existence=False)
        self.mom.log_match("Job;%s;Obit sent" % (jid,), n=100,
                           max_attempts=5, interval=5, regexp=True)

    @timeout(400)
    def test_end_hook(self):
        """
        Create an execjob_end hook with a small alarm value, import its
        content, and test it against a large multi-node job.
        """
        hook_name = "endhook"
        hook_event = "execjob_end"
        hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "executing end hook %s" % (e.hook_name,))
"""
        search_after = int(time.time())
        a = {'event': hook_event, 'enabled': 'True',
             'alarm': '15'}
        self.server.create_import_hook(hook_name, a, hook_body)
        j = Job(TEST_USER)
        a = {'Resource_List.select': '5000:ncpus=1:mem=1gb'}
        j.set_attributes(a)
        j.set_sleep_time(10)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'},
                           jid, max_attempts=15, interval=2)
        self.logger.info("Wait 10s for job to finish")
        sleep(10)
        self.server.log_match("dequeuing from", starttime=search_after)
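        # Confirm the end hook ran, no hook alarm was logged, and the MoM
        # sent the job obit to the server.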
        self.mom.log_match(
            "pbs_python;executing end hook %s" % (hook_name,), n=100,
            max_attempts=5, interval=5, regexp=True)
        self.mom.log_match(
            "Job;%s;alarm call while running %s hook" % (jid, hook_event),
            n=100, max_attempts=5, interval=5, regexp=True, existence=False)
        self.mom.log_match("Job;%s;Obit sent" % (jid,), n=100,
                           max_attempts=5, interval=5, regexp=True)