pbs_checkpoint.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. # coding: utf-8
  2. # Copyright (C) 1994-2018 Altair Engineering, Inc.
  3. # For more information, contact Altair at www.altair.com.
  4. #
  5. # This file is part of the PBS Professional ("PBS Pro") software.
  6. #
  7. # Open Source License Information:
  8. #
  9. # PBS Pro is free software. You can redistribute it and/or modify it under the
  10. # terms of the GNU Affero General Public License as published by the Free
  11. # Software Foundation, either version 3 of the License, or (at your option) any
  12. # later version.
  13. #
  14. # PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
  15. # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  16. # FOR A PARTICULAR PURPOSE.
  17. # See the GNU Affero General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Affero General Public License
  20. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. #
  22. # Commercial License Information:
  23. #
  24. # For a copy of the commercial license terms and conditions,
  25. # go to: (http://www.pbspro.com/UserArea/agreement.html)
  26. # or contact the Altair Legal Department.
  27. #
  28. # Altair’s dual-license business model allows companies, individuals, and
  29. # organizations to create proprietary derivative works of PBS Pro and
  30. # distribute them - whether embedded or bundled with other software -
  31. # under a commercial license agreement.
  32. #
  33. # Use of Altair’s trademarks, including but not limited to "PBS™",
  34. # "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
  35. # trademark licensing policies.
  36. from tests.functional import *
  37. from ptl.utils.pbs_crayutils import CrayUtils
  38. class TestCheckpoint(TestFunctional):
  39. """
  40. This test suite targets Checkpoint functionality.
  41. """
  42. abort_file = ''
  43. cu = CrayUtils()
  44. def setUp(self):
  45. TestFunctional.setUp(self)
  46. a = {'job_history_enable': 'True'}
  47. self.server.manager(MGR_CMD_SET, SERVER, a)
  48. abort_script = """#!/bin/bash
  49. kill $1
  50. exit 0
  51. """
  52. self.abort_file = self.du.create_temp_file(body=abort_script)
  53. self.du.chmod(path=self.abort_file, mode=0755)
  54. self.du.chown(path=self.abort_file, uid=0, gid=0, runas=ROOT_USER)
  55. c = {'$action': 'checkpoint_abort 30 !' + self.abort_file + ' %sid'}
  56. self.mom.add_config(c)
  57. self.platform = self.du.get_platform()
  58. if self.platform != 'cray' and self.platform != 'craysim':
  59. self.attrs = {ATTR_l + '.select': '1:ncpus=1',
  60. ATTR_l + '.place': 'excl'}
  61. else:
  62. nv = self.cu.num_compute_vnodes(self.server)
  63. self.assertNotEqual(nv, 0, "No cray_compute vnodes are present.")
  64. self.attrs = {ATTR_l + '.select': '%d:ncpus=1' % nv,
  65. ATTR_l + '.place': 'scatter'}
  66. def verify_checkpoint_abort(self, jid, stime):
  67. """
  68. Verify that checkpoint and abort happened.
  69. """
  70. self.ck_dir = os.path.join(self.server.pbs_conf['PBS_HOME'],
  71. 'checkpoint', jid + '.CK')
  72. self.assertTrue(self.du.isdir(path=self.ck_dir, runas=ROOT_USER),
  73. msg="Checkpoint directory %s not found" % self.ck_dir)
  74. _msg1 = "%s;req_holdjob: Checkpoint initiated." % jid
  75. self.mom.log_match(_msg1, starttime=stime)
  76. _msg2 = "%s;checkpoint_abort script %s: exit code 0" % (
  77. jid, self.abort_file)
  78. self.mom.log_match(_msg2, starttime=stime)
  79. _msg3 = "%s;checkpointed to %s" % (jid, self.ck_dir)
  80. self.mom.log_match(_msg3, starttime=stime)
  81. _msg4 = "%s;task 00000001 terminated" % jid
  82. self.mom.log_match(_msg4, starttime=stime)
  83. def start_server_hot(self):
  84. """
  85. Start the server with the hot option.
  86. """
  87. pbs_exec = self.server.pbs_conf['PBS_EXEC']
  88. svrname = self.server.pbs_server_name
  89. pbs_server_hot = [os.path.join(
  90. pbs_exec, 'sbin', 'pbs_server'), '-t', 'hot']
  91. self.du.run_cmd(svrname, cmd=pbs_server_hot, sudo=True)
  92. self.assertTrue(self.server.isUp())
  93. def checkpoint_abort_with_qterm_restart_hot(self, qterm_type):
  94. """
  95. Checkpointing with qterm -t <type>, hot server restart.
  96. """
  97. j1 = Job(TEST_USER, self.attrs)
  98. j1.set_sleep_time(20)
  99. jid1 = self.server.submit(j1)
  100. self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
  101. start_time = int(time.time())
  102. self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
  103. self.server.qterm(manner=qterm_type)
  104. self.verify_checkpoint_abort(jid1, start_time)
  105. self.start_server_hot()
  106. self.assertTrue(self.server.isUp())
  107. msg = "%s;Requeueing job, substate: 10 Requeued in queue: workq" % jid1
  108. self.server.log_match(msg, starttime=start_time)
  109. # wait for the server to hot start the job
  110. self.server.expect(JOB, {'job_state': 'R'}, id=jid1, interval=2)
  111. self.server.expect(JOB, 'exec_vnode', id=jid1, op=SET)
  112. self.assertFalse(os.path.exists(self.ck_dir),
  113. msg=self.ck_dir + " still exists")
  114. self.server.expect(JOB, {'job_state': 'F'},
  115. jid1, extend='x', interval=5)
  116. def test_checkpoint_abort_with_preempt(self):
  117. """
  118. This test verifies that checkpoint_abort works as expected when
  119. a job is preempted via checkpoint. It does so by submitting a job
  120. in express queue which preempts a running job in the default queue.
  121. """
  122. self.scheduler.set_sched_config({'preempt_order': 'C'})
  123. a = {'queue_type': 'execution',
  124. 'started': 'True',
  125. 'enabled': 'True',
  126. 'Priority': 200}
  127. self.server.manager(MGR_CMD_CREATE, QUEUE, a, "expressq")
  128. j1 = Job(TEST_USER, self.attrs)
  129. j1.set_sleep_time(20)
  130. jid1 = self.server.submit(j1)
  131. self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
  132. self.attrs['queue'] = 'expressq'
  133. j2 = Job(TEST_USER, self.attrs)
  134. j2.set_sleep_time(20)
  135. start_time = int(time.time())
  136. jid2 = self.server.submit(j2)
  137. self.server.expect(JOB, {'job_state': 'R'}, id=jid2)
  138. self.server.expect(JOB, {'job_state': 'Q'}, id=jid1)
  139. self.verify_checkpoint_abort(jid1, start_time)
  140. self.server.expect(JOB, {'job_state': 'F'},
  141. jid2, extend='x', interval=5)
  142. self.server.expect(JOB, {'job_state': 'F'},
  143. jid1, extend='x', interval=5)
  144. def test_checkpoint_abort_with_qhold(self):
  145. """
  146. This test uses qhold for checkpointing.
  147. """
  148. j1 = Job(TEST_USER, self.attrs)
  149. jid1 = self.server.submit(j1)
  150. self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
  151. start_time = int(time.time())
  152. self.server.holdjob(jid1)
  153. self.server.expect(JOB, {'job_state': 'H'}, id=jid1)
  154. self.verify_checkpoint_abort(jid1, start_time)
  155. def test_checkpoint_abort_with_qterm_immediate_restart_hot(self):
  156. """
  157. This tests checkpointing with qterm -t immediate, hot server restart.
  158. """
  159. self.checkpoint_abort_with_qterm_restart_hot("immediate")
  160. def test_checkpoint_abort_with_qterm_delay_restart_hot(self):
  161. """
  162. This tests checkpointing with qterm -t delay, hot server restart.
  163. """
  164. self.checkpoint_abort_with_qterm_restart_hot("delay")
  165. def tearDown(self):
  166. TestFunctional.tearDown(self)
  167. try:
  168. os.remove(self.abort_file)
  169. except OSError:
  170. pass