pbs_qrun.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
# coding: utf-8
# Copyright (C) 1994-2018 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.
  36. from tests.functional import *
  37. import os
  38. import signal
  39. class TestQrun(TestFunctional):
  40. def setUp(self):
  41. TestFunctional.setUp(self)
  42. # set ncpus to a known value, 2 here
  43. a = {'resources_available.ncpus': 2}
  44. self.server.manager(MGR_CMD_SET, NODE, a,
  45. self.mom.shortname, expect=True)
  46. self.pbs_exec = self.server.pbs_conf['PBS_EXEC']
  47. self.qrun = os.path.join(self.pbs_exec, 'bin', 'qrun')
  48. def test_invalid_host_val(self):
  49. """
  50. Tests that pbs_server should not crash when the node list in
  51. qrun is ill-formed
  52. """
  53. j1 = Job(TEST_USER)
  54. # submit a multi-chunk job
  55. j1 = Job(attrs={'Resource_List.select':
  56. 'ncpus=2:host=%s+ncpus=2:host=%s' %
  57. (self.mom.shortname, self.mom.shortname)})
  58. jid1 = self.server.submit(j1)
  59. self.server.expect(JOB, {ATTR_state: 'Q'}, jid1)
  60. exec_vnode = '"\'(%s)+(%s)\'"' % \
  61. (self.mom.shortname, self.mom.shortname)
  62. err_msg = 'qrun: Unknown node "\'(%s)+(%s)\'"' % \
  63. (self.mom.shortname, self.mom.shortname)
  64. try:
  65. self.server.runjob(jobid=jid1, location=exec_vnode)
  66. except PbsRunError as e:
  67. self.assertIn(err_msg, e.msg[0])
  68. self.logger.info('As expected qrun throws error: ' + err_msg)
  69. else:
  70. msg = "Able to run job successfully"
  71. self.assertTrue(False, msg)
  72. msg = "Server is not up"
  73. self.assertTrue(self.server.isUp(), msg)
  74. self.logger.info("As expected server is up and running")
  75. j2 = Job(TEST_USER)
  76. # submit a sleep job
  77. j2 = Job(attrs={'Resource_List.select': 'ncpus=3'})
  78. jid2 = self.server.submit(j2)
  79. self.server.expect(JOB, {ATTR_state: 'Q'}, jid2)
  80. try:
  81. self.server.runjob(jobid=jid2, location=exec_vnode)
  82. except PbsRunError as e:
  83. self.assertIn(err_msg, e.msg[0])
  84. self.logger.info('As expected qrun throws error: ' + err_msg)
  85. else:
  86. msg = "Able to run job successfully"
  87. self.assertTrue(False, msg)
  88. msg = "Server is not up"
  89. self.assertTrue(self.server.isUp(), msg)
  90. self.logger.info("As expected server is up and running")
  91. def test_qrun_hangs(self):
  92. """
  93. This test submit 500 jobs with differnt equivalence class,
  94. turn of scheduling and qrun job to
  95. verify whether qrun hangs.
  96. """
  97. node = self.mom.shortname
  98. self.server.manager(MGR_CMD_SET, SCHED,
  99. {'scheduling': 'False'})
  100. self.server.manager(MGR_CMD_SET, NODE,
  101. {'resources_available.ncpus': 1}, id=node)
  102. for walltime in range(1, 501):
  103. j = Job(TEST_USER)
  104. a = {'Resource_List.walltime': walltime}
  105. j.set_attributes(a)
  106. if walltime == 500:
  107. jid = self.server.submit(j)
  108. else:
  109. self.server.submit(j)
  110. self.logger.info("Submitted 500 jobs with different walltime")
  111. self.server.manager(MGR_CMD_SET, SERVER,
  112. {'scheduling': 'True'})
  113. self.server.manager(MGR_CMD_SET, SERVER,
  114. {'scheduling': 'False'}, expect=True)
  115. time.sleep(1)
  116. now = int(time.time())
  117. pid = os.fork()
  118. if pid == 0:
  119. try:
  120. self.server.runjob(jobid=jid)
  121. self.logger.info("Successfully runjob. Child process exit.")
  122. except PbsRunError as e:
  123. self.logger.info("Runjob throws error: " + e.msg[0])
  124. else:
  125. try:
  126. self.scheduler.log_match("Starting Scheduling Cycle",
  127. interval=5, starttime=now,
  128. max_attempts=10)
  129. self.logger.info("No hangs. Parent process exit")
  130. except PtlLogMatchError:
  131. os.kill(pid, signal.SIGKILL)
  132. os.waitpid(pid, 0)
  133. self.logger.info("Runjob hung. Child process exit.")
  134. self.fail("Qrun didn't start another sched cycle")