pbs_hook_timeout.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. # coding: utf-8
  2. # Copyright (C) 1994-2018 Altair Engineering, Inc.
  3. # For more information, contact Altair at www.altair.com.
  4. #
  5. # This file is part of the PBS Professional ("PBS Pro") software.
  6. #
  7. # Open Source License Information:
  8. #
  9. # PBS Pro is free software. You can redistribute it and/or modify it under the
  10. # terms of the GNU Affero General Public License as published by the Free
  11. # Software Foundation, either version 3 of the License, or (at your option) any
  12. # later version.
  13. #
  14. # PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
  15. # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  16. # FOR A PARTICULAR PURPOSE.
  17. # See the GNU Affero General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Affero General Public License
  20. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. #
  22. # Commercial License Information:
  23. #
  24. # For a copy of the commercial license terms and conditions,
  25. # go to: (http://www.pbspro.com/UserArea/agreement.html)
  26. # or contact the Altair Legal Department.
  27. #
  28. # Altair’s dual-license business model allows companies, individuals, and
  29. # organizations to create proprietary derivative works of PBS Pro and
  30. # distribute them - whether embedded or bundled with other software -
  31. # under a commercial license agreement.
  32. #
  33. # Use of Altair’s trademarks, including but not limited to "PBS™",
  34. # "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
  35. # trademark licensing policies.
  36. import os
  37. from tests.functional import *
  38. class TestHookTimeout(TestFunctional):
  39. """
  40. Test to make sure hooks are resent to moms that don't ack when
  41. the hooks are sent
  42. """
  43. def setUp(self):
  44. TestFunctional.setUp(self)
  45. if len(self.moms) != 3:
  46. self.skip_test('Test requires 3 moms, use -p <moms>')
  47. self.momA = self.moms.values()[0]
  48. self.momB = self.moms.values()[1]
  49. self.momC = self.moms.values()[2]
  50. self.momA.delete_vnode_defs()
  51. self.momB.delete_vnode_defs()
  52. self.momC.delete_vnode_defs()
  53. self.hostA = self.momA.shortname
  54. self.hostB = self.momB.shortname
  55. self.hostC = self.momC.shortname
  56. self.server.manager(MGR_CMD_DELETE, NODE, None, "")
  57. self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostA)
  58. self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostB)
  59. self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostC)
  60. self.server.expect(VNODE, {'state=free': 3}, op=EQ, count=True,
  61. max_attempts=10, interval=2)
  62. @timeout(600)
  63. def test_hook_send(self):
  64. """
  65. Test when the server doesn't receive an ACK from a mom for
  66. sending hooks he resends them
  67. """
  68. self.server.manager(MGR_CMD_SET, SERVER, {'log_events': 2047},
  69. expect=True)
  70. timeout_max_attempt = 7
  71. # Make momB unresponsive
  72. self.logger.info("Stopping MomB")
  73. self.momB.signal("-STOP")
  74. start_time = int(time.time())
  75. hook_body = "import pbs\n"
  76. a = {'event': 'execjob_epilogue', 'enabled': 'True'}
  77. rv = self.server.create_import_hook("test", a, hook_body)
  78. self.assertTrue(rv)
  79. # First batch of hook update is for the *.HK files
  80. self.server.log_match(
  81. "Timing out previous send of mom hook updates "
  82. "(send replies expected=3 received=2)", n=600,
  83. max_attempts=timeout_max_attempt, interval=30,
  84. starttime=start_time)
  85. # sent hook control file
  86. for h in [self.hostA, self.hostB, self.hostC]:
  87. hfile = os.path.join(self.server.pbs_conf['PBS_HOME'],
  88. "server_priv", "hooks", "test.HK")
  89. if h != self.hostB:
  90. exist = True
  91. else:
  92. exist = False
  93. self.server.log_match(
  94. ".*successfully sent hook file %s to %s.*" %
  95. (hfile, h), max_attempts=5, interval=1,
  96. regexp=True, existence=exist,
  97. starttime=start_time)
  98. # Second batch of hook update is for the *.PY files + resend of
  99. # *.HK file to momB
  100. self.server.log_match(
  101. "Timing out previous send of mom hook updates "
  102. "(send replies expected=4 received=2)", n=600,
  103. max_attempts=timeout_max_attempt, interval=30,
  104. starttime=start_time)
  105. # sent hook content file
  106. for h in [self.hostA, self.hostB, self.hostC]:
  107. hfile = os.path.join(self.server.pbs_conf['PBS_HOME'],
  108. "server_priv", "hooks", "test.PY")
  109. if h != self.hostB:
  110. exist = True
  111. else:
  112. exist = False
  113. self.server.log_match(
  114. ".*successfully sent hook file %s to %s.*" %
  115. (hfile, h), max_attempts=3, interval=1,
  116. regexp=True, existence=exist,
  117. starttime=start_time)
  118. # Now check to make sure moms have received the hook files
  119. for m in [self.momA, self.momB, self.momC]:
  120. if m != self.momB:
  121. exist = True
  122. else:
  123. exist = False
  124. m.log_match(
  125. "test.HK;copy hook-related file request received",
  126. regexp=True, max_attempts=3, interval=1,
  127. existence=exist, starttime=start_time)
  128. m.log_match(
  129. "test.PY;copy hook-related file request received",
  130. regexp=True, max_attempts=3, interval=1,
  131. existence=exist, starttime=start_time)
  132. # Ensure that hook send updates are retried for
  133. # the *.HK and *.PY file to momB
  134. self.server.log_match(
  135. "Timing out previous send of mom hook updates "
  136. "(send replies expected=2 received=0)", n=600,
  137. max_attempts=timeout_max_attempt, interval=30,
  138. starttime=start_time)
  139. # Submit a job, it should still run
  140. a = {'Resource_List.select': '3:ncpus=1',
  141. 'Resource_List.place': 'scatter'}
  142. j1 = Job(TEST_USER, attrs=a)
  143. j1id = self.server.submit(j1)
  144. # Wait for the job to start running.
  145. a = {ATTR_state: (EQ, 'R'), ATTR_substate: (EQ, 41)}
  146. self.server.expect(JOB, a, op=PTL_AND, id=j1id)
  147. self.server.log_match(
  148. "%s;vnode %s's parent mom.*has a pending copy hook "
  149. "or delete hook request.*" % (j1id, self.hostB),
  150. max_attempts=5, interval=1, regexp=True,
  151. starttime=start_time)
  152. def tearDown(self):
  153. self.momB.signal("-CONT")
  154. TestFunctional.tearDown(self)