pbs_cray_vnode_per_numa.py 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. # coding: utf-8
  2. # Copyright (C) 1994-2018 Altair Engineering, Inc.
  3. # For more information, contact Altair at www.altair.com.
  4. #
  5. # This file is part of the PBS Professional ("PBS Pro") software.
  6. #
  7. # Open Source License Information:
  8. #
  9. # PBS Pro is free software. You can redistribute it and/or modify it under the
  10. # terms of the GNU Affero General Public License as published by the Free
  11. # Software Foundation, either version 3 of the License, or (at your option) any
  12. # later version.
  13. #
  14. # PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
  15. # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  16. # FOR A PARTICULAR PURPOSE.
  17. # See the GNU Affero General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Affero General Public License
  20. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. #
  22. # Commercial License Information:
  23. #
  24. # For a copy of the commercial license terms and conditions,
  25. # go to: (http://www.pbspro.com/UserArea/agreement.html)
  26. # or contact the Altair Legal Department.
  27. #
  28. # Altair’s dual-license business model allows companies, individuals, and
  29. # organizations to create proprietary derivative works of PBS Pro and
  30. # distribute them - whether embedded or bundled with other software -
  31. # under a commercial license agreement.
  32. #
  33. # Use of Altair’s trademarks, including but not limited to "PBS™",
  34. # "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
  35. # trademark licensing policies.
  36. from tests.functional import *
  37. @tags('cray', 'mom', 'configuration')
  38. class TestVnodePerNumaNode(TestFunctional):
  39. """
  40. This test suite is for testing the new mom_priv configuration
  41. parameter, vnode_per_numa_node.
  42. Test that the information is correctly being compressed into one vnode
  43. using the default setting (equivalent to FALSE).
  44. """
  45. def setUp(self):
  46. if not self.du.get_platform().startswith('cray'):
  47. self.skipTest("Test suite only meant to run on a Cray")
  48. TestFunctional.setUp(self)
  49. @tags('cray', 'smoke')
  50. def test_settings(self):
  51. """
  52. vnode_per_numa_node is unset (defaults to FALSE).
  53. Set $vnode_per_numa_node to TRUE
  54. Sum up the ncpus, memory, and naccelerators for all vnodes that
  55. have the same host (i.e. NUMA nodes that belong to the same compute
  56. node).
  57. Unset $vnode_per_numa_node in mom_priv/config.
  58. Now for each host, compare the ncpus, mem, and naccelerators against
  59. the values we got when $vnode_per_numa_node was set to TRUE.
  60. They should be equal.
  61. Verify that PBS created only one vnode, and:
  62. - PBScrayseg attribute is not set
  63. - ncpus is a total from all NUMA nodes of that node
  64. - mem is a total from all NUMA nodes of that node
  65. - the naccelerators value is correct
  66. - the accelerator_memory value is correct
  67. Set $vnode_per_numa_node to FALSE.
  68. Compare the pbsnodes output when vnode_per_numa_node was unset
  69. versus when vnode_per_numa_node was set to False.
  70. """
  71. dncpus = {}
  72. dmem = {}
  73. dacc = {}
  74. daccmem = {}
  75. # First we mimic old behavior by setting vnode_per_numa_node to TRUE
  76. # Do not HUP now, we will do so when we reset the nodes
  77. rv = self.mom.add_config({'$vnode_per_numa_node': True}, False)
  78. self.assertTrue(rv)
  79. # Start from a clean slate, delete any existing nodes and re-create
  80. # them
  81. momname = self.mom.shortname
  82. self.reset_nodes(momname)
  83. # Get the pbsnodes -av output for comparison later
  84. vnodes_pernuma = self.server.status(NODE)
  85. for n in vnodes_pernuma:
  86. if n['resources_available.host'] not in dncpus.keys():
  87. dncpus[n['resources_available.host']] = int(
  88. n['resources_available.ncpus'])
  89. else:
  90. dncpus[n['resources_available.host']
  91. ] += int(n['resources_available.ncpus'])
  92. if n['resources_available.host'] not in dmem.keys():
  93. dmem[n['resources_available.host']] = int(
  94. n['resources_available.mem'][0:-2])
  95. else:
  96. dmem[n['resources_available.host']
  97. ] += int(n['resources_available.mem'][0:-2])
  98. if 'resources_available.naccelerators' in n.keys():
  99. if n['resources_available.naccelerators'][0] != '@':
  100. if n['resources_available.host'] not in dacc.keys():
  101. dacc[n['resources_available.host']] = int(
  102. n['resources_available.naccelerators'])
  103. else:
  104. dacc[n['resources_available.host']
  105. ] += int(n['resources_available.naccelerators'])
  106. if 'resources_available.accelerator_memory' in n.keys():
  107. if n['resources_available.accelerator_memory'][0] != '@':
  108. if n['resources_available.host'] not in daccmem.keys():
  109. daccmem[n['resources_available.host']] = int(
  110. n['resources_available.accelerator_memory'][0:-2])
  111. else:
  112. daccmem[n['resources_available.host']] += int(n[
  113. 'resources_available.accelerator_memory'][0:-2])
  114. # Remove the configuration setting and re-read the vnodes
  115. rv = self.mom.unset_mom_config('$vnode_per_numa_node', False)
  116. self.assertTrue(rv)
  117. self.reset_nodes(momname)
  118. vnodes_combined = self.server.status(NODE)
  119. # Compare the multiple vnodes values to the combined vnode output
  120. for n in vnodes_combined:
  121. if 'resources_available.PBScrayseg' in n:
  122. self.logger.error(
  123. "ERROR resources_available.PBScrayseg was found.")
  124. self.assertTrue(False)
  125. self.assertEqual(int(n['resources_available.ncpus']), dncpus[
  126. n['resources_available.host']])
  127. self.assertEqual(int(n['resources_available.mem'][0:-2]), dmem[
  128. n['resources_available.host']])
  129. if 'resources_available.naccelerators' in n:
  130. self.assertEqual(int(n['resources_available.naccelerators']),
  131. dacc[n['resources_available.host']])
  132. if 'resources_available.accelerator_memory' in n:
  133. self.assertEqual(int(n['resources_available.accelerator_memory'
  134. ][0:-2]),
  135. daccmem[n['resources_available.host']])
  136. # Set vnode_per_numa_node to FALSE and re-read the vnodes
  137. rv = self.mom.add_config({'$vnode_per_numa_node': False}, False)
  138. self.assertTrue(rv)
  139. self.reset_nodes(momname)
  140. vnodes_combined1 = self.server.status(NODE)
  141. # Compare the pbsnodes output when vnode_per_numa_node was unset
  142. # versus when vnode_per_numa_node was set to False.
  143. if (len(vnodes_combined) == len(vnodes_combined1)):
  144. self.logger.info(
  145. "pbsnodes outputs are equal in length.")
  146. for n in vnodes_combined:
  147. if n not in vnodes_combined1:
  148. self.logger.error(
  149. "ERROR vnode %s has differing element." % n['id'])
  150. self.assertTrue(False)
  151. self.logger.info(
  152. "pbsnodes outputs are the same.")
  153. else:
  154. self.logger.error(
  155. "ERROR pbsnodes outputs differ in length.")
  156. self.assertTrue(False)
  157. def restartPBS(self):
  158. try:
  159. svcs = PBSInitServices()
  160. svcs.restart()
  161. except PbsInitServicesError, e:
  162. self.logger.error("PBS restart failed: \n" + e.msg)
  163. self.assertTrue(e.rv)
  164. def reset_nodes(self, hostA):
  165. """
  166. Reset nodes.
  167. """
  168. # Remove all nodes
  169. rv = self.server.manager(MGR_CMD_DELETE, NODE, None, "")
  170. self.assertEqual(rv, 0)
  171. # Restart PBS
  172. self.restartPBS()
  173. # Create node
  174. rv = self.server.manager(MGR_CMD_CREATE, NODE, None, hostA)
  175. self.assertEqual(rv, 0)
  176. # Wait for 3 seconds for changes to take effect
  177. time.sleep(3)