# pbs_cgroups_stress.py
  1. # coding: utf-8
  2. # Copyright (C) 1994-2018 Altair Engineering, Inc.
  3. # For more information, contact Altair at www.altair.com.
  4. #
  5. # This file is part of the PBS Professional ("PBS Pro") software.
  6. #
  7. # Open Source License Information:
  8. #
  9. # PBS Pro is free software. You can redistribute it and/or modify it under the
  10. # terms of the GNU Affero General Public License as published by the Free
  11. # Software Foundation, either version 3 of the License, or (at your option) any
  12. # later version.
  13. #
  14. # PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
  15. # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  16. # FOR A PARTICULAR PURPOSE.
  17. # See the GNU Affero General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Affero General Public License
  20. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. #
  22. # Commercial License Information:
  23. #
  24. # For a copy of the commercial license terms and conditions,
  25. # go to: (http://www.pbspro.com/UserArea/agreement.html)
  26. # or contact the Altair Legal Department.
  27. #
  28. # Altair’s dual-license business model allows companies, individuals, and
  29. # organizations to create proprietary derivative works of PBS Pro and
  30. # distribute them - whether embedded or bundled with other software -
  31. # under a commercial license agreement.
  32. #
  33. # Use of Altair’s trademarks, including but not limited to "PBS™",
  34. # "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
  35. # trademark licensing policies.
  36. from tests.performance import *
  37. def is_memsw_enabled(mem_path):
  38. """
  39. Check if system has swapcontrol enabled, then return true
  40. else return false
  41. """
  42. # List all files and check if memsw files exists
  43. for files in os.listdir(mem_path):
  44. if 'memory.memsw' in files:
  45. return 'true'
  46. return 'false'
  47. class TestCgroupsStress(TestPerformance):
  48. """
  49. This test suite targets Linux Cgroups hook stress.
  50. """
  51. def setUp(self):
  52. TestPerformance.setUp(self)
  53. self.true_script = """#!/bin/bash
  54. #PBS -joe
  55. /bin/true
  56. """
  57. self.cfg0 = """{
  58. "cgroup_prefix" : "pbspro",
  59. "exclude_hosts" : [],
  60. "exclude_vntypes" : [],
  61. "run_only_on_hosts" : [],
  62. "periodic_resc_update" : false,
  63. "vnode_per_numa_node" : false,
  64. "online_offlined_nodes" : false,
  65. "use_hyperthreads" : false,
  66. "cgroup" : {
  67. "cpuacct" : {
  68. "enabled" : false
  69. },
  70. "cpuset" : {
  71. "enabled" : false
  72. },
  73. "devices" : {
  74. "enabled" : false
  75. },
  76. "hugetlb" : {
  77. "enabled" : false
  78. },
  79. "memory":
  80. {
  81. "enabled" : true,
  82. "exclude_hosts" : [],
  83. "exclude_vntypes" : [],
  84. "soft_limit" : false,
  85. "default" : "256MB",
  86. "reserve_percent" : "0",
  87. "reserve_amount" : "0MB"
  88. },
  89. "memsw":
  90. {
  91. "enabled" : %s,
  92. "exclude_hosts" : [],
  93. "exclude_vntypes" : [],
  94. "default" : "256MB",
  95. "reserve_percent" : "0",
  96. "reserve_amount" : "128MB"
  97. }
  98. }
  99. }"""
  100. self.noprefix = False
  101. self.paths = self.get_paths()
  102. if not (self.paths['cpuset'] and self.paths['memory']):
  103. self.skipTest('cpuset or memory cgroup subsystem not mounted')
  104. self.swapctl = is_memsw_enabled(self.paths['memsw'])
  105. self.server.set_op_mode(PTL_CLI)
  106. self.server.cleanup_jobs(extend='force')
  107. Job.dflt_attributes[ATTR_k] = 'oe'
  108. # Configure the scheduler to schedule using vmem
  109. a = {'resources': 'ncpus,mem,vmem,host,vnode'}
  110. self.scheduler.set_sched_config(a)
  111. # Import the hook
  112. self.hook_name = 'pbs_cgroups'
  113. self.hook_file = os.path.join(self.server.pbs_conf['PBS_EXEC'],
  114. 'lib',
  115. 'python',
  116. 'altair',
  117. 'pbs_hooks',
  118. 'pbs_cgroups.PY')
  119. self.load_hook(self.hook_file)
  120. # Enable the cgroups hook
  121. conf = {'enabled': 'True', 'freq': 2}
  122. self.server.manager(MGR_CMD_SET, HOOK, conf, self.hook_name)
  123. # Restart mom so exechost_startup hook is run
  124. self.mom.signal('-HUP')
  125. def get_paths(self):
  126. """
  127. Returns a dictionary containing the location where each cgroup
  128. is mounted.
  129. """
  130. paths = {'pids': None,
  131. 'blkio': None,
  132. 'systemd': None,
  133. 'cpuset': None,
  134. 'memory': None,
  135. 'memsw': None,
  136. 'cpuacct': None,
  137. 'devices': None}
  138. # Loop through the mounts and collect the ones for cgroups
  139. with open(os.path.join(os.sep, 'proc', 'mounts'), 'r') as fd:
  140. for line in fd:
  141. entries = line.split()
  142. if entries[2] != 'cgroup':
  143. continue
  144. flags = entries[3].split(',')
  145. if 'noprefix' in flags:
  146. self.noprefix = True
  147. subsys = os.path.basename(entries[1])
  148. paths[subsys] = entries[1]
  149. if 'memory' in flags:
  150. paths['memsw'] = paths[subsys]
  151. paths['memory'] = paths[subsys]
  152. if 'cpuacct' in flags:
  153. paths['cpuacct'] = paths[subsys]
  154. if 'devices' in flags:
  155. paths['devices'] = paths[subsys]
  156. return paths
  157. def load_hook(self, filename):
  158. """
  159. Import and enable a hook pointed to by the URL specified.
  160. """
  161. try:
  162. with open(filename, 'r') as fd:
  163. script = fd.read()
  164. except IOError:
  165. self.assertTrue(False, "Failed to open hook file %s" % filename)
  166. events = '"execjob_begin,execjob_launch,execjob_attach,'
  167. events += 'execjob_epilogue,execjob_end,exechost_startup,'
  168. events += 'exechost_periodic"'
  169. a = {'enabled': 'True',
  170. 'freq': '2',
  171. 'event': events}
  172. self.server.create_import_hook(self.hook_name, a, script,
  173. overwrite=True)
  174. # Add the configuration
  175. self.load_config(self.cfg0 % self.swapctl)
  176. def load_config(self, cfg):
  177. """
  178. Create a hook configuration file with the provided contents.
  179. """
  180. fn = self.du.create_temp_file(body=cfg)
  181. a = {'content-type': 'application/x-config',
  182. 'content-encoding': 'default',
  183. 'input-file': fn}
  184. self.server.manager(MGR_CMD_IMPORT, HOOK, a, self.hook_name)
  185. os.remove(fn)
  186. self.mom.log_match('pbs_cgroups.CF;copy hook-related ' +
  187. 'file request received',
  188. max_attempts=5,
  189. starttime=self.server.ctime)
  190. self.logger.info("Current config: %s" % cfg)
  191. # Restart MoM to work around PP-993
  192. self.mom.restart()
  193. @timeout(1200)
  194. def test_cgroups_race_condition(self):
  195. """
  196. Test to ensure a cgroups event does not read the cgroups file system
  197. while another event is writing to it. By submitting 1000 instant jobs,
  198. the events should collide at least once.
  199. """
  200. pcpus = 0
  201. with open('/proc/cpuinfo', 'r') as desc:
  202. for line in desc:
  203. if re.match('^processor', line):
  204. pcpus += 1
  205. if pcpus < 8:
  206. self.skipTest("Test requires at least 8 physical CPUs")
  207. attr = {'job_history_enable': 'true'}
  208. self.server.manager(MGR_CMD_SET, SERVER, attr)
  209. self.load_config(self.cfg0 % self.swapctl)
  210. now = time.time()
  211. j = Job(TEST_USER, attrs={ATTR_J: '0-1000'})
  212. j.create_script(self.true_script)
  213. jid = self.server.submit(j)
  214. jid = jid.split(']')[0]
  215. done = False
  216. for i in range(0, 1000):
  217. # Build the subjob id and ensure it is complete
  218. sjid = jid + str(i) + "]"
  219. # If the array job is finished, it and all subjobs will be put
  220. # into the F state. This can happen while checking the last
  221. # couple of subjobs. If this happens, we need to check for the
  222. # F state instead of the X state.
  223. if done:
  224. self.server.expect(
  225. JOB, {'job_state': 'F'}, id=sjid, extend='x')
  226. else:
  227. try:
  228. self.server.expect(
  229. JOB, {'job_state': 'X'}, id=sjid, extend='tx')
  230. except PtlExpectError:
  231. # The expect failed, maybe because the array job finished
  232. # Check for the F state for this and future subjobs.
  233. done = True
  234. self.server.expect(
  235. JOB, {'job_state': 'F'}, id=sjid, extend='x')
  236. # Check the logs for IOError every 100 subjobs, to reduce time of
  237. # a failing test.
  238. if i % 100 == 0:
  239. self.mom.log_match(msg="IOError", starttime=now,
  240. existence=False, max_attempts=1, n="ALL")
  241. # Check the logs one last time to ensure it passed
  242. self.mom.log_match(msg="IOError", starttime=now,
  243. existence=False, max_attempts=10, n="ALL")