# coding: utf-8
# Copyright (C) 1994-2018 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.
from tests.functional import *
def convert_time(fmt, tm, fixdate=False):
"""
Convert the given timestamp into the given format.
If fixdate is True, add a leading space before the day of the
month when it is <= 9 (this matches ctime-style output, which
qstat uses).
"""
rv = time.strftime(fmt, time.localtime(float(tm)))
if ((sys.platform not in ('cygwin', 'win32')) and (fixdate)):
rv = rv.split()
date = int(rv[2])
if date <= 9:
date = ' ' + str(date)
rv[2] = str(date)
rv = ' '.join(rv)
return rv
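# Illustrative usage of convert_time() (not used by the tests; the
# timestamp below is arbitrary and the result depends on the local
# timezone):
#   convert_time("%a %b %d %H:%M:%S %Y", "1520000000", fixdate=True)
# returns something like "Fri Mar  2 14:13:20 2018", with the extra
# space before the single-digit day matching ctime()/qstat output.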
class TestPbsNodeRampDown(TestFunctional):
"""
This tests the Node Rampdown Feature, where, while a job is
running, vnodes/resources assigned from hosts other than the
mother superior (primary execution host) can be released.
Custom parameters:
moms: colon-separated hostnames of three MoMs
"""
def transform_select(self, select):
"""
Takes a select substring of the form:
"<res1>=<val1>:<res2>=<val2>:...:<resN>=<valN>"
and transforms it so that if any of the resources
(res1, res2, ..., resN) matches 'mem' and the
corresponding value has a suffix of 'gb', then the value
is converted to its 'kb' equivalent. Also, this
prepends "1:" to the returned select substring.
Ex:
% str = "ncpus=7:mem=2gb:ompthreads=3"
% transform_select(str)
1:ompthreads=3:mem=2097152kb:ncpus=7
"""
sel_list = select.split(':')
mystr = "1:"
for index in range(len(sel_list) - 1, -1, -1):
if (index != len(sel_list) - 1):
mystr += ":"
nums = [s for s in sel_list[index] if s.isdigit()]
key = sel_list[index].split('=')[0]
if key == "mem":
mystr += sel_list[index].\
replace(nums[0] + "gb",
str(int(nums[0]) * 1024 * 1024)) + "kb"
else:
mystr += sel_list[index]
return mystr
def pbs_nodefile_match_exec_host(self, jid, exec_host,
schedselect=None):
"""
Look into the PBS_NODEFILE on the first host listed in 'exec_host'
and return True if all host entries in 'exec_host' match the entries
in the file; otherwise, return False.
If 'schedselect' is not None, also look for its 'mpiprocs' values and
verify that each corresponding node host appears in PBS_NODEFILE
'mpiprocs' number of times.
"""
pbs_nodefile = os.path.join(self.server.
pbs_conf['PBS_HOME'], 'aux', jid)
# look for mpiprocs settings
mpiprocs = []
if schedselect is not None:
select_list = schedselect.split('+')
for chunk in select_list:
chl = chunk.split(':')
for ch in chl:
if ch.find('=') != -1:
c = ch.split('=')
if c[0] == "mpiprocs":
mpiprocs.append(c[1])
ehost = exec_host.split('+')
first_host = ehost[0].split('/')[0]
cmd = ['cat', pbs_nodefile]
ret = self.server.du.run_cmd(first_host, cmd, sudo=False)
ehost2 = []
for h in ret['out']:
ehost2.append(h.split('.')[0])
ehost1 = []
j = 0
for eh in ehost:
h = eh.split('/')
if (len(mpiprocs) > 0):
for k in range(int(mpiprocs[j])):
ehost1.append(h[0])
else:
ehost1.append(h[0])
j += 1
if cmp(ehost1, ehost2) != 0:
return False
return True
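# Illustrative example (hostnames are hypothetical): for
#   exec_host = "hostA/0*0+hostB/0*0"
#   schedselect = "1:ncpus=3:mpiprocs=3+1:ncpus=3:mpiprocs=3"
# the PBS_NODEFILE on hostA is expected to list hostA three times
# followed by hostB three times (one entry per mpiprocs slot); when
# schedselect is None, each exec_host entry must appear exactly once.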
def license_count_match(self, num_licenses):
"""
Raise an AssertionError if the server's license_count 'Used' value
does not equal 'num_licenses', retrying a few times before giving up.
"""
n = retry = 5
for _ in range(n):
server_stat = self.server.status(SERVER, 'license_count')
lic_count = server_stat[0]['license_count']
for lic in lic_count.split():
lic_split = lic.split(':')
if lic_split[0] == 'Used':
actual_licenses = int(lic_split[1])
if actual_licenses == num_licenses:
return
break
retry -= 1
if retry == 0:
raise AssertionError("not found %d licenses" % (num_licenses,))
self.logger.info("sleeping 3 secs before next retry")
time.sleep(3)
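# license_count_match() parses the server's license_count attribute,
# which is a space-separated list of "name:value" pairs, for example
# (illustrative values) "Avail_Global:0 Avail_Local:256 Used:8
# High_Use:8", and compares the 'Used' value against num_licenses.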
def match_accounting_log(self, atype, jid, exec_host, exec_vnode,
mem, ncpus, nodect, place, select):
"""
This checks that there is an accounting log record of type 'atype'
for job 'jid' containing the given values (exec_host, exec_vnode,
Resource_List.mem, Resource_List.ncpus, Resource_List.nodect,
Resource_List.place, Resource_List.select).
This throws an exception if a matching accounting_logs entry is
not found.
Some example values of 'atype' are: 'u' (update record due to
release node request), 'c' (record containing the next
set of resources to be used by a phased job as a result of
release node request), 'e' (last update record for a phased job
due to a release node request), 'E' (end of job record).
"""
self.server.accounting_match(
msg=".*%s;%s.*exec_host=%s.*" % (atype, jid, exec_host),
regexp=True, n=20)
self.server.accounting_match(
msg=".*%s;%s.*exec_vnode=%s.*" % (atype, jid, exec_vnode),
regexp=True, n=20)
self.server.accounting_match(
msg=".*%s;%s.*Resource_List\.mem=%s.*" % (atype, jid, mem),
regexp=True, n=20)
self.server.accounting_match(
msg=".*%s;%s.*Resource_List\.ncpus=%d.*" % (atype, jid, ncpus),
regexp=True, n=20)
self.server.accounting_match(
msg=".*%s;%s.*Resource_List\.nodect=%d.*" % (atype, jid, nodect),
regexp=True, n=20)
self.server.accounting_match(
msg=".*%s;%s.*Resource_List\.place=%s.*" % (atype, jid, place),
regexp=True, n=20)
self.server.accounting_match(
msg=".*%s;%s.*Resource_List\.select=%s.*" % (atype, jid, select),
regexp=True, n=20)
if atype != 'c':
self.server.accounting_match(
msg=".*%s;%s.*resources_used\..*" % (atype, jid),
regexp=True, n=20)
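# Illustrative accounting_logs entry that the matches above would
# accept (record id and values are hypothetical):
#   ...;u;173.borg;user=pbsuser ... exec_host=hostA/0*0+hostB/0*2
#       Resource_List.mem=6gb Resource_List.ncpus=8 ...
# Each accounting_match() call only requires its own field to appear
# in some record of type 'atype' for job 'jid'.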
def match_vnode_status(self, vnode_list, state, jobs=None, ncpus=None,
mem=None):
"""
Given a list of vnode names in 'vnode_list', check to make
sure each vnode's state, jobs string, resources_assigned.mem,
and resources_assigned.ncpus match the passed arguments.
This will throw an exception if a match is not found.
"""
for vn in vnode_list:
dict_match = {'state': state}
if jobs is not None:
dict_match['jobs'] = jobs
if ncpus is not None:
dict_match['resources_assigned.ncpus'] = ncpus
if mem is not None:
dict_match['resources_assigned.mem'] = mem
self.server.expect(VNODE, dict_match, id=vn)
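# Illustrative call (vnode name and job id are hypothetical):
#   self.match_vnode_status(['hostA[0]'], 'job-busy', '12.borg/0',
#                           1, '1048576kb')
# asserts that hostA[0] is job-busy, assigned to job 12.borg (task
# slot 0), with 1 cpu and 1gb of memory assigned.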
def create_and_submit_job(self, job_type, attribs={}):
"""
Create a job object based on 'job_type' and the attributes
dictionary 'attribs', and submit it to the server.
"""
retjob = Job(TEST_USER, attrs=attribs)
# Every known job type maps directly to an entry in self.script;
# any other job_type is submitted with only the given attributes.
if job_type in self.script:
    retjob.create_script(self.script[job_type])
return self.server.submit(retjob)
def setUp(self):
if len(self.moms) != 3:
self.skip_test(reason="need 3 mom hosts: -p moms=<m1>:<m2>:<m3>")
TestFunctional.setUp(self)
Job.dflt_attributes[ATTR_k] = 'oe'
self.server.cleanup_jobs(extend="force")
self.momA = self.moms.values()[0]
self.momB = self.moms.values()[1]
self.momC = self.moms.values()[2]
# Now start setting up and creating the vnodes
self.server.manager(MGR_CMD_DELETE, NODE, None, "")
# set node momA
self.hostA = self.momA.shortname
self.momA.delete_vnode_defs()
vnode_prefix = self.hostA
a = {'resources_available.mem': '1gb',
'resources_available.ncpus': '1'}
vnodedef = self.momA.create_vnode_def(vnode_prefix, a, 4)
self.assertNotEqual(vnodedef, None)
self.momA.insert_vnode_def(vnodedef, 'vnode.def')
self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostA)
# set node momB
self.hostB = self.momB.shortname
self.momB.delete_vnode_defs()
vnode_prefix = self.hostB
a = {'resources_available.mem': '1gb',
'resources_available.ncpus': '1'}
vnodedef = self.momB.create_vnode_def(vnode_prefix, a, 5,
usenatvnode=True)
self.assertNotEqual(vnodedef, None)
self.momB.insert_vnode_def(vnodedef, 'vnode.def')
self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostB)
# set node momC
# This one has no vnode definition.
self.hostC = self.momC.shortname
self.momC.delete_vnode_defs()
self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostC)
a = {'resources_available.ncpus': 2,
'resources_available.mem': '2gb'}
# set natural vnode of hostC
self.server.manager(MGR_CMD_SET, NODE, a, id=self.hostC,
expect=True)
a = {'state': 'free', 'resources_available.ncpus': (GE, 1)}
self.server.expect(VNODE, {'state=free': 11}, op=EQ, count=True,
max_attempts=10, interval=2)
# Various node names
self.n0 = self.hostA
self.n1 = '%s[0]' % (self.hostA,)
self.n2 = '%s[1]' % (self.hostA,)
self.n3 = '%s[2]' % (self.hostA,)
self.n4 = self.hostB
self.n5 = '%s[0]' % (self.hostB,)
self.n6 = '%s[1]' % (self.hostB,)
self.n7 = self.hostC
self.n8 = '%s[3]' % (self.hostA,)
self.n9 = '%s[2]' % (self.hostB,)
self.n10 = '%s[3]' % (self.hostB,)
if sys.platform in ('cygwin', 'win32'):
SLEEP_CMD = "pbs-sleep"
else:
SLEEP_CMD = "/bin/sleep"
self.pbs_release_nodes_cmd = os.path.join(
self.server.pbs_conf['PBS_EXEC'], 'bin', 'pbs_release_nodes')
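# The FIB* strings below are shell-escaped pbs_python commands that
# compute a Fibonacci number with a naive recursive function; they
# exist purely to keep the assigned CPUs busy for a long time.
# Unescaped, FIB40 amounts to roughly:
#   pbs_python -c "exec(\"def fib(i): ... print fib(40)\")"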
FIB40 = os.path.join(self.server.pbs_conf['PBS_EXEC'], 'bin', '') + \
'pbs_python -c "exec(\\\"def fib(i):\\n if i < 2:\\n \
return i\\n return fib(i-1) + fib(i-2)\\n\\nprint fib(40)\\\")"'
FIB45 = os.path.join(self.server.pbs_conf['PBS_EXEC'], 'bin', '') + \
'pbs_python -c "exec(\\\"def fib(i):\\n if i < 2:\\n \
return i\\n return fib(i-1) + fib(i-2)\\n\\nprint fib(45)\\\")"'
FIB50 = os.path.join(self.server.pbs_conf['PBS_EXEC'], 'bin', '') + \
'pbs_python -c "exec(\\\"def fib(i):\\n if i < 2:\\n \
return i\\n return fib(i-1) + fib(i-2)\\n\\nprint fib(50)\\\")"'
FIB400 = os.path.join(self.server.pbs_conf['PBS_EXEC'], 'bin', '') + \
'pbs_python -c "exec(\\\"def fib(i):\\n if i < 2:\\n \
return i\\n return fib(i-1) + fib(i-2)\\n\\nprint fib(400)\\\")"'
# job submission arguments
self.script = {}
self.job1_select = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=2:mem=2gb"
self.job1_place = "scatter"
# expected values upon successful job submission
self.job1_schedselect = "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+" + \
"1:ncpus=2:mem=2gb"
self.job1_exec_host = "%s/0*0+%s/0*0+%s/0*2" % (
self.n0, self.n4, self.n7)
self.job1_exec_vnode = \
"(%s:mem=1048576kb:ncpus=1+" % (self.n1,) + \
"%s:mem=1048576kb:ncpus=1+" % (self.n2,) + \
"%s:ncpus=1)+" % (self.n3) + \
"(%s:mem=1048576kb:ncpus=1+" % (self.n4,) + \
"%s:mem=1048576kb:ncpus=1+" % (self.n5,) + \
"%s:ncpus=1)+" % (self.n6,) + \
"(%s:ncpus=2:mem=2097152kb)" % (self.n7,)
self.job1_sel_esc = self.job1_select.replace("+", "\+")
self.job1_exec_host_esc = self.job1_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
self.job1_exec_vnode_esc = self.job1_exec_vnode.replace(
"[", "\[").replace("]", "\]").replace("(", "\(").replace(
")", "\)").replace("+", "\+")
self.job1_newsel = self.transform_select(self.job1_select.split(
'+')[0])
self.job1_new_exec_host = self.job1_exec_host.split('+')[0]
self.job1_new_exec_vnode = self.job1_exec_vnode.split(')')[0] + ')'
self.job1_new_exec_vnode_esc = \
self.job1_new_exec_vnode.replace("[", "\[").replace(
"]", "\]").replace("(", "\(").replace(")", "\)").replace(
"+", "\+")
self.script['job1'] = \
"#PBS -S /bin/bash\n" \
"#PBS -l select=" + self.job1_select + "\n" + \
"#PBS -l place=" + self.job1_place + "\n" + \
"#PBS -W stageout=test.img@%s:test.img\n" % (self.n4,) + \
"#PBS -W release_nodes_on_stageout=true\n" + \
"dd if=/dev/zero of=test.img count=1024 bs=1048576\n" + \
"pbsdsh -n 1 -- %s\n" % (FIB40,) + \
"pbsdsh -n 2 -- %s\n" % (FIB40,) + \
"%s\n" % (FIB50,)
self.script['job1_1'] = \
"#PBS -S /bin/bash\n" \
"#PBS -l select=" + self.job1_select + "\n" + \
"#PBS -l place=" + self.job1_place + "\n" + \
"#PBS -W stageout=test.img@%s:test.img\n" % (self.n4,) + \
"#PBS -W release_nodes_on_stageout=false\n" + \
"dd if=/dev/zero of=test.img count=1024 bs=1048576\n" + \
"pbsdsh -n 1 -- %s\n" % (FIB40,) + \
"pbsdsh -n 2 -- %s\n" % (FIB40,) + \
"%s\n" % (FIB50,)
self.script['job1_2'] = \
"#PBS -S /bin/bash\n" \
"#PBS -l select=" + self.job1_select + "\n" + \
"#PBS -l place=" + self.job1_place + "\n" + \
"#PBS -W stageout=test.img@%s:test.img\n" % (self.n4,) + \
"dd if=/dev/zero of=test.img count=1024 bs=1048576\n" + \
"pbsdsh -n 1 -- %s\n" % (FIB40,) + \
"pbsdsh -n 2 -- %s\n" % (FIB40,) + \
"%s\n" % (FIB50,)
self.script['job1_3'] = \
"#PBS -S /bin/bash\n" \
"#PBS -l select=" + self.job1_select + "\n" + \
"#PBS -l place=" + self.job1_place + "\n" + \
SLEEP_CMD + " 5\n" + \
"pbs_release_nodes -a\n" + \
"%s\n" % (FIB50,)
self.script['job1_5'] = \
"#PBS -S /bin/bash\n" \
"#PBS -l select=" + self.job1_select + "\n" + \
"#PBS -l place=" + self.job1_place + "\n" + \
"pbsdsh -n 1 -- %s &\n" % (FIB45,) + \
"pbsdsh -n 2 -- %s &\n" % (FIB45,) + \
"%s\n" % (FIB45,)
self.script['jobA'] = \
"#PBS -S /bin/bash\n" \
"#PBS -l select=" + self.job1_select + "\n" + \
"#PBS -l place=" + self.job1_place + "\n" + \
"#PBS -J 1-5\n"\
"pbsdsh -n 1 -- %s &\n" % (FIB45,) + \
"pbsdsh -n 2 -- %s &\n" % (FIB45,) + \
"%s\n" % (FIB45,)
self.script['job1_6'] = \
"#PBS -S /bin/bash\n" \
"#PBS -l select=" + self.job1_select + "\n" + \
"#PBS -l place=" + self.job1_place + "\n" + \
SLEEP_CMD + " 5\n" + \
self.pbs_release_nodes_cmd + " " + self.n4 + "\n" + \
"%s\n" % (FIB50,)
self.job1_extra_res_select = \
"ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+" + \
"ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+" + \
"ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
self.job1_extra_res_place = "scatter"
self.job1_extra_res_schedselect = \
"1:ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+" + \
"1:ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+" + \
"1:ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
self.job1_extra_res_exec_host = "%s/0*0+%s/0*0+%s/0*2" % (
self.n0, self.n4, self.n7)
self.job1_extra_res_exec_vnode = \
"(%s:mem=1048576kb:ncpus=1+" % (self.n1,) + \
"%s:mem=1048576kb:ncpus=1+" % (self.n2,) + \
"%s:ncpus=1)+" % (self.n3,) + \
"(%s:mem=1048576kb:ncpus=1+" % (self.n4,) + \
"%s:mem=1048576kb:ncpus=1+" % (self.n5,) + \
"%s:ncpus=1)+" % (self.n6,) + \
"(%s:ncpus=2:mem=2097152kb)" % (self.n7,)
self.script['job1_extra_res'] = \
"#PBS -S /bin/bash\n" \
"#PBS -l select=" + self.job1_extra_res_select + "\n" + \
"#PBS -l place=" + self.job1_extra_res_place + "\n" + \
"pbsdsh -n 1 -- %s &\n" % (FIB40,) + \
"pbsdsh -n 2 -- %s &\n" % (FIB40,) + \
"%s\n" % (FIB50,)
self.job2_select = "ncpus=1:mem=1gb+ncpus=4:mem=4gb+ncpus=2:mem=2gb"
self.job2_place = "scatter"
self.job2_schedselect = "1:ncpus=1:mem=1gb+1:ncpus=4:mem=4gb+" + \
"1:ncpus=2:mem=2gb"
self.job2_exec_host = "%s/1+%s/1*0+%s/1*2" % (
self.n0, self.n4, self.n7)
self.job2_exec_vnode = \
"(%s:ncpus=1:mem=1048576kb)+" % (self.n8,) + \
"(%s:mem=1048576kb:ncpus=1+" % (self.n4,) + \
"%s:mem=1048576kb:ncpus=1+" % (self.n5,) + \
"%s:mem=1048576kb:ncpus=1+" % (self.n9,) + \
"%s:mem=1048576kb:ncpus=1)+" % (self.n10,) + \
"(%s:ncpus=2:mem=2097152kb)" % (self.n7,)
self.job2_exec_vnode_var1 = \
"(%s:ncpus=1:mem=1048576kb)+" % (self.n8,) + \
"(%s:mem=1048576kb:ncpus=1+" % (self.n4,) + \
"%s:mem=1048576kb:ncpus=1+" % (self.n5,) + \
"%s:mem=1048576kb:ncpus=1+" % (self.n6,) + \
"%s:mem=1048576kb:ncpus=1)+" % (self.n9,) + \
"(%s:ncpus=2:mem=2097152kb)" % (self.n7,)
self.script['job2'] = \
"#PBS -l select=" + self.job2_select + "\n" + \
"#PBS -l place=" + self.job2_place + "\n" + \
SLEEP_CMD + " 60\n"
self.script['job3'] = \
"#PBS -l select=vnode=" + self.n4 + "+vnode=" + self.n0 + \
":mem=4mb\n" + SLEEP_CMD + " 30\n"
self.script['job5'] = \
"#PBS -l select=vnode=" + self.n0 + ":mem=4mb\n" + \
SLEEP_CMD + " 300\n"
self.job11x_select = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=1:mem=1gb"
self.job11x_place = "scatter:excl"
self.job11x_schedselect = "1:ncpus=3:mem=2gb+" + \
"1:ncpus=3:mem=2gb+1:ncpus=1:mem=1gb"
self.job11x_exec_host = "%s/0*0+%s/0*0+%s/0" % (
self.n0, self.n4, self.n7)
self.job11x_exec_vnode = \
"(%s:mem=1048576kb:ncpus=1+" % (self.n1,) + \
"%s:mem=1048576kb:ncpus=1+" % (self.n2,) + \
"%s:ncpus=1)+" % (self.n3,) + \
"(%s:mem=1048576kb:ncpus=1+" % (self.n4,) + \
"%s:mem=1048576kb:ncpus=1+" % (self.n5,) + \
"%s:ncpus=1)+" % (self.n6,) + \
"(%s:ncpus=1:mem=1048576kb)" % (self.n7,)
self.script['job11x'] = \
"#PBS -S /bin/bash\n" \
"#PBS -l select=" + self.job11x_select + "\n" + \
"#PBS -l place=" + self.job11x_place + "\n" + \
"pbsdsh -n 1 -- %s\n" % (FIB40,) + \
"pbsdsh -n 2 -- %s\n" % (FIB40,) + \
"%s\n" % (FIB50,)
self.job11_select = "ncpus=3:mem=2gb+ncpus=3:mem=2gb+ncpus=1:mem=1gb"
self.job11_place = "scatter"
self.job11_schedselect = "1:ncpus=3:mem=2gb+1:ncpus=3:mem=2gb+" + \
"1:ncpus=1:mem=1gb"
self.job11_exec_host = "%s/0*0+%s/0*0+%s/0" % (
self.n0, self.n4, self.n7)
self.job11_exec_vnode = \
"(%s:mem=1048576kb:ncpus=1+" % (self.n1,) + \
"%s:mem=1048576kb:ncpus=1+" % (self.n2,) + \
"%s:ncpus=1)+" % (self.n3,) + \
"(%s:mem=1048576kb:ncpus=1+" % (self.n4,) + \
"%s:mem=1048576kb:ncpus=1+" % (self.n5,) + \
"%s:ncpus=1)+" % (self.n6,) + \
"(%s:ncpus=1:mem=1048576kb)" % (self.n7,)
self.script['job11'] = \
"#PBS -S /bin/bash\n" \
"#PBS -l select=" + self.job11_select + "\n" + \
"#PBS -l place=" + self.job11_place + "\n" + \
"pbsdsh -n 1 -- %s\n" % (FIB40,) + \
"pbsdsh -n 2 -- %s\n" % (FIB40,) + \
"%s\n" % (FIB50,)
self.job12_select = "vnode=%s:ncpus=1:mem=1gb" % (self.n7,)
self.job12_schedselect = "1:vnode=%s:ncpus=1:mem=1gb" % (self.n7,)
self.job12_place = "free"
self.job12_exec_host = "%s/1" % (self.n7,)
self.job12_exec_vnode = "(%s:ncpus=1:mem=1048576kb)" % (self.n7,)
self.script['job12'] = \
"#PBS -l select=" + self.job12_select + "\n" + \
"#PBS -l place=" + self.job12_place + "\n" + \
SLEEP_CMD + " 60\n"
self.job13_select = "3:ncpus=1"
self.script['job13'] = \
"#PBS -S /bin/bash\n" \
"#PBS -l select=" + self.job13_select + "\n" + \
"#PBS -l place=" + self.job1_place + "\n" + \
"pbsdsh -n 1 -- %s\n" % (FIB400,) + \
"pbsdsh -n 2 -- %s\n" % (FIB400,) + \
"pbsdsh -n 3 -- %s\n" % (FIB400,)
def tearDown(self):
self.momA.signal("-CONT")
self.momB.signal("-CONT")
self.momC.signal("-CONT")
TestFunctional.tearDown(self)
# Delete managers and operators if added
attrib = ['operators', 'managers']
self.server.manager(MGR_CMD_UNSET, SERVER, attrib, expect=True)
def test_release_nodes_on_stageout_true(self):
"""
Test:
qsub -W release_nodes_on_stageout=true job.script
where job.script specifies a select spec of
2 super-chunks of ncpus=3 and mem=2gb each,
and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter".
With release_nodes_on_stageout=true option, when
job is deleted and runs a lengthy stageout process,
only the primary execution host's
vnodes are left assigned to the job.
"""
# job1's script contains the directive
# release_nodes_on_stageout=true
jid = self.create_and_submit_job('job1')
self.server.expect(JOB, {'job_state': 'R',
'release_nodes_on_stageout': 'True',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Deleting the job will trigger the stageout process,
# at which time sister nodes are automatically released
# because release_nodes_on_stageout=true is set
self.server.delete(jid)
# Verify mom_logs
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.n4), n=10,
max_attempts=18, interval=2, regexp=True)
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.n7), n=10,
max_attempts=18, interval=2, regexp=True)
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
# Verify remaining job resources.
self.server.expect(JOB, {'job_state': 'E',
'Resource_List.mem': '2gb',
'Resource_List.ncpus': 3,
'Resource_List.select': self.job1_newsel,
'Resource_List.place': self.job1_place,
'Resource_List.nodect': 1,
'schedselect': self.job1_newsel,
'exec_host': self.job1_new_exec_host,
'exec_vnode': self.job1_new_exec_vnode},
id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(3)
# Check various vnode status
self.match_vnode_status([self.n1, self.n2],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
self.n7, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
# Check account update ('u') record
self.match_accounting_log('u', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, self.job1_new_exec_host,
self.job1_new_exec_vnode_esc, "2097152kb",
3, 1, self.job1_place, self.job1_newsel)
def test_release_nodes_on_stageout_false(self):
"""
Test:
qsub -W release_nodes_on_stageout=False job.script
where job.script specifies a select spec of
2 super-chunks of ncpus=3 and mem=2gb each,
and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter".
With release_nodes_on_stageout=false option, when job is
deleted and runs a lengthy stageout process, nothing
changes in job's vnodes assignment.
"""
# job1_1's script contains the directive
# release_nodes_on_stageout=false
jid = self.create_and_submit_job('job1_1')
self.server.expect(JOB, {'job_state': 'R',
'release_nodes_on_stageout': 'False',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# Deleting the job should not trigger automatic
# release of nodes because release_nodes_on_stageout=false is set
self.server.delete(jid)
# Verify mom_logs
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=5, interval=1,
existence=False)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=5, interval=1,
existence=False)
# Verify no change in remaining job resources.
self.server.expect(JOB, {'job_state': 'E',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy',
jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# Check for no existence of account update ('u') record
self.server.accounting_match(
msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
# Check for no existence of account next ('c') record
self.server.accounting_match(
msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
def test_release_nodes_on_stageout_default(self):
"""
Test:
qsub: no -Wrelease_nodes_on_stageout
option given.
Job runs as normal.
"""
jid = self.create_and_submit_job('job1_2')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.server.delete(jid)
# Verify mom_logs
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=5, interval=1,
existence=False)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=5, interval=1,
existence=False)
# Verify no change in remaining job resources.
self.server.expect(JOB, {'job_state': 'E',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy',
jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10],
'free')
# Check for no existence of account update ('u') record
self.server.accounting_match(
msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
# Check for no existence of account next ('c') record
self.server.accounting_match(
msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
def test_release_nodes_on_stageout_true_qalter(self):
"""
Test:
qalter -W release_nodes_on_stageout=true.
After running job is modified by qalter,
with release_nodes_on_stageout=true option, when
job is deleted and runs a lengthy stageout process,
only the primary execution host's
vnodes are left assigned to the job.
"""
jid = self.create_and_submit_job('job1_2')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# run qalter -Wrelease_nodes_on_stageout=true
self.server.alterjob(jid,
{ATTR_W: 'release_nodes_on_stageout=true'})
self.server.expect(JOB, {'release_nodes_on_stageout': 'True'}, id=jid)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# This triggers the lengthy stageout process
self.server.delete(jid)
# Verify mom_logs
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostB), n=10,
max_attempts=18, interval=2, regexp=True)
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostC), n=10,
max_attempts=18, interval=2, regexp=True)
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
self.server.expect(JOB, {'job_state': 'E',
'Resource_List.mem': '2gb',
'Resource_List.ncpus': 3,
'Resource_List.select': self.job1_newsel,
'Resource_List.place': self.job1_place,
'Resource_List.nodect': 1,
'schedselect': self.job1_newsel,
'exec_host': self.job1_new_exec_host,
'exec_vnode': self.job1_new_exec_vnode},
id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(3)
# Check various vnode status
self.match_vnode_status([self.n1, self.n2],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3], 'job-busy', jobs_assn1,
1, '0kb')
self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
self.n7, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
# Check account update ('u') record
self.match_accounting_log('u', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, self.job1_new_exec_host,
self.job1_new_exec_vnode_esc, "2097152kb",
3, 1, self.job1_place, self.job1_newsel)
def test_release_nodes_on_stageout_false_qalter(self):
"""
Test:
qalter -W release_nodes_on_stageout=False.
After running job is modified by qalter,
With release_nodes_on_stageout=false option, when job is
deleted and runs a lengthy stageout process, nothing
changes in job's vnodes assignment.
"""
jid = self.create_and_submit_job('job1_2')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# run qalter -Wrelease_nodes_on_stageout=false
self.server.alterjob(jid,
{ATTR_W: 'release_nodes_on_stageout=false'})
self.server.expect(JOB, {'release_nodes_on_stageout': 'False'}, id=jid)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# This triggers the lengthy stageout process
self.server.delete(jid)
# Verify mom_logs
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=5, interval=1,
existence=False)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=5, interval=1,
existence=False)
# Verify no change in remaining job resources.
self.server.expect(JOB, {'job_state': 'E',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy',
jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# Check for no existence of account update ('u') record
self.server.accounting_match(
msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
# Check for no existence of account next ('c') record
self.server.accounting_match(
msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
def test_hook_release_nodes_on_stageout_true(self):
"""
Test:
Using a queuejob hook to set
release_nodes_on_stageout=true.
When job is deleted and runs a
lengthy stageout process, only
the primary execution host's
vnodes are left assigned to the job.
"""
hook_body = """
import pbs
pbs.logmsg(pbs.LOG_DEBUG, "queuejob hook executed")
pbs.event().job.release_nodes_on_stageout=True
"""
hook_event = "queuejob"
hook_name = "qjob"
a = {'event': hook_event, 'enabled': 'true'}
self.server.create_import_hook(hook_name, a, hook_body)
jid = self.create_and_submit_job('job1_2')
self.server.log_match("queuejob hook executed", n=20,
max_attempts=25, interval=2)
self.server.expect(JOB, {'job_state': 'R',
'release_nodes_on_stageout': 'True',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Deleting the job will trigger the stageout process,
# at which time sister nodes are automatically released
# because release_nodes_on_stageout=true is set
self.server.delete(jid)
# Verify mom_logs
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.n4,), n=10,
max_attempts=18, interval=2, regexp=True)
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
max_attempts=18, interval=2, regexp=True)
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
# Verify remaining job resources.
self.server.expect(JOB, {'job_state': 'E',
'Resource_List.mem': '2gb',
'Resource_List.ncpus': 3,
'Resource_List.select': self.job1_newsel,
'Resource_List.place': self.job1_place,
'Resource_List.nodect': 1,
'schedselect': self.job1_newsel,
'exec_host': self.job1_new_exec_host,
'exec_vnode': self.job1_new_exec_vnode},
id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(3)
# Check various vnode status
self.match_vnode_status([self.n1, self.n2],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
self.n7, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
# Check account update ('u') record
self.match_accounting_log('u', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, self.job1_new_exec_host,
self.job1_new_exec_vnode_esc, "2097152kb",
3, 1, self.job1_place, self.job1_newsel)
def test_hook_release_nodes_on_stageout_false(self):
"""
Test:
Using a queuejob hook to set
-Wrelease_nodes_on_stageout=False.
When job is deleted and runs a
lengthy stageout process, nothing
changes in job's vnodes assignment.
"""
hook_body = """
import pbs
pbs.logmsg(pbs.LOG_DEBUG, "queuejob hook executed")
pbs.event().job.release_nodes_on_stageout=False
"""
hook_event = "queuejob"
hook_name = "qjob"
a = {'event': hook_event, 'enabled': 'true'}
self.server.create_import_hook(hook_name, a, hook_body)
jid = self.create_and_submit_job('job1_2')
self.server.log_match("queuejob hook executed", n=20,
max_attempts=25, interval=2)
self.server.expect(JOB, {'job_state': 'R',
'release_nodes_on_stageout': 'False',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# Deleting the job should not trigger automatic
# release of nodes because release_nodes_on_stageout=false is set
self.server.delete(jid)
# Verify mom_logs
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=5, interval=1,
existence=False)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=5, interval=1,
existence=False)
# Verify no change in remaining job resources.
self.server.expect(JOB, {'job_state': 'E',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy',
jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# Check for no existence of account update ('u') record
self.server.accounting_match(
msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
# Check for no existence of account next ('c') record
self.server.accounting_match(
msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
def test_hook2_release_nodes_on_stageout_true(self):
"""
Test:
Using a modifyjob hook to set
release_nodes_on_stageout=true.
When job is deleted and runs a
lengthy stageout process, only
the primary execution host's
vnodes are left assigned to the job.
"""
hook_body = """
import pbs
pbs.logmsg(pbs.LOG_DEBUG, "modifyjob hook executed")
pbs.event().job.release_nodes_on_stageout=True
"""
hook_event = "modifyjob"
hook_name = "mjob"
a = {'event': hook_event, 'enabled': 'true'}
self.server.create_import_hook(hook_name, a, hook_body)
jid = self.create_and_submit_job('job1_2')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# This triggers the modifyjob hook
self.server.alterjob(jid, {ATTR_N: "test"})
self.server.log_match("modifyjob hook executed", n=100,
max_attempts=25, interval=2)
self.server.expect(JOB, {'release_nodes_on_stageout': 'True'}, id=jid)
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Deleting the job will trigger the stageout process,
# at which time sister nodes are automatically released
# because release_nodes_on_stageout=true is set
self.server.delete(jid)
# Verify mom_logs
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostB), n=10,
max_attempts=18, interval=2, regexp=True)
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
max_attempts=18, interval=2, regexp=True)
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
# Verify remaining job resources.
self.server.expect(JOB, {'job_state': 'E',
'Resource_List.mem': '2gb',
'Resource_List.ncpus': 3,
'Resource_List.select': self.job1_newsel,
'Resource_List.place': self.job1_place,
'Resource_List.nodect': 1,
'schedselect': self.job1_newsel,
'exec_host': self.job1_new_exec_host,
'exec_vnode': self.job1_new_exec_vnode},
id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(3)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
self.n7, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
# Check account update ('u') record
self.match_accounting_log('u', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, self.job1_new_exec_host,
self.job1_new_exec_vnode_esc, "2097152kb",
3, 1, self.job1_place, self.job1_newsel)
def test_hook2_release_nodes_on_stageout_false(self):
"""
Test:
Using a modifyjob hook to set
release_nodes_on_stageout=False.
When job is deleted and runs a
lengthy stageout process, nothing
changes in job's vnodes assignment.
"""
hook_body = """
import pbs
pbs.logmsg(pbs.LOG_DEBUG, "modifyjob hook executed")
pbs.event().job.release_nodes_on_stageout=False
"""
hook_event = "modifyjob"
hook_name = "mjob"
a = {'event': hook_event, 'enabled': 'true'}
self.server.create_import_hook(hook_name, a, hook_body)
jid = self.create_and_submit_job('job1_2')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# This triggers the modifyjob hook
self.server.alterjob(jid, {ATTR_N: "test"})
self.server.log_match("modifyjob hook executed", n=100,
max_attempts=25, interval=2)
self.server.expect(JOB, {'release_nodes_on_stageout': 'False'}, id=jid)
# Deleting the job should not trigger automatic
# release of nodes because release_nodes_on_stageout=false is set
self.server.delete(jid)
# Verify mom_logs
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=5, interval=1,
existence=False)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=5, interval=1,
existence=False)
# Verify no change in remaining job resources.
self.server.expect(JOB, {'job_state': 'E',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy',
jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# Check for no existence of account update ('u') record
self.server.accounting_match(
msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
# Check for no existence of account next ('c') record
self.server.accounting_match(
msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
def test_release_nodes_error(self):
"""
Tests erroneous cases:
- pbs_release_nodes (no options given)
- pbs_release_nodes -j <jobid> (and nothing else)
- pbs_release_nodes -a (not run inside a job)
- pbs_release_nodes -j <unknown jobid> -a
- pbs_release_nodes -j <jobid> -a <node>
(both -a and listed nodes are given)
- pbs_release_nodes -j <jobid> -a run by a user other than
the job owner
- pbs_release_nodes -j <jobid> -a
and job is not in a running state.
Each case returns the appropriate error message.
"""
# Test no option given
cmd = [self.pbs_release_nodes_cmd]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER)
self.assertNotEqual(ret['rc'], 0)
self.assertTrue(ret['err'][0].startswith('usage:'))
# test only -j given
cmd = [self.pbs_release_nodes_cmd, '-j', '23']
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER)
self.assertNotEqual(ret['rc'], 0)
self.assertTrue(ret['err'][0].startswith('usage:'))
# test only -a given
cmd = [self.pbs_release_nodes_cmd, '-a']
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER)
self.assertNotEqual(ret['rc'], 0)
self.assertTrue(ret['err'][0].startswith(
'pbs_release_nodes: No jobid given'))
# Test specifying an unknown job id
cmd = [self.pbs_release_nodes_cmd, '-j', '300000', '-a']
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER)
self.assertNotEqual(ret['rc'], 0)
self.assertTrue(ret['err'][0].startswith(
'pbs_release_nodes: Unknown Job Id 300000'))
# Test having '-a' and vnode parameter given to pbs_release_nodes
a = {'Resource_List.select': '3:ncpus=1',
'Resource_List.place': 'scatter'}
jid = self.create_and_submit_job('job', a)
cmd = [self.pbs_release_nodes_cmd, '-j', jid, '-a', self.n4]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER)
self.assertNotEqual(ret['rc'], 0)
self.assertTrue(ret['err'][0].startswith('usage:'))
self.server.delete(jid)
# Test pbs_release_nodes' permission
jid = self.create_and_submit_job('job', a)
self.server.expect(JOB, {'job_state': 'R'}, id=jid)
# Run pbs_release_nodes as a user other than the job owner (TEST_USER)
cmd = [self.pbs_release_nodes_cmd, '-j', jid, '-a']
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER1)
self.assertNotEqual(ret['rc'], 0)
self.assertTrue(ret['err'][0].startswith(
'pbs_release_nodes: Unauthorized Request'))
self.server.delete(jid)
# Test pbs_release_nodes on a non-running job
a = {'Resource_List.select': '3:ncpus=1',
ATTR_h: None,
'Resource_List.place': 'scatter'}
jid = self.create_and_submit_job('job', a)
self.server.expect(JOB, {'job_state': 'H'}, id=jid)
# Run pbs_release_nodes
cmd = [self.pbs_release_nodes_cmd, '-j', jid, '-a']
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER)
self.assertNotEqual(ret['rc'], 0)
self.assertTrue(ret['err'][0].startswith(
'pbs_release_nodes: Request invalid for state of job'))
def test_release_ms_nodes(self):
"""
Test:
Given: a job that has been submitted with a select spec
of 2 super-chunks of ncpus=3 and mem=2gb each,
and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter", resulting in an
exec_vnode=
(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
Executing pbs_release_nodes -j <jobid> <vnode list>, where
<vnode list> includes a mother superior vnode, results in
the entire request getting rejected.
"""
jid = self.create_and_submit_job('job1')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# Run pbs_release_nodes
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n5, self.n6,
self.n1, self.n7]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER)
self.assertNotEqual(ret['rc'], 0)
self.assertTrue(ret['err'][0].startswith(
"pbs_release_nodes: " +
"Can't free '%s' since " % (self.n1,) +
"it's on a primary execution host"))
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy',
jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# Check for no existence of account update ('u') record
self.server.accounting_match(
msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
# Check for no existence of account next ('c') record
self.server.accounting_match(
msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
def test_release_not_assigned_nodes(self):
"""
Test:
Given: a job that has been submitted with a select spec
of 2 super-chunks of ncpus=3 and mem=2gb each,
and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter", resulting in an
exec_vnode=
(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
Executing:
pbs_release_nodes -j <jobid> <vnode list>
where <vnode list> includes a vnode that is not assigned to the
job, causes the entire request to get rejected.
Result:
Returns an error message and no nodes get released.
"""
jid = self.create_and_submit_job('job1')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Run pbs_release_nodes
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
self.n8, self.n6, self.n7]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER)
self.assertNotEqual(ret['rc'], 0)
self.assertTrue(ret['err'][0].startswith(
"pbs_release_nodes: node(s) requested " +
"to be released not " +
"part of the job: %s" % (self.n8,)))
# Ensure nothing has changed with the job.
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy',
jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# Check for no existence of account update ('u') record
self.server.accounting_match(
msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
# Check for no existence of account next ('c') record
self.server.accounting_match(
msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
def test_release_cray_nodes(self):
"""
Test:
Given: a job that has been submitted with a select spec
of 2 super-chunks of ncpus=3 and mem=2gb each,
and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter", resulting in an
exec_vnode=
(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
Executing:
pbs_release_nodes -j <jobid> <vnode list>
where one of the released vnodes is a Cray node,
Result:
Returns an error message and no nodes get released.
"""
jid = self.create_and_submit_job('job1')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Set hostC node to be of cray type
a = {'resources_available.vntype': 'cray_login'}
# set natural vnode of hostC
self.server.manager(MGR_CMD_SET, NODE, a, id=self.n7,
expect=True)
# Run pbs_release_nodes
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
self.n6, self.n7]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER)
self.assertNotEqual(ret['rc'], 0)
self.assertTrue(ret['err'][0].startswith(
"pbs_release_nodes: not currently supported " +
"on Cray X* series nodes: "
"%s" % (self.n7,)))
# Ensure nothing has changed with the job.
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy',
jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# Check for no existence of account update ('u') record
self.server.accounting_match(
msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
# Check for no existence of account next ('c') record
self.server.accounting_match(
msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
def test_release_cpuset_nodes(self):
"""
Test:
Given: a job that has been submitted with a select spec
of 2 super-chunks of ncpus=3 and mem=2gb each,
and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter", resulting in an
exec_vnode=
(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
Executing:
pbs_release_nodes -j <jobid> <n4> <n5> <n6> <n7>
where <n7> is a vnode whose resources are part of a cpuset,
Result:
Returns an error message and no nodes get released.
"""
jid = self.create_and_submit_job('job1')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Set hostC node to be of cpuset type
a = {'resources_available.arch': 'linux_cpuset'}
# set natural vnode of hostC
self.server.manager(MGR_CMD_SET, NODE, a, id=self.n7,
expect=True)
# Run pbs_release_nodes
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
self.n6, self.n7]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER)
self.assertNotEqual(ret['rc'], 0)
self.assertTrue(ret['err'][0].startswith(
"pbs_release_nodes: not currently supported on nodes whose " +
"resources are part of a cpuset: %s" % (self.n7,)))
# Ensure nothing has changed with the job.
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy',
jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# Check for no existence of account update ('u') record
self.server.accounting_match(
msg='.*u;' + jid + ".*exec_host=%s.*" % (self.job1_exec_host_esc,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
# Check for no existence of account next ('c') record
self.server.accounting_match(
msg='.*c;' + jid + ".*exec_host=%s.*" % (self.job1_new_exec_host,),
regexp=True, n=20, existence=False, max_attempts=5, interval=1)
def test_release_nodes_all(self):
"""
Test:
Given a job that specifies a select spec of
2 super-chunks of ncpus=3 and mem=2gb each,
and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter".
Calling
pbs_release_nodes -j <jobid> -a
will result in all the sister nodes getting
unassigned from the job.
"""
jid = self.create_and_submit_job('job1_2')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Run pbs_release_nodes as regular user
cmd = [self.pbs_release_nodes_cmd, '-j', jid, '-a']
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER)
self.assertEqual(ret['rc'], 0)
# Verify mom_logs
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostB), n=10,
max_attempts=18, interval=2, regexp=True)
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
max_attempts=18, interval=2, regexp=True)
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
# Verify remaining job resources.
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '2gb',
'Resource_List.ncpus': 3,
'Resource_List.select': self.job1_newsel,
'Resource_List.place': self.job1_place,
'Resource_List.nodect': 1,
'schedselect': self.job1_newsel,
'exec_host': self.job1_new_exec_host,
'exec_vnode': self.job1_new_exec_vnode},
id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(3)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
self.n7, self.n8, self.n9, self.n10], 'free')
self.server.expect(SERVER, {'resources_assigned.ncpus': 3,
'resources_assigned.mem': '2097152kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 3,
'resources_assigned.mem': '2097152kb'},
id="workq")
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
# Check account update ('u') record
self.match_accounting_log('u', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, self.job1_new_exec_host,
self.job1_new_exec_vnode_esc, "2097152kb",
3, 1, self.job1_place, self.job1_newsel)
def test_release_nodes_all_as_root(self):
"""
Test:
Same test as test_release_nodes_all except the pbs_release_nodes
call is executed by root. Result is the same.
"""
jid = self.create_and_submit_job('job1_2')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Run pbs_release_nodes as root
cmd = [self.pbs_release_nodes_cmd, '-j', jid, '-a']
ret = self.server.du.run_cmd(self.server.hostname, cmd,
sudo=True)
self.assertEqual(ret['rc'], 0)
# Verify mom_logs
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostB), n=10,
max_attempts=18, interval=2, regexp=True)
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
max_attempts=18, interval=2, regexp=True)
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
# Verify remaining job resources.
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '2gb',
'Resource_List.ncpus': 3,
'Resource_List.select': self.job1_newsel,
'Resource_List.place': self.job1_place,
'Resource_List.nodect': 1,
'schedselect': self.job1_newsel,
'exec_host': self.job1_new_exec_host,
'exec_vnode': self.job1_new_exec_vnode},
id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(3)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
self.n7, self.n8, self.n9, self.n10], 'free')
self.server.expect(SERVER, {'resources_assigned.ncpus': 3,
'resources_assigned.mem': '2097152kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 3,
'resources_assigned.mem': '2097152kb'},
id="workq")
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
# Check account update ('u') record
self.match_accounting_log('u', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, self.job1_new_exec_host,
self.job1_new_exec_vnode_esc, "2097152kb",
3, 1, self.job1_place, self.job1_newsel)
def test_release_nodes_all_inside_job(self):
"""
Test:
Like test_release_nodes_all except instead of calling
pbs_release_nodes from the command line, it is executed
inside the job script of a running job. Same results.
"""
# This one has a job script that calls 'pbs_release_nodes'
# (no jobid specified)
jid = self.create_and_submit_job('job1_3')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# wait for the job to execute pbs_release_nodes
time.sleep(10)
# Verify mom_logs
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostB), n=10,
max_attempts=18, interval=2, regexp=True)
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
max_attempts=18, interval=2, regexp=True)
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
# Verify remaining job resources.
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '2gb',
'Resource_List.ncpus': 3,
'Resource_List.select': self.job1_newsel,
'Resource_List.place': self.job1_place,
'Resource_List.nodect': 1,
'schedselect': self.job1_newsel,
'exec_host': self.job1_new_exec_host,
'exec_vnode': self.job1_new_exec_vnode},
id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(3)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
self.n7, self.n8, self.n9, self.n10], 'free')
self.server.expect(SERVER, {'resources_assigned.ncpus': 3,
'resources_assigned.mem': '2097152kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 3,
'resources_assigned.mem': '2097152kb'},
id="workq")
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
# Check account update ('u') record
self.match_accounting_log('u', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, self.job1_new_exec_host,
self.job1_new_exec_vnode_esc, "2097152kb",
3, 1, self.job1_place, self.job1_newsel)
def test_release_nodes1(self):
"""
Test:
Given: a job that has been submitted with a select spec
of 2 super-chunks of ncpus=3 and mem=2gb each,
and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter", resulting in an
exec_vnode=
(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
Executing pbs_release_nodes -j <jobid> <n4>
results in:
1. node <n4> no longer appearing in job's
exec_vnode value,
2. resources associated with the released node <n4>
are taken out of job's Resources_List.*,
schedselect values,
3. Since node <n4> is just one of the vnodes in the
host assigned to the second super-chunk, the node
still won't accept new jobs until all the other
allocated vnodes (<n5>, <n6>) from the same mom host are
released. The resources assigned to the job from
node <n4> continue to be assigned including
corresponding licenses.
NOTE: This is testing to make sure the position of <n4>
in the exec_vnode string (left end of a super-chunk) will
not break the recreation of the attribute value after
release.
"""
jid = self.create_and_submit_job('job1_5')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Run pbs_release_nodes as root
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
sudo=True)
self.assertEqual(ret['rc'], 0)
# Verify mom_logs
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostB), n=10,
regexp=True,
existence=False, max_attempts=5, interval=1)
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostC), n=10,
regexp=True,
existence=False, max_attempts=5, interval=1)
# momB's host will not get DELETE_JOB2 request since
# not all its vnodes have been released yet from the job.
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
# Verify remaining job resources.
newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=2+" + \
"1:ncpus=2:mem=2097152kb"
newsel_esc = newsel.replace("+", "\+")
new_exec_host = self.job1_exec_host
new_exec_host_esc = self.job1_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
new_exec_vnode = self.job1_exec_vnode.replace(
"%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
new_exec_vnode_esc = new_exec_vnode.replace(
"[", "\[").replace("]", "\]").replace(
"(", "\(").replace(")", "\)").replace("+", "\+")
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '5gb',
'Resource_List.ncpus': 7,
'Resource_List.select': newsel,
'Resource_List.place': self.job1_place,
'Resource_List.nodect': 3,
'schedselect': newsel,
'exec_host': self.job1_exec_host,
'exec_vnode': new_exec_vnode}, id=jid)
# Though the job is listed with ncpus=7, taking away released vnode
# <n4> (1 cpu), that vnode comes from a super-chunk where other vnodes
# <n5> and <n6> are still assigned to the job. So the parent mom of
# <n4> still won't release the job and thus, the 1 license for it is
# still allocated.
self.license_count_match(8)
# Check account update ('u') record
self.match_accounting_log('u', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, self.job1_exec_host_esc,
new_exec_vnode_esc, "5242880kb",
7, 3, self.job1_place, newsel_esc)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
'0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
'resources_assigned.mem': '6291456kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
'resources_assigned.mem': '6291456kb'},
id="workq")
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, new_exec_host))
self.server.delete(jid)
# Check account phased end ('e') record
self.match_accounting_log('e', jid, new_exec_host_esc,
new_exec_vnode_esc,
"5242880kb", 7, 3,
self.job1_place,
newsel_esc)
# Check to make sure 'E' (end of job) record got generated
self.match_accounting_log('E', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb",
8, 3, self.job1_place, self.job1_sel_esc)
def test_release_nodes1_as_user(self):
"""
Test:
Same as test_release_nodes1 except pbs_release_nodes
is executed by a regular user. Same results.
"""
jid = self.create_and_submit_job('job1_5')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Run pbs_release_nodes as regular user
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER)
self.assertEqual(ret['rc'], 0)
# Verify mom_logs
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostB), n=10,
regexp=True,
existence=False, max_attempts=5, interval=1)
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostC), n=10,
regexp=True,
existence=False, max_attempts=5, interval=1)
# momB and momC's hosts will not get DELETE_JOB2 request since
# not all their vnodes have been released yet from the job.
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
# Verify remaining job resources.
newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=2+" + \
"1:ncpus=2:mem=2097152kb"
newsel_esc = newsel.replace("+", "\+")
new_exec_host = self.job1_exec_host
new_exec_host_esc = self.job1_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
new_exec_vnode = self.job1_exec_vnode.replace(
"%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
new_exec_vnode_esc = new_exec_vnode.replace(
"[", "\[").replace("]", "\]").replace(
"(", "\(").replace(")", "\)").replace("+", "\+")
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '5gb',
'Resource_List.ncpus': 7,
'Resource_List.select': newsel,
'Resource_List.place': self.job1_place,
'Resource_List.nodect': 3,
'schedselect': newsel,
'exec_host': self.job1_exec_host,
'exec_vnode': new_exec_vnode}, id=jid)
# Though the job is listed with ncpus=7, taking away released vnode
# <n4> (1 cpu), that vnode comes from a super-chunk where other vnodes
# <n5> and <n6> are still assigned to the job. So the parent mom of
# <n4> still won't release the job and thus, the 1 license for it is
# still allocated.
self.license_count_match(8)
# Check account update ('u') record
self.match_accounting_log('u', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, self.job1_exec_host_esc,
new_exec_vnode_esc, "5242880kb",
7, 3, self.job1_place, newsel_esc)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
'0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
'resources_assigned.mem': '6291456kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
'resources_assigned.mem': '6291456kb'},
id="workq")
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, new_exec_host))
self.server.delete(jid)
# Check account phased end ('e') record
self.match_accounting_log('e', jid, new_exec_host_esc,
new_exec_vnode_esc,
"5242880kb", 7, 3,
self.job1_place,
newsel_esc)
# Check to make sure 'E' (end of job) record got generated
self.match_accounting_log('E', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb",
8, 3, self.job1_place, self.job1_sel_esc)
def test_release_nodes1_extra(self):
"""
Test:
Like test_release_nodes1 except instead of the super-chunk
and chunks getting only ncpus and mem values, additional
resources mpiprocs and ompthreads are also requested and
assigned:
For example:
qsub -l select="ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+
ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+
ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
We want to make sure the ompthreads and mpiprocs values are
preserved in the new exec_vnode, and that in the $PBS_NODEFILE,
the host names are duplicated according to the number of
mpiprocs. For example, if <hostA> is assigned to the first
chunk with mpiprocs=3, then <hostA> will appear 3 times in
$PBS_NODEFILE.
"""
jid = self.create_and_submit_job('job1_extra_res')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select':
self.job1_extra_res_select,
'Resource_List.place':
self.job1_extra_res_place,
'schedselect':
self.job1_extra_res_schedselect,
'exec_host':
self.job1_extra_res_exec_host,
'exec_vnode':
self.job1_extra_res_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# The pbs_nodefile_match_exec_host() function below takes care of
# verifying that the host names appear according to the number of
# mpiprocs assigned to the chunk.
self.assertTrue(
self.pbs_nodefile_match_exec_host(
jid, self.job1_extra_res_exec_host,
self.job1_extra_res_schedselect))
# Run pbs_release_nodes as root
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
sudo=True)
self.assertEqual(ret['rc'], 0)
# Verify mom_logs
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostB), n=10,
regexp=True,
existence=False, max_attempts=5, interval=1)
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostC), n=10,
regexp=True,
existence=False, max_attempts=5, interval=1)
# momB and momC's hosts will not get DELETE_JOB2 request since
# not all their vnodes have been released yet from the job.
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
# Verify remaining job resources.
sel_esc = self.job1_extra_res_select.replace("+", "\+")
exec_host_esc = self.job1_extra_res_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
exec_vnode_esc = \
self.job1_extra_res_exec_vnode.replace(
"[", "\[").replace(
"]", "\]").replace("(", "\(").replace(")", "\)").replace(
"+", "\+")
newsel = "1:mem=2097152kb:ncpus=3:mpiprocs=3:ompthreads=2+" + \
"1:mem=1048576kb:ncpus=2:mpiprocs=3:ompthreads=3+" + \
"1:ncpus=2:mem=2097152kb:mpiprocs=2:ompthreads=2"
newsel_esc = newsel.replace("+", "\+")
new_exec_host = self.job1_extra_res_exec_host
new_exec_host_esc = self.job1_extra_res_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
new_exec_vnode = self.job1_extra_res_exec_vnode.replace(
"%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
new_exec_vnode_esc = new_exec_vnode.replace("[", "\[").replace(
"]", "\]").replace(
"(", "\(").replace(")", "\)").replace("+", "\+")
self.server.expect(JOB,
{'job_state': 'R',
'Resource_List.mem': '5gb',
'Resource_List.ncpus': 7,
'Resource_List.select': newsel,
'Resource_List.place': self.job1_extra_res_place,
'Resource_List.nodect': 3,
'schedselect': newsel,
'exec_host': new_exec_host,
'exec_vnode': new_exec_vnode}, id=jid)
# Though the job is listed with ncpus=7, taking away released vnode
# <n4> (1 cpu), that vnode comes from a super-chunk where other vnodes
# <n5> and <n6> are still assigned to the job. So the parent mom of
# <n4> still won't release the job and thus, the 1 license for it is
# still allocated.
self.license_count_match(8)
# Check account update ('u') record
self.match_accounting_log('u', jid, exec_host_esc,
exec_vnode_esc, "6gb", 8, 3,
self.job1_extra_res_place,
sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, new_exec_host_esc,
new_exec_vnode_esc, "5242880kb",
7, 3, self.job1_extra_res_place, newsel_esc)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
'0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
'resources_assigned.mem': '6291456kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
'resources_assigned.mem': '6291456kb'},
id="workq")
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, new_exec_host, newsel))
self.server.delete(jid)
# Check account phased end ('e') record
self.match_accounting_log('e', jid, new_exec_host_esc,
new_exec_vnode_esc,
"5242880kb", 7, 3,
self.job1_extra_res_place,
newsel_esc)
# Check to make sure 'E' (end of job) record got generated
self.match_accounting_log('E', jid, exec_host_esc,
exec_vnode_esc, "6gb",
8, 3, self.job1_extra_res_place,
sel_esc)
@timeout(400)
def test_release_nodes2(self):
"""
Test:
Given: a job that has been submitted with a select spec
of 2 super-chunks of ncpus=3 and mem=2gb each,
and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter", resulting in an
exec_vnode=
(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
Executing pbs_release_nodes -j <jobid> <n5>
results in:
1. node <n5> no longer appearing in job's
exec_vnode value,
2. resources associated with the released node <n5>
are taken out of job's Resources_List.*,
schedselect values,
3. Since node <n5> is just one of the vnodes in the
host assigned to the second super-chunk, the node
still won't accept new jobs until all the other
allocated vnodes (<n4>, <n6>) from the same mom host are
released. The resources assigned to the job from
node <n5> continue to be assigned including
corresponding licenses.
NOTE: This is testing to make sure the position of <n5>
in the exec_vnode string (middle of a super-chunk) will
not break the recreation of the attribute value after
release.
"""
jid = self.create_and_submit_job('job1_5')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Run pbs_release_nodes as root
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n5]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
sudo=True)
self.assertEqual(ret['rc'], 0)
# Verify mom_logs
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostB), n=10,
regexp=True,
existence=False, max_attempts=5, interval=1)
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostC), n=10,
regexp=True,
existence=False, max_attempts=5, interval=1)
# momB and momC's hosts will not get DELETE_JOB2 request since
# not all their vnodes have been released yet from the job.
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
# Verify remaining job resources.
exec_host_esc = self.job1_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
exec_vnode_esc = self.job1_exec_vnode.replace("[", "\[").replace(
"]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=2+" + \
"1:ncpus=2:mem=2097152kb"
newsel_esc = newsel.replace("+", "\+")
new_exec_host = self.job1_exec_host
new_exec_host_esc = self.job1_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
new_exec_vnode = self.job1_exec_vnode.replace(
"%s:mem=1048576kb:ncpus=1+" % (self.n5,), "")
new_exec_vnode_esc = new_exec_vnode.replace(
"[", "\[").replace("]", "\]").replace(
"(", "\(").replace(")", "\)").replace("+", "\+")
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '5gb',
'Resource_List.ncpus': 7,
'Resource_List.select': newsel,
'Resource_List.place': self.job1_place,
'Resource_List.nodect': 3,
'schedselect': newsel,
'exec_host': new_exec_host,
'exec_vnode': new_exec_vnode}, id=jid)
# Though the job is listed with ncpus=7, taking away released vnode
# <n5> (1 cpu), that vnode comes from a super-chunk where other vnodes
# <n4> and <n6> are still assigned to the job. So the parent mom of
# <n5> still won't release the job and thus, the 1 license for it is
# still allocated.
self.license_count_match(8)
# Check account update ('u') record
self.match_accounting_log('u', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, self.job1_exec_host_esc,
new_exec_vnode_esc, "5242880kb",
7, 3, self.job1_place, newsel_esc)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
'0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
'resources_assigned.mem': '6291456kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
'resources_assigned.mem': '6291456kb'},
id="workq")
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, new_exec_host))
self.server.delete(jid)
# Check account phased end ('e') record
self.match_accounting_log('e', jid, new_exec_host_esc,
new_exec_vnode_esc,
"5242880kb", 7, 3,
self.job1_place,
newsel_esc)
# Check to make sure 'E' (end of job) record got generated
self.match_accounting_log('E', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb",
8, 3, self.job1_place, self.job1_sel_esc)
def test_release_nodes2_extra(self):
"""
Test:
Like test_release_nodes2 except instead of the super-chunk
and chunks getting only ncpus and mem values, additional
resources mpiprocs and ompthreads are also requested and
assigned:
For example:
qsub -l select="ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+
ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+
ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
We want to make sure the ompthreads and mpiprocs values are
preserved in the new exec_vnode, and that in the $PBS_NODEFILE,
the host names are duplicated according to the number of
mpiprocs. For example, if <hostA> is assigned to the first
chunk with mpiprocs=3, then <hostA> will appear 3 times in
$PBS_NODEFILE.
"""
jid = self.create_and_submit_job('job1_extra_res')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select':
self.job1_extra_res_select,
'Resource_List.place':
self.job1_extra_res_place,
'schedselect':
self.job1_extra_res_schedselect,
'exec_host':
self.job1_extra_res_exec_host,
'exec_vnode':
self.job1_extra_res_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# The pbs_nodefile_match_exec_host() function below takes care of
# verifying that the host names appear according to the number of
# mpiprocs assigned to the chunk.
self.assertTrue(
self.pbs_nodefile_match_exec_host(
jid, self.job1_extra_res_exec_host,
self.job1_extra_res_schedselect))
# Run pbs_release_nodes as root
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n5]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
sudo=True)
self.assertEqual(ret['rc'], 0)
# Verify mom_logs
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostB), n=10,
regexp=True,
existence=False, max_attempts=5, interval=1)
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostC), n=10,
regexp=True,
existence=False, max_attempts=5, interval=1)
# momB and momC's hosts will not get DELETE_JOB2 request since
# not all their vnodes have been released yet from the job.
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
# Verify remaining job resources.
sel_esc = self.job1_extra_res_select.replace("+", "\+")
exec_host_esc = self.job1_extra_res_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
exec_vnode_esc = self.job1_extra_res_exec_vnode.replace(
"[", "\[").replace("]", "\]").replace(
"(", "\(").replace(")", "\)").replace("+", "\+")
newsel = "1:mem=2097152kb:ncpus=3:mpiprocs=3:ompthreads=2+" + \
"1:mem=1048576kb:ncpus=2:mpiprocs=3:ompthreads=3+" + \
"1:ncpus=2:mem=2097152kb:mpiprocs=2:ompthreads=2"
newsel_esc = newsel.replace("+", "\+")
new_exec_host = self.job1_extra_res_exec_host
new_exec_host_esc = self.job1_extra_res_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
new_exec_vnode = self.job1_extra_res_exec_vnode.replace(
"%s:mem=1048576kb:ncpus=1+" % (self.n5,), "")
new_exec_vnode_esc = new_exec_vnode.replace(
"[", "\[").replace("]", "\]").replace(
"(", "\(").replace(")", "\)").replace("+", "\+")
self.server.expect(JOB,
{'job_state': 'R',
'Resource_List.mem': '5gb',
'Resource_List.ncpus': 7,
'Resource_List.select': newsel,
'Resource_List.place': self.job1_extra_res_place,
'Resource_List.nodect': 3,
'schedselect': newsel,
'exec_host': new_exec_host,
'exec_vnode': new_exec_vnode}, id=jid)
# Though the job is listed with ncpus=7, taking away released vnode
# <n5> (1 cpu), that vnode comes from a super-chunk where other vnodes
# <n4> and <n6> are still assigned to the job. So the parent mom of
# <n5> still won't release the job and thus, the 1 license for it is
# still allocated.
self.license_count_match(8)
# Check account update ('u') record
self.match_accounting_log('u', jid, exec_host_esc,
exec_vnode_esc, "6gb", 8, 3,
self.job1_extra_res_place,
sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, new_exec_host_esc,
new_exec_vnode_esc, "5242880kb",
7, 3, self.job1_extra_res_place, newsel_esc)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
'0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
'resources_assigned.mem': '6291456kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
'resources_assigned.mem': '6291456kb'},
id="workq")
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, new_exec_host, newsel))
self.server.delete(jid)
# Check account phased end ('e') record
self.match_accounting_log('e', jid, new_exec_host_esc,
new_exec_vnode_esc,
"5242880kb", 7, 3,
self.job1_extra_res_place,
newsel_esc)
# Check to make sure 'E' (end of job) record got generated
self.match_accounting_log('E', jid, exec_host_esc,
exec_vnode_esc, "6gb",
8, 3, self.job1_extra_res_place,
sel_esc)
@timeout(400)
def test_release_nodes3(self):
"""
Test:
Given: a job that has been submitted with a select spec
of 2 super-chunks of ncpus=3 and mem=2gb each,
and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter", resulting in an
exec_vnode=
(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
Executing pbs_release_nodes -j <jobid> <n6>
results in:
1. node <n6> no longer appearing in job's
exec_vnode value,
2. resources associated with the released node <n6>
are taken out of job's Resources_List.*,
schedselect values,
3. Since node <n6> is just one of the vnodes in the
host assigned to the second super-chunk, the node
still won't accept new jobs until all the other
allocated vnodes (<n4>, <n5>) from the same mom host are
released. The resources assigned to the job from
node <n6> continue to be assigned including
corresponding licenses.
NOTE: This is testing to make sure the position of <n6>
in the exec_vnode string (right end of a super-chunk) will
not break the recreation of the attribute value after
release.
"""
jid = self.create_and_submit_job('job1_5')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Run pbs_release_nodes as root
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n6]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
sudo=True)
self.assertEqual(ret['rc'], 0)
# Verify mom_logs
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostB), n=10,
regexp=True,
existence=False, max_attempts=5, interval=1)
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostC), n=10,
regexp=True,
existence=False, max_attempts=5, interval=1)
# momB and momC's hosts will not get DELETE_JOB2 request since
# not all their vnodes have been released yet from the job.
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
# Verify remaining job resources.
exec_host_esc = self.job1_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
exec_vnode_esc = self.job1_exec_vnode.replace("[", "\[").replace(
"]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
newsel = "1:mem=2097152kb:ncpus=3+1:mem=2097152kb:ncpus=2+" + \
"1:ncpus=2:mem=2097152kb"
newsel_esc = newsel.replace("+", "\+")
new_exec_host = self.job1_exec_host
new_exec_host_esc = self.job1_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace(
"+", "\+")
new_exec_vnode = self.job1_exec_vnode.replace(
"+%s:ncpus=1" % (self.n6,), "")
new_exec_vnode_esc = new_exec_vnode.replace("[", "\[").replace(
"]", "\]").replace(
"(", "\(").replace(")", "\)").replace("+", "\+")
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 7,
'Resource_List.select': newsel,
'Resource_List.place': self.job1_place,
'Resource_List.nodect': 3,
'schedselect': newsel,
'exec_host': new_exec_host,
'exec_vnode': new_exec_vnode}, id=jid)
# Though the job is listed with ncpus=7, taking away released vnode
# <n6> (1 cpu), that vnode comes from a super-chunk where other vnodes
# <n4> and <n5> are still assigned to the job. So the parent mom of
# <n6> still won't release the job and thus, the 1 license for it is
# still allocated.
self.license_count_match(8)
# Check account update ('u') record
self.match_accounting_log('u', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, self.job1_exec_host_esc,
new_exec_vnode_esc, "6291456kb",
7, 3, self.job1_place, newsel_esc)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
'0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
'resources_assigned.mem': '6291456kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
'resources_assigned.mem': '6291456kb'},
id="workq")
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, new_exec_host))
self.server.delete(jid)
# Check account phased end ('e') record
self.match_accounting_log('e', jid, new_exec_host_esc,
new_exec_vnode_esc,
"6291456kb", 7, 3,
self.job1_place,
newsel_esc)
# Check to make sure 'E' (end of job) record got generated
self.match_accounting_log('E', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb",
8, 3, self.job1_place, self.job1_sel_esc)
@timeout(400)
def test_release_nodes3_extra(self):
"""
Test:
Like test_release_nodes3 except instead of the super-chunk
and chunks getting only ncpus and mem values, additional
resources mpiprocs and ompthreads are also requested and
assigned:
For example:
qsub -l select="ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+
ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+
ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
We want to make sure the ompthreads and mpiprocs values are
preserved in the new exec_vnode, and that in the $PBS_NODEFILE,
the host names are duplicated according to the number of
mpiprocs. For example, if <hostA> is assigned to the first
chunk with mpiprocs=3, then <hostA> will appear 3 times in
$PBS_NODEFILE.
"""
jid = self.create_and_submit_job('job1_extra_res')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select':
self.job1_extra_res_select,
'Resource_List.place':
self.job1_extra_res_place,
'schedselect':
self.job1_extra_res_schedselect,
'exec_host':
self.job1_extra_res_exec_host,
'exec_vnode':
self.job1_extra_res_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# The pbs_nodefile_match_exec_host() function below takes care of
# verifying that the host names appear according to the number of
# mpiprocs assigned to the chunk.
self.assertTrue(
self.pbs_nodefile_match_exec_host(
jid, self.job1_extra_res_exec_host,
self.job1_extra_res_schedselect))
# Run pbs_release_nodes as root
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n6]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
sudo=True)
self.assertEqual(ret['rc'], 0)
# Verify mom_logs
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostB), n=10,
regexp=True,
existence=False, max_attempts=5, interval=1)
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostC), n=10,
regexp=True,
existence=False, max_attempts=5, interval=1)
# momB and momC's hosts will not get DELETE_JOB2 request since
# not all their vnodes have been released yet from the job.
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
# Verify remaining job resources.
sel_esc = self.job1_extra_res_select.replace("+", "\+")
exec_host_esc = self.job1_extra_res_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
exec_vnode_esc = self.job1_extra_res_exec_vnode.replace(
"[", "\[").replace(
"]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
newsel = "1:mem=2097152kb:ncpus=3:mpiprocs=3:ompthreads=2+" + \
"1:mem=2097152kb:ncpus=2:mpiprocs=3:ompthreads=3+" + \
"1:ncpus=2:mem=2097152kb:mpiprocs=2:ompthreads=2"
newsel_esc = newsel.replace("+", "\+")
new_exec_host = self.job1_extra_res_exec_host
new_exec_host_esc = self.job1_extra_res_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
new_exec_vnode = self.job1_extra_res_exec_vnode.replace(
"+%s:ncpus=1" % (self.n6,), "")
new_exec_vnode_esc = new_exec_vnode.replace(
"[", "\[").replace("]", "\]").replace(
"(", "\(").replace(")", "\)").replace("+", "\+")
self.server.expect(JOB,
{'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 7,
'Resource_List.select': newsel,
'Resource_List.place':
self.job1_extra_res_place,
'Resource_List.nodect': 3,
'schedselect': newsel,
'exec_host': new_exec_host,
'exec_vnode': new_exec_vnode}, id=jid)
# Though the job is listed with ncpus=7, taking away released vnode
# <n6> (1 cpu), that vnode comes from a super-chunk where other vnodes
# <n4> and <n5> are still assigned to the job. So the parent mom of
# <n6> still won't release the job and thus, the 1 license for it is
# still allocated.
self.license_count_match(8)
# Check account update ('u') record
self.match_accounting_log('u', jid, exec_host_esc,
exec_vnode_esc, "6gb", 8, 3,
self.job1_extra_res_place,
sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, new_exec_host_esc,
new_exec_vnode_esc, "6291456kb",
7, 3, self.job1_extra_res_place, newsel_esc)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
'0kb')
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.server.expect(SERVER, {'resources_assigned.ncpus': 8,
'resources_assigned.mem': '6291456kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 8,
'resources_assigned.mem': '6291456kb'},
id="workq")
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, new_exec_host, newsel))
self.server.delete(jid)
# Check account phased end ('e') record
self.match_accounting_log('e', jid, new_exec_host_esc,
new_exec_vnode_esc,
"6291456kb", 7, 3,
self.job1_extra_res_place,
newsel_esc)
# Check to make sure 'E' (end of job) record got generated
self.match_accounting_log('E', jid, exec_host_esc,
exec_vnode_esc, "6gb",
8, 3, self.job1_extra_res_place,
sel_esc)
def test_release_nodes4(self):
"""
Test:
Given: a job that has been submitted with a select spec
of 2 super-chunks of ncpus=3 and mem=2gb each,
and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter", resulting in an
exec_vnode=
(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
Executing pbs_release_nodes -j <jobid> <n4> <n5> <n7>
results in:
1. nodes <n4>, <n5>, and <n7> no longer appearing in
job's exec_vnode value,
2. resources associated with the released
nodes are taken out of job's Resources_List.*,
schedselect values,
3. Since nodes <n4> and <n5> are some of the vnodes in the
host assigned to the second super-chunk, the node
still won't accept new jobs until all the other
allocated vnodes (<n6>) from the same mom host are
released.
4. The resources assigned to the job from
nodes <n4> and <n5> continue to be assigned including
corresponding licenses.
5. <n7> is the only vnode assigned to the host mapped
to the third chunk so it's fully deallocated and
its assigned resources are removed from the job.
"""
jid = self.create_and_submit_job('job1_5')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Run pbs_release_nodes as root
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
self.n7]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
sudo=True)
self.assertEqual(ret['rc'], 0)
# The mother superior will not report a cput/mem job summary for
# momB's host, but it will report one for momC's host since all vnodes
# from that host have been released.
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostB), n=10, regexp=True, existence=False,
max_attempts=5, interval=1)
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostC), n=10, regexp=True)
# momB's host will not get DELETE_JOB2 request since
# not all of its vnodes have been released yet from the job.
# momC's host will get DELETE_JOB2 request since its sole vnode
# has been released from the job.
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
# Ensure the 'fib' process is gone on hostC when DELETE_JOB request
# is received
self.server.pu.get_proc_info(
self.momC.hostname, ".*fib.*", None, regexp=True)
self.assertEqual(len(self.server.pu.processes), 0)
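# (get_proc_info() populates self.server.pu.processes with the matching
# entries; an empty result means no leftover 'fib' process from the job
# is still running on hostC.)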
# Verify remaining job resources.
sel_esc = self.job1_select.replace("+", "\+")
exec_host_esc = self.job1_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
exec_vnode_esc = self.job1_exec_vnode.replace("[", "\[").replace(
"]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
newsel = "1:mem=2097152kb:ncpus=3+1:ncpus=1"
newsel_esc = newsel.replace("+", "\+")
new_exec_host = self.job1_exec_host.replace(
"+%s/0*2" % (self.n7,), "")
new_exec_host_esc = new_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
new_exec_vnode = self.job1_exec_vnode.replace(
"%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
new_exec_vnode = new_exec_vnode.replace(
"%s:mem=1048576kb:ncpus=1+" % (self.n5,), "")
new_exec_vnode = new_exec_vnode.replace(
"+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
new_exec_vnode_esc = new_exec_vnode.replace(
"[", "\[").replace("]", "\]").replace(
"(", "\(").replace(")", "\)").replace("+", "\+")
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '2gb',
'Resource_List.ncpus': 4,
'Resource_List.select': newsel,
'Resource_List.place': self.job1_place,
'Resource_List.nodect': 2,
'schedselect': newsel,
'exec_host': new_exec_host,
'exec_vnode': new_exec_vnode}, id=jid)
# Though the job is now listed with ncpus=4, of the released vnodes
# <n4> (1 cpu), <n5> (1 cpu), and <n7> (2 cpus), only <n7>'s cpus got
# released. <n4> and <n5> are part of a super-chunk that wasn't fully
# released, so their licenses are still held.
self.license_count_match(6)
# Check account update ('u') record
self.match_accounting_log('u', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, new_exec_host_esc,
new_exec_vnode_esc, "2097152kb",
4, 2, self.job1_place, newsel_esc)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
'0kb')
self.match_vnode_status([self.n0, self.n7, self.n8, self.n9, self.n10],
'free')
self.server.expect(SERVER, {'resources_assigned.ncpus': 6,
'resources_assigned.mem': '4194304kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 6,
'resources_assigned.mem': '4194304kb'},
id="workq")
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, new_exec_host))
self.server.delete(jid)
# Check account phased end ('e') record
self.match_accounting_log('e', jid, new_exec_host_esc,
new_exec_vnode_esc,
"2097152kb", 4, 2,
self.job1_place,
newsel_esc)
# Check to make sure 'E' (end of job) record got generated
self.match_accounting_log('E', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb",
8, 3, self.job1_place, self.job1_sel_esc)
def test_release_nodes4_extra(self):
"""
Test:
Like test_release_nodes4 except instead of the super-chunk
and chunks getting only ncpus and mem values, additional
resources mpiprocs and ompthreads are also requested and
assigned:
For example:
qsub -l select="ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+
ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+
ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
We want to make sure the ompthreads and mpiprocs values are
preserved in the new exec_vnode, and that in the $PBS_NODEFILE,
the host names are duplicated according to the number of
mpiprocs. For example, if <hostA> is assigned to the first
chunk with mpiprocs=3, <hostA> will appear 3 times in
$PBS_NODEFILE.
"""
jid = self.create_and_submit_job('job1_extra_res')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select':
self.job1_extra_res_select,
'Resource_List.place':
self.job1_extra_res_place,
'schedselect':
self.job1_extra_res_schedselect,
'exec_host':
self.job1_extra_res_exec_host,
'exec_vnode':
self.job1_extra_res_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# pbs_nodefile_match_exec_host() also verifies that each host name
# appears in PBS_NODEFILE according to the number of mpiprocs
# assigned to its chunk.
self.assertTrue(
self.pbs_nodefile_match_exec_host(
jid, self.job1_extra_res_exec_host,
self.job1_extra_res_schedselect))
# Run pbs_release_nodes as root
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
self.n7]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
sudo=True)
self.assertEqual(ret['rc'], 0)
# momB's host will not get job summary reported but
# momC's host will get the job summary since all vnodes
# from the host have been released.
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostB), n=10, regexp=True, existence=False,
max_attempts=5, interval=1)
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostC), n=10, regexp=True)
# momB's host will not get DELETE_JOB2 request since
# not all of its vnodes have been released yet from the job.
# momC will get DELETE_JOB2 request since its sole vnode
# has been released from the job.
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
# Ensure the 'fib' process is gone from hostC when the DELETE_JOB
# request is received
self.server.pu.get_proc_info(
self.momC.hostname, ".*fib.*", None, regexp=True)
self.assertEqual(len(self.server.pu.processes), 0)
# Verify remaining job resources.
sel_esc = self.job1_extra_res_select.replace("+", "\+")
exec_host_esc = self.job1_extra_res_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
exec_vnode_esc = self.job1_extra_res_exec_vnode.replace(
"[", "\[").replace(
"]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
newsel = "1:mem=2097152kb:ncpus=3:mpiprocs=3:ompthreads=2+" + \
"1:ncpus=1:mpiprocs=3:ompthreads=3"
newsel_esc = newsel.replace("+", "\+")
new_exec_host = self.job1_extra_res_exec_host.replace(
"+%s/0*2" % (self.n7,), "")
new_exec_host_esc = new_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
new_exec_vnode = self.job1_extra_res_exec_vnode.replace(
"%s:mem=1048576kb:ncpus=1+" % (self.n4,), "")
new_exec_vnode = new_exec_vnode.replace(
"%s:mem=1048576kb:ncpus=1+" % (self.n5,), "")
new_exec_vnode = new_exec_vnode.replace(
"+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
new_exec_vnode_esc = new_exec_vnode.replace("[", "\[").replace(
"]", "\]").replace(
"(", "\(").replace(")", "\)").replace("+", "\+")
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '2gb',
'Resource_List.ncpus': 4,
'Resource_List.select': newsel,
'Resource_List.place':
self.job1_extra_res_place,
'Resource_List.nodect': 2,
'schedselect': newsel,
'exec_host': new_exec_host,
'exec_vnode': new_exec_vnode}, id=jid)
# Though the job is now listed with ncpus=4, of the released vnodes
# <n4> (1 cpu), <n5> (1 cpu), and <n7> (2 cpus), only <n7>'s cpus got
# released. <n4> and <n5> are part of a super-chunk that wasn't fully
# released, so their licenses are still held.
self.license_count_match(6)
# Check account update ('u') record
self.match_accounting_log('u', jid, exec_host_esc,
exec_vnode_esc, "6gb", 8, 3,
self.job1_extra_res_place,
sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, new_exec_host_esc,
new_exec_vnode_esc, "2097152kb",
4, 2, self.job1_extra_res_place, newsel_esc)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
'0kb')
self.match_vnode_status([self.n0, self.n7, self.n8, self.n9, self.n10],
'free')
self.server.expect(SERVER, {'resources_assigned.ncpus': 6,
'resources_assigned.mem': '4194304kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 6,
'resources_assigned.mem': '4194304kb'},
id="workq")
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, new_exec_host, newsel))
self.server.delete(jid)
# Check account phased end ('e') record
self.match_accounting_log('e', jid, new_exec_host_esc,
new_exec_vnode_esc,
"2097152kb", 4, 2,
self.job1_extra_res_place,
newsel_esc)
# Check to make sure 'E' (end of job) record got generated
self.match_accounting_log('E', jid, exec_host_esc,
exec_vnode_esc, "6gb",
8, 3, self.job1_extra_res_place,
sel_esc)
def test_release_nodes5(self):
"""
Test:
Given: a job that has been submitted with a select spec
of 2 super-chunks of ncpus=3 and mem=2gb each,
and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter", resulting in an
exec_vnode=
(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
Executing pbs_release_nodes -j <job-id> <n5> <n6> <n7>
results in:
1. nodes <n5>, <n6>, and <n7> no longer appear in
job's exec_vnode value,
2. resources associated with the released
nodes are taken out of job's Resources_List.*,
schedselect values,
3. Since nodes <n5> and <n6> are some of the vnodes in the
host assigned to the second super-chunk, the node
still won't accept new jobs until all the other
allocated vnodes (<n4>) from the same mom host are
released.
4. The resources then assigned to the job from
nodes <n5> and <n6> continue to be assigned, including
corresponding licenses.
5. <n7> is the only vnode assigned to the host mapped
to the third chunk so it's fully deallocated and
its assigned resources are removed from the job.
"""
jid = self.create_and_submit_job('job1_5')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Run pbs_release_nodes as root
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n5, self.n6,
self.n7]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
sudo=True)
self.assertEqual(ret['rc'], 0)
# momB's host will not get job summary reported but
# momC's host will get the job summary since all vnodes
# from the host have been released.
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostB), n=10, regexp=True, existence=False,
max_attempts=5, interval=1)
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostC), n=10, regexp=True)
# momB's host will not get DELETE_JOB2 request since
# not all of its vnodes have been released yet from the job.
# momC will get DELETE_JOB2 request since its sole vnode
# has been released from the job.
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
# Ensure the 'fib' process is gone from hostC when the DELETE_JOB
# request is received
self.server.pu.get_proc_info(
self.momC.hostname, ".*fib.*", None, regexp=True)
self.assertEqual(len(self.server.pu.processes), 0)
# Verify remaining job resources.
sel_esc = self.job1_select.replace("+", "\+")
exec_host_esc = self.job1_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
exec_vnode_esc = self.job1_exec_vnode.replace("[", "\[").replace(
"]", "\]").replace("(", "\(").replace(")", "\)").replace("+", "\+")
newsel = "1:mem=2097152kb:ncpus=3+1:mem=1048576kb:ncpus=1"
newsel_esc = newsel.replace("+", "\+")
new_exec_host = self.job1_exec_host.replace(
"+%s/0*2" % (self.n7,), "")
new_exec_host_esc = new_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
new_exec_vnode = self.job1_exec_vnode.replace(
"+%s:mem=1048576kb:ncpus=1" % (self.n5,), "")
new_exec_vnode = new_exec_vnode.replace(
"+%s:ncpus=1" % (self.n6,), "")
new_exec_vnode = new_exec_vnode.replace(
"+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
new_exec_vnode_esc = \
new_exec_vnode.replace("[", "\[").replace("]", "\]").replace(
"(", "\(").replace(")", "\)").replace("+", "\+")
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '3gb',
'Resource_List.ncpus': 4,
'Resource_List.select': newsel,
'Resource_List.place': self.job1_place,
'Resource_List.nodect': 2,
'schedselect': newsel,
'exec_host': new_exec_host,
'exec_vnode': new_exec_vnode}, id=jid)
# Though the job is now listed with ncpus=4, of the released vnodes
# <n5> (1 cpu), <n6> (1 cpu), and <n7> (2 cpus), only <n7>'s cpus got
# released. <n5> and <n6> are part of a super-chunk that wasn't fully
# released, so their licenses are still held.
self.license_count_match(6)
# Check account update ('u') record
self.match_accounting_log('u', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, new_exec_host_esc,
new_exec_vnode_esc, "3145728kb",
4, 2, self.job1_place, newsel_esc)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
# still job-busy
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
# still job-busy
self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
'0kb')
# now free
self.match_vnode_status([self.n0, self.n7, self.n8, self.n9, self.n10],
'free')
self.server.expect(SERVER, {'resources_assigned.ncpus': 6,
'resources_assigned.mem': '4194304kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 6,
'resources_assigned.mem': '4194304kb'},
id="workq")
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, new_exec_host))
self.server.delete(jid)
# Check account phased end ('e') record
self.match_accounting_log('e', jid, new_exec_host_esc,
new_exec_vnode_esc,
"3145728kb", 4, 2,
self.job1_place,
newsel_esc)
# Check to make sure 'E' (end of job) record got generated
self.match_accounting_log('E', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb",
8, 3, self.job1_place, self.job1_sel_esc)
def test_release_nodes5_extra(self):
"""
Test:
Like test_release_nodes5 except instead of the super-chunk
and chunks getting only ncpus and mem values, additional
resources mpiprocs and ompthreads are also requested and
assigned:
For example:
qsub -l select="ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+
ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+
ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
We want to make sure the ompthreads and mpiprocs values are
preserved in the new exec_vnode, and that in the $PBS_NODEFILE,
the host names are duplicated according to the number of
mpiprocs. For example, if <hostA> is assigned to the first
chunk with mpiprocs=3, <hostA> will appear 3 times in
$PBS_NODEFILE.
"""
jid = self.create_and_submit_job('job1_extra_res')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select':
self.job1_extra_res_select,
'Resource_List.place':
self.job1_extra_res_place,
'schedselect':
self.job1_extra_res_schedselect,
'exec_host':
self.job1_extra_res_exec_host,
'exec_vnode':
self.job1_extra_res_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
# pbs_nodefile_match_exec_host() also verifies that each host name
# appears in PBS_NODEFILE according to the number of mpiprocs
# assigned to its chunk.
self.assertTrue(
self.pbs_nodefile_match_exec_host(
jid, self.job1_extra_res_exec_host,
self.job1_extra_res_schedselect))
# Run pbs_release_nodes as root
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n5, self.n6,
self.n7]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
sudo=True)
self.assertEqual(ret['rc'], 0)
# momB's host will not get job summary reported but
# momC's host will get the job summary since all vnodes
# from the host have been released.
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostB), n=10, regexp=True, existence=False,
max_attempts=5, interval=1)
self.momA.log_match("Job;%s;%s.+cput=.+ mem=.+" % (
jid, self.hostC), n=10, regexp=True)
# momB's host will not get DELETE_JOB2 request since
# not all of its vnodes have been released yet from the job.
# momC will get DELETE_JOB2 request since its sole vnode
# has been released from the job.
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
existence=False, max_attempts=5, interval=1)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20)
# Ensure the 'fib' process is gone from hostC when the DELETE_JOB
# request is received
self.server.pu.get_proc_info(
self.momC.hostname, ".*fib.*", None, regexp=True)
self.assertEqual(len(self.server.pu.processes), 0)
# Verify remaining job resources.
sel_esc = self.job1_extra_res_select.replace("+", "\+")
exec_host_esc = self.job1_extra_res_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
exec_vnode_esc = \
self.job1_extra_res_exec_vnode.replace("[", "\[").replace(
"]", "\]").replace("(", "\(").replace(")", "\)").replace(
"+", "\+")
newsel = \
"1:mem=2097152kb:ncpus=3:mpiprocs=3:ompthreads=2+" + \
"1:mem=1048576kb:ncpus=1:mpiprocs=3:ompthreads=3"
newsel_esc = newsel.replace("+", "\+")
new_exec_host = self.job1_extra_res_exec_host.replace(
"+%s/0*2" % (self.n7,), "")
new_exec_host_esc = new_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
new_exec_vnode = self.job1_extra_res_exec_vnode.replace(
"+%s:mem=1048576kb:ncpus=1" % (self.n5,), "")
new_exec_vnode = new_exec_vnode.replace(
"+%s:ncpus=1" % (self.n6,), "")
new_exec_vnode = new_exec_vnode.replace(
"+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
new_exec_vnode_esc = \
new_exec_vnode.replace("[", "\[").replace("]", "\]").replace(
"(", "\(").replace(")", "\)").replace("+", "\+")
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '3gb',
'Resource_List.ncpus': 4,
'Resource_List.select': newsel,
'Resource_List.place':
self.job1_extra_res_place,
'Resource_List.nodect': 2,
'schedselect': newsel,
'exec_host': new_exec_host,
'exec_vnode': new_exec_vnode}, id=jid)
# Though the job is now listed with ncpus=4, of the released vnodes
# <n5> (1 cpu), <n6> (1 cpu), and <n7> (2 cpus), only <n7>'s cpus got
# released. <n5> and <n6> are part of a super-chunk that wasn't fully
# released, so their licenses are still held.
self.license_count_match(6)
# Check account update ('u') record
self.match_accounting_log('u', jid, exec_host_esc,
exec_vnode_esc, "6gb", 8, 3,
self.job1_extra_res_place,
sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, new_exec_host_esc,
new_exec_vnode_esc, "3145728kb",
4, 2, self.job1_extra_res_place, newsel_esc)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
# still job-busy
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
# still job-busy
self.match_vnode_status([self.n3, self.n6], 'job-busy', jobs_assn1, 1,
'0kb')
# <n7> is now free
self.match_vnode_status([self.n0, self.n7, self.n8, self.n9, self.n10],
'free')
self.server.expect(SERVER, {'resources_assigned.ncpus': 6,
'resources_assigned.mem': '4194304kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 6,
'resources_assigned.mem': '4194304kb'},
id="workq")
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, new_exec_host, newsel))
self.server.delete(jid)
# Check account phased end ('e') record
self.match_accounting_log('e', jid, new_exec_host_esc,
new_exec_vnode_esc,
"3145728kb", 4, 2,
self.job1_extra_res_place,
newsel_esc)
# Check to make sure 'E' (end of job) record got generated
self.match_accounting_log('E', jid, exec_host_esc,
exec_vnode_esc, "6gb",
8, 3, self.job1_extra_res_place,
sel_esc)
def test_release_nodes6(self):
"""
Test:
Given: a job that has been submitted with a select spec
of 2 super-chunks of ncpus=3 and mem=2gb each,
and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter", resulting in an
exec_vnode=
(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
Executing pbs_release_nodes -j <job-id> <n4> <n5> <n6> <n7>
is equivalent to doing 'pbs_release_nodes -a' which
will have the same result as test_release_nodes_all.
That is, all sister nodes assigned to the job are
released early from the job.
"""
jid = self.create_and_submit_job('job1_5')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select': self.job1_select,
'Resource_List.place': self.job1_place,
'schedselect': self.job1_schedselect,
'exec_host': self.job1_exec_host,
'exec_vnode': self.job1_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_exec_host))
# Run pbs_release_nodes as regular user
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
self.n6, self.n7]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER)
self.assertEqual(ret['rc'], 0)
# Verify mom_logs
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostB), n=10,
max_attempts=18, interval=2, regexp=True)
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
max_attempts=18, interval=2, regexp=True)
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
# Ensure the 'fib' process is gone when DELETE_JOB2 received on momB
self.server.pu.get_proc_info(
self.momB.hostname, ".*fib.*", None, regexp=True)
self.assertEqual(len(self.server.pu.processes), 0)
# Ensure the 'fib' process is gone when DELETE_JOB2 received on momC
self.server.pu.get_proc_info(
self.momC.hostname, ".*fib.*", None, regexp=True)
self.assertEqual(len(self.server.pu.processes), 0)
# Verify remaining job resources.
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '2gb',
'Resource_List.ncpus': 3,
'Resource_List.select': self.job1_newsel,
'Resource_List.place': self.job1_place,
'Resource_List.nodect': 1,
'schedselect': self.job1_newsel,
'exec_host': self.job1_new_exec_host,
'exec_vnode': self.job1_new_exec_vnode},
id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(3)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
# nodes <n4>, <n5>, <n6>, <n7> are all free now
self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
self.n7, self.n8, self.n9, self.n10], 'free')
self.server.expect(SERVER, {'resources_assigned.ncpus': 3,
'resources_assigned.mem': '2097152kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 3,
'resources_assigned.mem': '2097152kb'},
id="workq")
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid, self.job1_new_exec_host))
# Check account update ('u') record
self.match_accounting_log('u', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, self.job1_new_exec_host,
self.job1_new_exec_vnode_esc, "2097152kb",
3, 1, self.job1_place, self.job1_newsel)
# Delete the job to get the end records in the accounting_logs
self.server.delete(jid)
# Check account phased end job ('e') record
self.match_accounting_log('e', jid, self.job1_new_exec_host,
self.job1_new_exec_vnode_esc, "2097152kb", 3,
1, self.job1_place, self.job1_newsel)
# Check account end of job ('E') record
self.match_accounting_log('E', jid, self.job1_exec_host_esc,
self.job1_exec_vnode_esc, "6gb", 8, 3,
self.job1_place,
self.job1_sel_esc)
def test_release_nodes6_extra(self):
"""
Test:
Like test_release_nodes6 except instead of the super-chunk
and chunks getting only ncpus and mem values, additional
resources mpiprocs and ompthreads are also requested and
assigned:
For example:
qsub -l select="ncpus=3:mem=2gb:mpiprocs=3:ompthreads=2+
ncpus=3:mem=2gb:mpiprocs=3:ompthreads=3+
ncpus=2:mem=2gb:mpiprocs=2:ompthreads=2"
We want to make sure the ompthreads and mpiprocs values are
preserved in the new exec_vnode, and that in the $PBS_NODEFILE,
the host names are duplicated according to the number of
mpiprocs. For example, if <hostA> is assigned to the first
chunk with mpiprocs=3, <hostA> will appear 3 times in
$PBS_NODEFILE.
"""
jid = self.create_and_submit_job('job1_extra_res')
self.server.expect(JOB, {'job_state': 'R',
'Resource_List.mem': '6gb',
'Resource_List.ncpus': 8,
'Resource_List.nodect': 3,
'Resource_List.select':
self.job1_extra_res_select,
'Resource_List.place':
self.job1_extra_res_place,
'schedselect':
self.job1_extra_res_schedselect,
'exec_host': self.job1_extra_res_exec_host,
'exec_vnode': self.job1_extra_res_exec_vnode},
id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(8)
# Check various vnode status.
jobs_assn1 = "%s/0" % (jid,)
self.match_vnode_status([self.n1, self.n2, self.n4, self.n5],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3, self.n6],
'job-busy', jobs_assn1, 1, '0kb')
jobs_assn2 = "%s/0, %s/1" % (jid, jid)
self.match_vnode_status([self.n7], 'job-busy', jobs_assn2,
2, '2097152kb')
self.match_vnode_status([self.n0, self.n8, self.n9, self.n10], 'free')
self.assertTrue(
self.pbs_nodefile_match_exec_host(jid,
self.job1_extra_res_exec_host,
self.job1_extra_res_schedselect))
# Run pbs_release_nodes as regular user
cmd = [self.pbs_release_nodes_cmd, '-j', jid, self.n4, self.n5,
self.n6, self.n7]
ret = self.server.du.run_cmd(self.server.hostname, cmd,
runas=TEST_USER)
self.assertEqual(ret['rc'], 0)
# Verify mom_logs
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostB), n=10,
max_attempts=18, interval=2, regexp=True)
self.momA.log_match(
"Job;%s;%s.+cput=.+ mem=.+" % (jid, self.hostC), n=10,
max_attempts=18, interval=2, regexp=True)
self.momB.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
self.momC.log_match("Job;%s;DELETE_JOB2 received" % (jid,), n=20,
max_attempts=18, interval=2)
# Ensure the 'fib' process is gone when DELETE_JOB2 received on momB
self.server.pu.get_proc_info(
self.momB.hostname, ".*fib.*", None, regexp=True)
self.assertEqual(len(self.server.pu.processes), 0)
# Ensure the 'fib' process is gone when DELETE_JOB2 received on momC
self.server.pu.get_proc_info(
self.momC.hostname, ".*fib.*", None, regexp=True)
self.assertEqual(len(self.server.pu.processes), 0)
# Verify remaining job resources.
sel_esc = self.job1_extra_res_select.replace("+", "\+")
exec_host_esc = self.job1_extra_res_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
exec_vnode_esc = \
self.job1_extra_res_exec_vnode.replace("[", "\[").replace(
"]", "\]").replace(
"(", "\(").replace(")", "\)").replace("+", "\+")
newsel = "1:mem=2097152kb:ncpus=3:mpiprocs=3:ompthreads=2"
newsel_esc = newsel.replace("+", "\+")
new_exec_host = self.job1_extra_res_exec_host.replace(
"+%s/0*2" % (self.n7,), "")
new_exec_host = new_exec_host.replace("+%s/0*0" % (self.n4,), "")
new_exec_host_esc = new_exec_host.replace(
"*", "\*").replace("[", "\[").replace("]", "\]").replace("+", "\+")
new_exec_vnode = self.job1_extra_res_exec_vnode.replace(
"+%s:mem=1048576kb:ncpus=1" % (self.n5,), "")
new_exec_vnode = new_exec_vnode.replace(
"+%s:ncpus=1" % (self.n6,), "")
new_exec_vnode = new_exec_vnode.replace(
"+(%s:mem=1048576kb:ncpus=1)" % (self.n4,), "")
new_exec_vnode = new_exec_vnode.replace(
"+(%s:ncpus=2:mem=2097152kb)" % (self.n7,), "")
new_exec_vnode_esc = \
new_exec_vnode.replace("[", "\[").replace("]", "\]").replace(
"(", "\(").replace(")", "\)").replace("+", "\+")
self.server.expect(JOB,
{'job_state': 'R',
'Resource_List.mem': '2gb',
'Resource_List.ncpus': 3,
'Resource_List.select': newsel,
'Resource_List.place':
self.job1_extra_res_place,
'Resource_List.nodect': 1,
'schedselect': newsel,
'exec_host': new_exec_host,
'exec_vnode': new_exec_vnode}, id=jid)
# server's license_count used value matches job's 'ncpus' value.
self.license_count_match(3)
# Check various vnode status.
self.match_vnode_status([self.n1, self.n2],
'job-busy', jobs_assn1, 1, '1048576kb')
self.match_vnode_status([self.n3], 'job-busy', jobs_assn1, 1, '0kb')
self.server.expect(SERVER, {'resources_assigned.ncpus': 3,
'resources_assigned.mem': '2097152kb'})
self.server.expect(QUEUE, {'resources_assigned.ncpus': 3,
'resources_assigned.mem': '2097152kb'},
id="workq")
# nodes <n4>, <n5>, <n6>, <n7> are all free now
self.match_vnode_status([self.n0, self.n4, self.n5, self.n6,
self.n7, self.n8, self.n9, self.n10], 'free')
# Ensure the $PBS_NODEFILE contents account for the mpiprocs value;
# that is, each node hostname is listed 'mpiprocs' number of times in
# the file.
self.assertTrue(
self.pbs_nodefile_match_exec_host(
jid, self.job1_new_exec_host, newsel))
# Check account update ('u') record
self.match_accounting_log('u', jid, exec_host_esc,
exec_vnode_esc,
"6gb", 8, 3,
self.job1_extra_res_place,
sel_esc)
# Check to make sure 'c' (next) record got generated
self.match_accounting_log('c', jid, new_exec_host_esc,
self.job1_new_exec_vnode_esc, "2097152kb",
3, 1, self.job1_place, newsel_esc)
# Delete the job to get the end records in the accounting_logs
self.server.delete(jid)
# Check account phased end job ('e') record
self.match_accounting_log('e', jid, new_exec_host_esc,
new_exec_vnode_esc, "2097152kb", 3,
1, self.job1_place, newsel_esc)
# Check account end of job ('E') record
self.match_accounting_log('E', jid, exec_host_esc,
exec_vnode_esc, "6gb", 8, 3,
self.job1_place, sel_esc)
# A longer timeout is needed as the following test takes a bit
# longer waiting for the job to finish due to stage out
@timeout(400)
def test_release_nodes_cmd_plus_stageout(self):
"""
Test:
This tests calling the pbs_release_nodes command on a job
submitted with the release_nodes_on_stageout option.
Given a job submitted as:
qsub -W release_nodes_on_stageout=true job.script
where job.script specifies a select spec of
2 super-chunks of ncpus=3 and mem=2gb each,
and 1 chunk of ncpus=2 and mem=2gb, along with
place spec of "scatter", resulting in an:
exec_vnode=(<n1>+<n2>+<n3>)+(<n4>+<n5>+<n6>)+(<n7>)
Then issue:
pbs_release_nodes -j <job-id>
This would generate a 'u' and 'c' accounting record.
while