pbs_mom.8B 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402
  1. .\" Copyright (C) 1994-2018 Altair Engineering, Inc.
  2. .\" For more information, contact Altair at www.altair.com.
  3. .\"
  4. .\" This file is part of the PBS Professional ("PBS Pro") software.
  5. .\"
  6. .\" Open Source License Information:
  7. .\"
  8. .\" PBS Pro is free software. You can redistribute it and/or modify it under the
  9. .\" terms of the GNU Affero General Public License as published by the Free
  10. .\" Software Foundation, either version 3 of the License, or (at your option) any
  11. .\" later version.
  12. .\"
  13. .\" PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
  14. .\" WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  15. .\" FOR A PARTICULAR PURPOSE.
  16. .\" See the GNU Affero General Public License for more details.
  17. .\"
  18. .\" You should have received a copy of the GNU Affero General Public License
  19. .\" along with this program. If not, see <http://www.gnu.org/licenses/>.
  20. .\"
  21. .\" Commercial License Information:
  22. .\"
  23. .\" For a copy of the commercial license terms and conditions,
  24. .\" go to: (http://www.pbspro.com/UserArea/agreement.html)
  25. .\" or contact the Altair Legal Department.
  26. .\"
  27. .\" Altair’s dual-license business model allows companies, individuals, and
  28. .\" organizations to create proprietary derivative works of PBS Pro and
  29. .\" distribute them - whether embedded or bundled with other software -
  30. .\" under a commercial license agreement.
  31. .\"
  32. .\" Use of Altair’s trademarks, including but not limited to "PBS™",
  33. .\" "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
  34. .\" trademark licensing policies.
  35. .\"
  36. .TH pbs_mom 8B "8 November 2017" Local "PBS Professional"
  37. .SH NAME
  38. .B pbs_mom
  39. - run the PBS job monitoring and execution daemon
  40. .SH SYNOPSIS
  41. .B pbs_mom
  42. [-a <alarm timeout>]
  43. [-C <checkpoint directory>]
  44. .RS 8
  45. [-c <config file>]
  46. [-d <MoM home directory>]
  47. [-L <logfile>]
  48. .br
  49. [-M <MoM service port>]
  50. [-N]
  51. [-n <nice value>]
  52. [-p|-r]
  53. .br
  54. [-R <inter-MoM communication port>]
  55. [-S <server port>]
  56. .br
  57. [-s <script options>]
  58. .RE
  59. .B pbs_mom
  60. --version
  61. .SH DESCRIPTION
  62. The
  63. .B pbs_mom
  64. command starts the PBS job monitoring and execution daemon, called
  65. MoM.
  66. The standard MoM starts jobs on the execution host, monitors and reports
  67. resource usage, enforces resource usage limits, and notifies the
  68. server when the job is finished. The MoM also runs any prologue
  69. scripts before the job runs, and runs any epilogue scripts after the
  70. job runs.
  71. The MoM performs any communication with job tasks and with other MoMs.
  72. The MoM on the first vnode on which a job is running manages
  73. communication with the MoMs on the remaining vnodes on which the job
  74. runs.
  75. The MoM manages one or more vnodes. PBS may treat a host as
  76. a set of virtual nodes, in which case one MoM
  77. manages all of the host's vnodes. See the
  78. .B PBS Professional Administrator's Guide.
  79. .B Logging
  80. .br
  81. The MoM's log file is in PBS_HOME/mom_logs. The MoM writes an
  82. error message in its log file when it encounters any error. If it
  83. cannot write to its log file, it writes to standard error. The
  84. MoM writes events to its log file.
  85. The MoM writes its PBS
  86. version and build information to the logfile whenever it starts up or
  87. the logfile is rolled to a new file.
  88. .B Required Permission
  89. .br
  90. The executable for
  91. .B pbs_mom
  92. is in PBS_EXEC/sbin, and can be run only by root on Linux, and Admin
  93. on Windows.
  94. .B Cpusets
  95. .br
  96. A cpusetted machine can have a "boot cpuset" defined by the
  97. administrator. A boot cpuset contains one or more CPUs and memory
  98. boards and is used to restrict the default placement of system
  99. processes, including login. If defined, the boot cpuset contains
  100. CPU 0.
  101. Run parallel jobs exclusively within a cpuset for repeatability of
  102. performance. HPE SGI states, "Using cpusets on an HPE SGI system improves
  103. cache locality and memory access times and can substantially improve
  104. an application's performance and runtime repeatability."
  105. The CPUSET_CPU_EXCLUSIVE flag prevents CPU 0 from being used by
  106. the MoM in the creation of job cpusets. This flag is set by default,
  107. so this is the default behavior.
  108. To find out which cpuset is assigned to a running job, use
  109. .B qstat
  110. -f
  111. to see the
  112. .I cpuset
  113. field in the job's
  114. .I altid
  115. attribute.
  116. .B HPE SGI Machine Running Supported Versions of Performance Software -
  117. .B Message Passing Interface
  118. .br
  119. The cpusets created for jobs are marked cpu-exclusive.
  120. MoM does not use any CPU which was in use at startup.
  121. A PBS job can run across multiple machines that run supported versions
  122. of Performance Software - Message Passing Interface.
  123. PBS can run using HPE SGI's MPI (MPT) over InfiniBand. See the
  124. .B PBS Professional Administrator's Guide.
  125. .LP
  126. .B Effect on Jobs of Starting MoM
  127. .br
  128. When MoM is started or restarted, her default behavior is to leave
  129. any running processes running, but to tell the PBS server to requeue
  130. the jobs she manages. MoM tracks the process ID of jobs across
  131. restarts.
  132. In order to have all jobs killed and requeued, use the
  133. .I r
  134. option when starting or restarting MoM.
  135. In order to leave any running processes running, and not to requeue
  136. any jobs, use the
  137. .I p
  138. option when starting or restarting MoM.
  139. .SH OPTIONS
  140. .IP "-a <alarm timeout>" 10
  141. Number of seconds before alarm timeout.
  142. Whenever a resource request is processed, an alarm is set for the
  143. given amount of time. If the request has not completed before
  144. .I alarm timeout,
  145. the OS generates an alarm signal and sends it to MoM.
  146. Default: 10 seconds. Format: integer.
  147. .IP "-C <checkpoint directory>" 10
  148. Specifies the path of the directory where MoM creates job-specific
  149. subdirectories used to hold each job's restart files. MoM passes this
  150. path to checkpoint and restart scripts. Overrides other checkpoint
  151. path specification methods. Any directory specified with the
  152. .I -C
  153. option must be owned, readable, writable, and executable by root only
  154. .I (rwx,---,---, or 0700),
  155. to protect the security of the checkpoint files. See the
  156. .I -d
  157. option. Format: string.
  158. .br
  159. Default: PBS_HOME/spool/checkpoint.
  160. .IP "-c <config file>" 10
  161. MoM will read this alternate default configuration file upon starting.
  162. If this is a relative file name it will be relative to
  163. PBS_HOME/mom_priv. If the specified file cannot be opened,
  164. .B pbs_mom
  165. will abort. See the
  166. .I -d
  167. option.
  168. MoM's normal operation, when the -c option is not given, is to attempt
  169. to open the default configuration file PBS_HOME/mom_priv/config.
  170. If this file is not present,
  171. .B pbs_mom
  172. will log the fact and continue.
  173. .IP "-d <MoM home directory>" 10
  174. Specifies the path of the
  175. .I directory
  176. to be used in place of PBS_HOME by
  177. .B pbs_mom.
  178. The default directory is given by $PBS_HOME. Format: string.
  179. .IP "-L <logfile>" 10
  180. Specifies an absolute path and filename for the log file.
  181. The default is a file named for the current date in PBS_HOME/mom_logs/.
  182. See the
  183. .I -d
  184. option. Format: string.
  185. .IP "-M <MoM service port>" 10
  186. Specifies the number of the port on which MoM will
  187. listen for server requests and instructions. Overrides
  188. PBS_MOM_SERVICE_PORT setting in pbs.conf and environment variable.
  189. Default: 15002.
  190. Format: integer port number.
  191. .IP "-n <nice value>" 10
  192. Specifies the priority for the
  193. .B pbs_mom
  194. daemon. Format: integer.
  195. .IP "-N" 10
  196. Specifies that when starting, MoM should not detach from the
  197. current session.
  198. .IP "-p" 10
  199. Specifies that when starting, MoM should allow any running jobs
  200. to continue running, and not have them requeued. This option
  201. can be used for single-host jobs only; multi-host jobs cannot
  202. be preserved.
  203. Cannot be used with the
  204. .I -r
  205. option.
  206. MoM is not the parent of these jobs.
  207. .RS
  208. .IP "HPE SGI systems running Performance Software - Message Passing Interface" 5
  209. The cpuset-enabled
  210. .B pbs_mom
  211. will, if given the
  212. .I -p
  213. flag, use the existing CPU and memory allocations for the /PBSPro
  214. cpusets.
  215. The default behavior is to remove these cpusets.
  216. Should this fail, MoM will exit, asking to be restarted with the
  217. .I -p
  218. flag.
  219. .LP
  220. .RE
  221. .IP "-r" 10
  222. Specifies that when starting, MoM should requeue any rerunnable jobs and
  223. kill any non-rerunnable jobs that
  224. she was tracking, and mark the
  225. jobs as terminated. Cannot be used with the
  226. .I -p
  227. option.
  228. MoM is not the parent of these jobs.
  229. It is not recommended to use the
  230. .I -r
  231. option after a reboot, because process IDs of new, legitimate tasks
  232. may match those MoM was previously tracking. If they match and MoM is
  233. started with the
  234. .I -r
  235. option, MoM will kill the new tasks.
  236. .IP "-R <inter-MoM communication port>" 10
  237. Specifies the number of the port on which MoM will listen for pings,
  238. resource information requests, communication from other MoMs, etc.
  239. Overrides PBS_MANAGER_SERVICE_PORT setting in pbs.conf and environment variable.
  240. Default: 15003. Format: integer port number.
  241. .IP "-S <server port>" 10
  242. Specifies the port number on which
  243. .B pbs_mom
  244. initially contacts the server. Default: 15001. Format: integer port number.
  245. .IP "-s <script options>" 5
  246. This option provides an interface that allows the administrator to
  247. add, delete, and display MoM's configuration files. See
  248. .B CONFIGURATION FILES.
  249. The
  250. .I script options
  251. are used this way:
  252. .RS
  253. .IP "-s insert <scriptname> <inputfile>" 5
  254. Reads
  255. .I inputfile
  256. and inserts its contents in a new site-defined
  257. .B pbs_mom
  258. configuration file with the filename
  259. .I scriptname.
  260. If a
  261. site-defined configuration file with the name
  262. .I scriptname
  263. already exists,
  264. the operation fails, a diagnostic is presented, and
  265. .B pbs_mom
  266. exits with a nonzero status. Scripts whose names begin with
  267. the prefix "PBS" are reserved. An attempt to add a script
  268. whose name begins with "PBS" will fail.
  269. \fBpbs_mom\fP will print a diagnostic message and exit
  270. with a nonzero status. Example:
  271. .B pbs_mom -s insert <scriptname> <inputfile>
  272. .IP "-s remove <scriptname>" 5
  273. The configuration file named
  274. .I scriptname
  275. is removed
  276. if it exists. If the given name does not exist or if an
  277. attempt is made to remove a script with the reserved "PBS"
  278. prefix, the operation fails, a diagnostic is presented, and
  279. .B pbs_mom
  280. exits with a nonzero status. Example:
  281. .B pbs_mom -s remove <scriptname>
  282. .IP "-s show <scriptname>" 5
  283. Causes the contents of the named script to be printed to
  284. standard output. If
  285. .I scriptname
  286. does not exist, the
  287. operation fails, a diagnostic is presented, and
  288. .B pbs_mom
  289. exits with a nonzero status. Example:
  290. .B pbs_mom -s show <scriptname>
  291. .IP "-s list" 5
  292. Causes
  293. .B pbs_mom
  294. to list the set of PBS-prefixed and site-defined configuration
  295. files in the order in which they are executed. Example:
  296. .B pbs_mom -s list
  297. .LP
  298. .B WINDOWS:
  299. .RS 5
  300. Under Windows, the
  301. .I -N
  302. option must be used, so that
  303. .B pbs_mom
  304. will start up as a standalone
  305. program. For example:
  306. .B pbs_mom -N -s insert <scriptname> <inputfile>
  307. or
  308. .B pbs_mom -N -s list
  309. .RE
  310. .RE
  311. .LP
  312. .IP "--version" 10
  313. The
  314. .B pbs_mom
  315. command returns its PBS version information and exits.
  316. This option can only be used alone.
  317. .SH CONFIGURATION FILES
  318. MoM's configuration information can be contained in configuration
  319. files of three types:
  320. .I default, PBS-prefixed,
  321. and
  322. .I site-defined.
  323. The
  324. default configuration file is usually PBS_HOME/mom_priv/config. The
  325. "PBS" prefix is reserved for files created by PBS. Site-defined
  326. configuration files are those created by the site administrator.
  327. MoM reads the configuration files at startup and reinitialization.
  328. The files are processed in this order:
  329. .br
  330. The default configuration file
  331. .br
  332. PBS-prefixed configuration files
  333. .br
  334. Site-defined configuration files
  335. .br
  336. The contents of a file read later override the contents of a file read earlier.
  337. For example, to change the cpuset flags, create a script "update_flags"
  338. containing only
  339. .RS 4
  340. .B cpuset_create_flags <new flags>
  341. .RE
  342. then use the
  343. .I -s insert
  344. option:
  345. .RS 4
  346. .B pbs_mom -s insert update_script update_flags
  347. .RE
  348. This adds the configuration file "update_script".
  349. Configuration files can be added, deleted and displayed using
  350. the
  351. .I -s
  352. option.
  353. MoM's configuration files can use either the syntax shown
  354. below under
  355. .B Default Syntax and Contents
  356. or the syntax for describing
  357. .I vnodes
  358. shown in
  359. .B Vnode Syntax.
  360. .B Location
  361. .br
  362. The default configuration file is in PBS_HOME/mom_priv/.
  363. PBS places PBS-prefixed and site-defined configuration files
  364. in an area that is private to each installed instance of PBS.
  365. This area is relative to the default PBS_HOME. Note that the
  366. .I -d
  367. option changes where MoM looks for PBS_HOME.
  368. The
  369. .I -c
  370. option will change which default configuration file MoM reads.
  371. Site-defined configuration files can be moved from one installed
  372. instance of PBS to another. Do not move PBS-prefixed configuration
  373. files. To move a set of site-defined configuration files from one
  374. installed instance of PBS to another:
  375. .IP "1" 5
  376. Use the
  377. .I -s list
  378. directive with the "source" instance of PBS to enumerate the
  379. site-defined files.
  380. .IP "2" 5
  381. Use the
  382. .I -s show
  383. directive with each site-defined file of the "source" instance of PBS
  384. to save a copy of that file.
  385. .IP "3" 5
  386. Use the
  387. .I -s insert
  388. directive with each file at the "target" instance of PBS
  389. to create a copy of each site-defined configuration file.
  390. .LP
  391. .B Vnode Configuration File Syntax and Contents
  392. .br
  393. Configuration files with the following syntax describe vnodes and
  394. the resources available on them. They do not contain initialization
  395. values for MoM.
  396. See the
  397. .B PBS Professional Administrator's Guide
  398. for a definition of
  399. .I vnodes.
  400. PBS-prefixed configuration files use the following syntax. Other
  401. configuration files can use the following syntax.
  402. Any configuration file containing vnode-specific assignments must
  403. begin with this line:
  404. .RS 4
  405. .B $configversion 2
  406. .RE
  407. The format of a file containing vnode information is:
  408. .RS 4
  409. .I <ID> : <ATTRNAME> = <ATTRVAL>
  410. .RE
  411. where
  412. .RS 4
  413. .IP "<ID>" 12
  414. sequence of characters not including a colon (":")
  415. .IP "<ATTRNAME>" 12
  416. sequence of characters beginning with alphabetics or numerics, which
  417. can contain underscore ("_") and dash ("-")
  418. .IP "<ATTRVAL>" 12
  419. sequence of characters not including an equal sign ("=")
  420. .LP
  421. The colon and equal sign may be surrounded by spaces.
  422. .RE
  423. A vnode's
  424. .I ID
  425. is an identifier that will be unique across all
  426. vnodes known to a given
  427. .B pbs_server
  428. and will be stable across
  429. reinitializations or invocations of
  430. .B pbs_mom.
  431. ID stability is
  432. of importance when a vnode's CPUs or memory might be expected
  433. to change over time and PBS is expected to adapt to such changes
  434. by resuming suspended jobs on the same vnodes to which they
  435. were originally assigned. Vnodes for which this is not a
  436. consideration may simply use IDs of the form "0", "1", etc.
  437. concatenated with some identifier that ensures uniqueness across
  438. the vnodes served by the
  439. .B pbs_server.
  440. A
  441. .I natural vnode
  442. does not correspond to any actual hardware. It is used to define
  443. any placement set information that is invariant for a given host,
  444. such as pnames.
  445. It is defined as
  446. follows:
  447. .br
  448. .IP "" 5
  449. The name of the natural vnode is, by convention,
  450. the MoM contact name, which is usually the hostname.
  451. The MoM contact name is the vnode's MoM attribute. See the
  452. .B pbs_node_attributes(7B) man page.
  453. .IP "" 5
  454. An attribute, "pnames", with value set to the list of
  455. resource names that define the placement sets' types for
  456. this machine.
  457. .IP "" 5
  458. An attribute, "sharing", with value set to "force_shared".
  459. .LP
  460. The
  461. .I natural vnode
  462. is used to define any placement set information that is invariant for
  463. a given host (e.g. the placement set resource names themselves).
  464. The order of the
  465. .I pnames
  466. attribute follows placement set organization. If
  467. name X appears to the left of name Y in this attribute's value, an
  468. entity of type X may be assumed to be smaller (that is, be
  469. capable of containing fewer vnodes) than one of type Y. No such
  470. guarantee is made for specific instances of the types.
  471. For example, on an HPE SGI machine named "HostA", with two vnodes, a natural
  472. vnode, four processors and two cbricks, the description would
  473. look like this:
  474. .br
  475. HostA: pnames = cbrick
  476. .br
  477. HostA: sharing = force_shared
  478. .br
  479. HostA[001c02#0]: sharing = default_excl
  480. .br
  481. HostA[001c02#0]: resources_available.ncpus = 2
  482. .br
  483. HostA[001c02#0]: resources_available.cbrick = cbrick-0
  484. .br
  485. HostA[001c02#0]: resources_available.mem = 1968448kb
  486. .br
  487. HostA[001c04#0]: sharing = default_excl
  488. .br
  489. HostA[001c04#0]: resources_available.ncpus = 2
  490. .br
  491. HostA[001c04#0]: resources_available.cbrick = cbrick-1
  492. .br
  493. HostA[001c04#0]: resources_available.mem = 1961328kb
  494. .br
  495. The natural vnode is described in the first two lines.
  496. The first vnode uses cbrick-0, and the second one uses cbrick-1.
  497. .B Default Syntax and Contents
  498. .br
  499. Configuration files with this syntax list local resources and
  500. initialization values for MoM. Local resources are either static,
  501. listed by name and value, or externally-provided, listed by name and
  502. command path. See the
  503. .I -c
  504. option.
  505. Each configuration item is listed on a single line, with its parts
  506. separated by white space. Comments begin with a hashmark ("#").
  507. The default configuration file must be secure. It must be owned by a user ID
  508. and group ID both less than 10 and must not be world-writable.
  509. .B Externally-provided Resources
  510. .br
  511. Externally-provided resources use a shell escape to run a command.
  512. These resources are described with a name and value,
  513. where the first character of the value is an exclamation mark ("!").
  514. The remainder of the value is the path and command to execute.
  515. Parameters in the command beginning with a percent sign ("%") can
  516. be replaced when the command is executed.
  517. For example, this line in a configuration file describes a
  518. resource named "escape":
  519. .RS 14
  520. escape !echo %xxx %yyy
  521. .RE
  522. .IP
  523. If a query for the "escape" resource is sent with no parameter replacements,
  524. the command executed would be "echo %xxx %yyy". If one parameter replacement is sent,
  525. "escape[xxx=hi there]", the command executed would be "echo hi there %yyy".
  526. If two parameter replacements are sent, "escape[xxx=hi][yyy=there]", the command
  527. executed would be "echo hi there". If a parameter replacement is sent with
  528. no matching token in the command line, "escape[zzz=snafu]", an error
  529. is reported.
  530. .LP
  531. .B Windows Notes
  532. .br
  533. If the argument to a MoM option is a pathname containing a space,
  534. enclose it in double quotes as in the following:
  535. hostn !"\\Program Files\\PBS Pro\\exec\\bin\\hostn" host
  536. When you edit any PBS configuration file, make sure that you put a
  537. newline at the end of the file. The Notepad application does not
  538. automatically add a newline at the end of a file; you must explicitly
  539. add the newline.
  540. .B Replacing Actions
  541. .br
  542. .IP "$action <default action> <timeout> <new action>" 5
  543. Replaces the
  544. .I default action
  545. for an event with the site-specified
  546. .I new action.
  547. .I timeout
  548. is the time allowed for
  549. .I new action
  550. to run. See
  551. .B The PBS Professional Administrator's Guide.
  552. The
  553. .I default action
  554. can be one of:
  555. .RS
  556. .IP "checkpoint" 5
  557. Run
  558. .I new action
  559. in place of the periodic job checkpoint, after which the job
  560. continues to run.
  561. .IP "checkpoint_abort" 5
  562. Run
  563. .I new action
  564. to checkpoint the job, after which the job must be terminated by the script.
  565. .IP "multinodebusy <timeout> requeue" 5
  566. Used with cycle harvesting and multi-vnode jobs.
  567. Changes default behavior when a vnode becomes busy. Instead of
  568. allowing the job to run, the job is requeued.
  569. .I timeout
  570. is ignored. The only
  571. .I new action
  572. is
  573. .I requeue.
  574. .IP "restart" 5
  575. Runs
  576. .I new action
  577. in place of
  578. .I restart.
  579. .IP "terminate" 5
  580. Runs
  581. .I new action
  582. in place of SIGTERM or SIGKILL when MoM terminates a job.
  583. .RE
  584. .SH Initialization Values
  585. Initialization value directives have names beginning with a
  586. dollar sign ("$").
  587. See
  588. .B The PBS Professional Administrator's Guide.
  589. .IP "$alps_client <path>" 5
  590. Cray only. MoM runs this command to get the ALPS inventory. Must
  591. be full path to command.
  592. .br
  593. Format: path to command
  594. .br
  595. Default: None
  596. .IP "$alps_release_timeout <timeout>" 5
  597. Cray only. Specifies the amount of time that PBS tries to release an
  598. ALPS reservation before giving up. After this amount of time has
  599. passed, PBS stops trying to release the ALPS reservation, the job
  600. exits, and the job's resources are released. PBS sends a HUP to the
  601. MoM so that she re-reads the ALPS inventory to get the current
  602. available ALPS resources.
  603. .br
  604. We recommend that the value for this parameter be greater than the value for
  605. .I suspectbegin.
  606. .br
  607. Format: Seconds, specified as positive integer.
  608. .br
  609. Default: 600 (10 minutes)
  610. .IP "$checkpoint_path <path>" 5
  611. MoM passes this path to checkpoint and restart scripts.
  612. This path can be absolute or relative to PBS_HOME/mom_priv.
  613. Overrides default. Overridden by
  614. .I pbs_mom -C
  615. option and by
  616. .I PBS_CHECKPOINT_PATH
  617. environment variable.
  618. .IP "$clienthost <hostname>" 5
  619. .I hostname
  620. is added to the list of hosts which are allowed
  621. to connect to MoM as long as they are using a privileged port.
  622. For example,
  623. this allows the hosts "fred" and "wilma"
  624. to connect to MoM:
  625. .br
  626. "$clienthost fred"
  627. .br
  628. "$clienthost wilma"
  629. .br
  630. The following hostnames are added to
  631. .I $clienthost
  632. automatically: the
  633. server, the localhost, and if configured, the secondary server. The
  634. server sends each MoM a list of the hosts in the nodes file, and these
  635. are added internally to
  636. .I $clienthost.
  637. None of these hostnames need to
  638. be listed in the configuration file.
  639. Two hostnames are always allowed to connect to
  640. .B pbs_mom,
  641. "localhost" and the name returned to MoM
  642. by the system call gethostname(). These
  643. hostnames do not need to be listed in the configuration file.
  644. The hosts listed
  645. as "clienthosts" make up a "sisterhood" of machines. Any one of the
  646. sisterhood will accept connections from within the
  647. sisterhood. The sisterhood must all use the same port number.
  648. .IP "$cpuset_error_action <action>" 5
  649. When using a cpuset-enabled MoM, specifies the action taken when
  650. a cpuset creation error occurs. Can take one of the following values:
  651. .RS 5
  652. .IP continue 3
  653. The error is logged and the job is killed and requeued.
  654. .IP offline 3
  655. The vnodes on this host for this job are marked
  656. .I offline,
  657. and the job is requeued.
  658. .LP
  659. .br
  660. Format: String
  661. .br
  662. Allowable values: "continue", "offline"
  663. .br
  664. Default: "offline"
  665. .RE
  666. .IP "$cputmult <factor>" 5
  667. This sets a
  668. .I factor
  669. used to adjust CPU time used by each job. This allows adjustment of time
  670. charged and limits enforced where jobs run on a system with
  671. different CPU performance. If MoM's system is faster than the
  672. reference system, set
  673. .I factor
  674. to a decimal value greater than 1.0. For example:
  675. .RS 9
  676. $cputmult 1.5
  677. .RE
  678. .IP
  679. If MoM's system is slower, set
  680. .I factor
  681. to a value between 1.0 and 0.0. For example:
  682. .RS 9
  683. $cputmult 0.75
  684. .RE
  685. .IP
  686. .IP "$dce_refresh_delta <delta>" 5
  687. Defines the number of seconds between successive refreshings of a job's
  688. DCE login context.
  689. For example:
  690. .RS 9
  691. $dce_refresh_delta 18000
  692. .RE
  693. .IP
  694. .IP "$enforce <limit>" 5
  695. MoM will enforce the given
  696. .I limit.
  697. Some
  698. .I limits
  699. have associated values. Syntax:
  700. .br
  701. .I $enforce <variable name> <value>
  702. .br
  703. See
  704. .B The PBS Professional Administrator's Guide.
  705. .RS
  706. .IP "$enforce mem" 5
  707. MoM will enforce each job's memory limit.
  708. .IP "$enforce cpuaverage" 5
  709. MoM will enforce ncpus when the average CPU usage over a job's
  710. lifetime is greater than the job's limit.
  711. .RS
  712. .IP "$enforce average_trialperiod <seconds>" 5
  713. Modifies
  714. .I cpuaverage.
  715. Minimum number of
  716. .I seconds
  717. of job walltime before enforcement begins. Default: 120.
  718. Integer.
  719. .IP "$enforce average_percent_over <percentage>" 5
  720. Modifies
  721. .I cpuaverage.
  722. Gives
  723. .I percentage
  724. by which a job may exceed its ncpus limit. Default: 50.
  725. Integer.
  726. .IP "$enforce average_cpufactor <factor>" 5
  727. Modifies
  728. .I cpuaverage.
  729. The ncpus limit is multiplied by
  730. .I factor
  731. to produce actual
  732. limit. Default: 1.025. Float.
  733. .RE
  734. .IP "$enforce cpuburst" 5
  735. MoM will enforce the ncpus limit when CPU burst usage exceeds
  736. the job's limit.
  737. .RS
  738. .IP "$enforce delta_percent_over <percentage>" 5
  739. Modifies
  740. .I cpuburst.
  741. Gives
  742. .I percentage
  743. over limit to be allowed. Default: 50. Integer.
  744. .IP "$enforce delta_cpufactor <factor>" 5
  745. Modifies
  746. .I cpuburst.
  747. The ncpus limit is multiplied by
  748. .I factor
  749. to produce actual limit. Default: 1.5. Float.
  750. .IP "$enforce delta_weightup <factor>" 5
  751. Modifies
  752. .I cpuburst.
  753. Weighting factor for smoothing burst usage when average is increasing. Default: 0.4.
  754. Float.
  755. .IP "$enforce delta_weightdown <factor>" 5
  756. Modifies
  757. .I cpuburst.
  758. Weighting factor
  759. for smoothing burst usage when average is decreasing. Default: 0.4.
  760. Float.
  761. .RE
  762. .RE
  763. .IP "$ideal_load <load>" 5
  764. Defines the
  765. .I load
  766. below which the vnode is not considered to be busy.
  767. Used with
  768. the
  769. .I $max_load
  770. directive.
  771. No default. Float.
  772. .RS
  773. .IP "Example:" 5
  774. $ideal_load 1.8
  775. .LP
  776. .br
  777. Use of $ideal_load adds a static resource to the vnode called "ideal_load",
  778. which is only internally visible.
  779. .LP
  780. .RE
  781. .IP "$jobdir_root <stage directory root>" 5
  782. Directory under which PBS creates job-specific staging and execution directories.
  783. PBS creates a job's staging and execution directory when the job's
  784. .I sandbox
  785. attribute is set to PRIVATE. If
  786. .I $jobdir_root
  787. is unset, it defaults to the job owner's home directory.
  788. In this case the user's home directory must exist.
  789. If
  790. .I stage directory root
  791. does not exist when MoM starts up, MoM will abort. If
  792. .I stage directory root
  793. does not exist when MoM tries to run a job, MoM will kill the job.
  794. Path must be owned by root, and permissions must be 1777. On Windows,
  795. this directory should have Full Control Permission for the local
  796. Administrators group.
  797. .RS
  798. .IP "Example:" 5
  799. $jobdir_root /scratch/foo
  800. .RE
  801. .IP "$kbd_idle <idle wait> <min use> <poll interval>" 5
  802. Declares that the vnode will be used for batch jobs during periods when
  803. the keyboard and mouse are not in use.
  804. The vnode must be idle for a minimum of
  805. .I idle wait
  806. seconds before being considered available for batch jobs.
  807. No default. Integer.
  808. The vnode must be in use for a minimum of
  809. .I min use
  810. seconds before it becomes unavailable for batch jobs. Default: 10. Integer.
  811. MoM checks for activity every
  812. .I poll interval
  813. seconds. Default: 1. Integer.
  814. .RS
  815. .IP "Example:" 5
  816. $kbd_idle 1800 10 5
  817. .RE
  818. .IP "$logevent <mask>" 5
  819. Sets the
  820. .I mask
  821. that determines which event types are logged by
  822. .B pbs_mom.
  823. To include all debug events, use 0xffffffff.
  824. .nf
  825. Log events:
  826. Name Hex Value Message Category
  827. ---------------------------------------------------
  828. ERROR 0001 Internal errors
  829. SYSTEM 0002 System errors
  830. ADMIN 0004 Administrative events
  831. JOB 0008 Job-related events
  832. JOB_USAGE 0010 Job accounting info
  833. SECURITY 0020 Security violations
  834. SCHED 0040 Scheduler events
  835. DEBUG 0080 Common debug messages
  836. DEBUG2 0100 Uncommon debug messages
  837. RESV 0200 Reservation-related info
  838. DEBUG3 0400 Rare debug messages
  839. DEBUG4 0800 Limit-related messages
  840. .fi
  841. .IP "$max_check_poll <seconds>" 5
  842. Maximum time between polling cycles, in seconds. Minimum recommended
  843. value: 30 seconds. See the
  844. .B PBS Professional Administrator's Guide
  845. for usage.
  846. The interval between each poll starts at
  847. .I $min_check_poll
  848. and increases with each cycle until it reaches
  849. .I $max_check_poll,
  850. after which it remains the same. The amount by which the cycle increases is 1/20 of
  851. the difference between
  852. .I $max_check_poll
  853. and
  854. .I $min_check_poll.
  855. .br
  856. Format: Integer
  857. .br
  858. Minimum value: 1 second
  859. .br
  860. Default value: 120 seconds
  861. .IP "$max_load <load> [suspend]" 5
  862. Defines the load above which the vnode is considered to be busy.
  863. Used with
  864. the
  865. .I $ideal_load
  866. directive. No new jobs are started on a
  867. .I busy
  868. vnode.
  869. The optional
  870. .I suspend
  871. directive tells PBS to suspend jobs running on
  872. the vnode if the load average exceeds the
  873. .I $max_load
  874. number, regardless of the source of the load (PBS and/or logged-in users).
  875. Without this directive, PBS will not suspend jobs due to load.
  876. We recommend setting
  877. .I load
  878. to a value that is slightly higher than the number of CPUs,
  879. for example 0.25 +
  880. .I ncpus.
  881. .br
  882. Default: number of CPUs on machine
  883. .br
  884. Format: Float
  885. .br
  886. Example:
  887. .RS 8
  888. $max_load 3.5
  889. .RE
  890. .IP
  891. .IP "$min_check_poll <seconds>" 5
  892. Minimum time between polling cycles, in seconds. Must be
  893. greater than zero and less than
  894. .I $max_check_poll.
  895. Minimum recommended value: 10 seconds.
  896. .br
  897. Format: Integer
  898. .br
  899. Minimum value: 1 second
  900. .br
  901. Default value: 10 seconds
  902. .IP "$prologalarm <timeout>" 5
  903. Defines the maximum number of seconds the prologue and epilogue
  904. may run before timing out. Default: 30 seconds. Integer.
  905. Example:
  906. .RS 8
  907. $prologalarm 30
  908. .RE
  909. .IP
  910. .IP "$reject_root_scripts <True | False>" 5
  911. When set to
  912. .I True,
  913. MoM won't acquire any new hook scripts, and MoM won't run job scripts that would execute
  914. as root or Admin. However, MoM will run previously-acquired hooks that run as root.
  915. .br
  916. Format: Boolean
  917. .br
  918. Default: False
  919. .IP "$restart_background <True | False>" 5
  920. Controls how MoM runs a restart script after checkpointing a job.
  921. When this option is set to
  922. .I True,
  923. MoM forks a child which runs the restart script. The child returns
  924. when all restarts for all the local tasks of the job are done. MoM
  925. does not block on the restart. When this option is set to
  926. .I False,
  927. MoM runs the restart script and waits for the result.
  928. .br
  929. Format: Boolean
  930. .br
  931. Default: False
  932. .IP "$restart_transmogrify <True | False>" 5
  933. Controls how MoM runs a restart script after checkpointing a job.
  934. When this option is set to
  935. .I True,
  936. MoM runs the restart script, replacing the session ID of the original
  937. task's top process with the session ID of the script.
  938. When this option is set to
  939. .I False,
  940. MoM runs the restart script and waits for the result. The restart
  941. script must restore the original session ID for all the processes of
  942. each task so that MoM can continue to track the job.
  943. When this option is set to
  944. .I False
  945. and the restart uses an external command, the configuration parameter
  946. .I $restart_background
  947. is ignored and treated as if it were set to
  948. .I True,
  949. preventing MoM from blocking on the restart.
  950. .br
  951. Format: Boolean
  952. .br
  953. Default: False
  954. .br
  955. .IP "$restrict_user <True | False>" 5
  956. Controls whether users not submitting jobs have access to this
  957. machine. If
  958. .I $restrict_user
  959. is
  960. .I True,
  961. restrictions are applied. See
  962. .I $restrict_user_exceptions
  963. and
  964. .I $restrict_user_maxsysid.
  965. Not supported on Windows.
  966. .br
  967. Format: Boolean
  968. .br
  969. Default: False
  970. .IP "$restrict_user_exceptions <user_list>" 5
  971. Comma-separated list of users who are exempt from access
  972. restrictions applied by
  973. .I $restrict_user.
  974. Leading spaces within each entry are allowed.
  975. Maximum of 10 names.
  976. .IP "$restrict_user_maxsysid <value>" 5
  977. Any user with a numeric user ID less than or equal to
  978. .I value
  979. is exempt from restrictions applied by
  980. $restrict_user.
  981. If
  982. .I $restrict_user
  983. is
  984. .I True
  985. and no
  986. .I value
  987. exists for
  988. .I $restrict_user_maxsysid,
  989. PBS looks in /etc/login.defs, if it exists, for the
  990. .I value.
  991. Otherwise the default is used.
  992. Integer. Default: 999
  993. .IP "$restricted <hostname>" 5
  994. The
  995. .I hostname
  996. is added to the list of hosts which are allowed to connect to MoM
  997. without being required to use a privileged port.
  998. Hostnames can be
  999. wildcarded. For example, to allow queries from any host from the
  1000. domain "xyz.com":
  1001. .RS 9
  1002. $restricted *.xyz.com
  1003. .RE
  1004. .IP
  1005. Queries from the hosts in the restricted list are only allowed
  1006. access to information internal to this host, such as load
  1007. average, memory available, etc. They may not run shell commands.
  1008. .IP "$suspendsig <suspend signal> [resume signal]" 5
  1009. Alternate signal
  1010. .I suspend signal
  1011. is used to suspend jobs instead of SIGSTOP. Optional
  1012. .I resume signal
  1013. is used to resume jobs instead of SIGCONT.
  1014. .IP "$tmpdir <directory>" 5
  1015. Location where each job's scratch directory will be created.
  1016. PBS creates a temporary directory for use by the job, not by PBS.
  1017. PBS creates the directory before the job is run and removes
  1018. the directory and its contents when the job is finished. It is
  1019. scratch space for use by the job. Permission must be 1777 on
  1020. Linux, writable by
  1021. .I Everyone
  1022. on Windows.
  1023. Example:
  1024. .RS 9
  1025. $tmpdir /memfs
  1026. .RE
  1027. .IP
  1028. Default on Linux: /var/tmp
  1029. .br
  1030. Default on Windows: value of the
  1031. .I TMP
  1032. environment variable
  1033. .IP "$usecp <hostname:source prefix> <destination prefix>" 5
  1034. MoM uses /bin/cp to deliver output files when
  1035. the destination is a network mounted file system, or when
  1036. the source and destination are both on the local host, or when
  1037. the
  1038. .I source prefix
  1039. can be replaced with the
  1040. .I destination prefix
  1041. on
  1042. .I hostname.
  1043. Both
  1044. .I source prefix
  1045. and
  1046. .I destination prefix
  1047. are absolute pathnames of directories, not files.
  1048. Overrides
  1049. .I PBS_RCP
  1050. and
  1051. .I PBS_SCP.
  1052. Use trailing
  1053. slashes on both source and destination. For example:
  1054. .RS 9
  1055. $usecp HostA:/users/work/myproj/ /sharedwork/proj_results/
  1056. .RE
  1057. .IP
  1058. .IP "$vnodedef_additive" 5
  1059. Specifies whether MoM considers a vnode that appeared previously
  1060. either in the inventory or in a vnode definition file, but that does
  1061. not appear now, to be in her list of vnodes.
  1062. .br
  1063. When
  1064. .I $vnodedef_additive
  1065. is True, MoM treats missing vnodes as if they
  1066. are still present, and continues to report them as if they are
  1067. present. This means that the server does not mark missing vnodes as
  1068. .I stale.
  1069. .br
  1070. When
  1071. .I $vnodedef_additive
  1072. is False, MoM does not list missing vnodes,
  1073. the server's information is brought up to date with the inventory and
  1074. vnode definition files, and the server marks missing vnodes as
  1075. .I stale.
  1076. .br
  1077. Visible in configuration file on Cray only.
  1078. .br
  1079. Format: Boolean
  1080. .br
  1081. Default for MoM on Cray login node: False
  1082. .IP "$wallmult <factor>" 5
  1083. Each job's walltime usage is multiplied by
  1084. .I factor.
  1085. For example:
  1086. .RS 9
  1087. $wallmult 1.5
  1088. .RE
  1089. .IP
  1090. .RE
  1091. .B Cray-only Initialization Values
  1092. .br
  1093. .IP "pbs_accounting_workload_mgmt <value>" 5
  1094. Controls whether CSA accounting is enabled. Name does not start
  1095. with dollar sign. If set to "1", "on", or "true",
  1096. CSA accounting is enabled. If set to "0", "off", or
  1097. "false", accounting is disabled. Default: "true"; enabled.
  1098. .RE
  1099. .B HPE SGI-only Initialization Values
  1100. .br
  1101. .IP "cpuset_create_flags <flags>" 5
  1102. Lists the flags for when MoM does a cpusetCreate(3) for each job.
  1103. .I flags
  1104. is an or-ed list of flags.
  1105. The flags are:
  1106. .RS
  1107. .IP "HPE SGI formerly known as ICE, with Performance Software - Message Passing Interface" 5
  1108. CPUSET_CPU_EXCLUSIVE
  1109. .br
  1110. 0
  1111. .br
  1112. Default: 0
  1113. .LP
  1114. .RE
  1115. .IP "cpuset_destroy_delay <delay>" 5
  1116. MoM waits up to
  1117. .I delay
  1118. seconds before destroying a cpuset of
  1119. a just-completed job, but not longer than necessary.
  1120. This gives the operating system more time to clean up leftover processes
  1121. after they have been killed.
  1122. Example:
  1123. .RS 9
  1124. cpuset_destroy_delay 10
  1125. .RE
  1126. .IP
  1127. Default for HPE SGI machines: 0
  1128. .br
  1129. Format: Integer
  1130. .IP "memreserved <megabytes>" 5
  1131. .B Deprecated.
  1132. The amount of per-vnode memory reserved for system overhead.
  1133. This much memory is deducted from the value of
  1134. .I resources_available.mem
  1135. for each vnode managed by this MoM.
  1136. .br
  1137. Example:
  1138. .RS 9
  1139. memreserved 16
  1140. .RE
  1141. .IP
  1142. Default: 0MB.
  1143. .br
  1144. .RE
  1145. .B Static Resources
  1146. .br
  1147. Static resources local to the vnode are described
  1148. one resource to a line,
  1149. with a name and value separated by white space.
  1150. For example, tape drives of different types could be specified by:
  1151. .RS 15
  1152. .nf
  1153. .B tape3480 \ \ 4
  1154. .B tape3420 \ \ 2
  1155. .B tapedat \ \ \ \ 1
  1156. .B tape8mm \ \ \ \ 1
  1157. .fi
  1158. .RE
  1159. .RE
  1160. .SH FILES AND DIRECTORIES
  1161. .IP $PBS_HOME/mom_priv 10
  1162. Default directory for configuration files.
  1163. .IP $PBS_HOME/mom_priv/config 10
  1164. MoM's default configuration file.
  1165. .IP $PBS_HOME/mom_logs 10
  1166. Default directory for log files written by MoM.
  1167. .IP $PBS_HOME/mom_priv/prologue 10
  1168. File containing administrative script to be run before job execution.
  1169. .IP $PBS_HOME/mom_priv/epilogue 10
  1170. File containing administrative script to be run after job execution.
  1171. .SH SIGNAL HANDLING
  1172. .B pbs_mom
  1173. handles the following signals:
  1174. .IP SIGHUP 10
  1175. The
  1176. .B pbs_mom
  1177. daemon rereads its configuration files, closes and reopens the log
  1178. file, and reinitializes resource structures.
  1179. .IP SIGALRM 10
  1180. MoM writes a log file entry. See the
  1181. .I -a alarm_timeout
  1182. option.
  1183. .IP SIGINT 10
  1184. The
  1185. .B pbs_mom
  1186. daemon exits, leaving all running jobs still running.
  1187. See the
  1188. .I -p
  1189. option.
  1190. .IP SIGKILL 10
  1191. This signal is not caught. The
  1192. .B pbs_mom
  1193. daemon exits immediately.
  1194. .IP "SIGTERM, SIGXCPU, SIGXFSZ, SIGCPULIM, SIGSHUTDN" 10
  1195. The
  1196. .B pbs_mom
  1197. daemon terminates all running children and exits.
  1198. .IP "SIGPIPE, SIGUSR1, SIGUSR2, SIGINFO" 10
  1199. These are ignored.
  1200. .LP
  1201. All other signals have their default behavior installed.
  1202. .SH EXIT STATUS
  1203. .IP "Greater than zero" 5
  1204. If the
  1205. .B pbs_mom
  1206. daemon fails to start
  1207. .br
  1208. If the
  1209. .I -s insert
  1210. option is used with an existing
  1211. .I scriptname
  1212. .br
  1213. If the administrator attempts to add a script whose name
  1214. begins with "PBS"
  1215. .br
  1216. If the administrator attempts to use the
  1217. .I -s remove
  1218. option on a nonexistent configuration file, or on a configuration
  1219. file whose name begins with "PBS"
  1220. .br
  1221. If the administrator attempts to use the
  1222. .I -s show
  1223. option on a nonexistent script
  1224. .SH SEE ALSO
  1225. The
  1226. .B PBS Professional Administrator's Guide,
  1227. pbs_server(8B),
  1228. pbs_sched(8B),
  1229. qstat(1B)