pbs_snaputils.py 62 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621
  1. # coding: utf-8
  2. # Copyright (C) 1994-2018 Altair Engineering, Inc.
  3. # For more information, contact Altair at www.altair.com.
  4. #
  5. # This file is part of the PBS Professional ("PBS Pro") software.
  6. #
  7. # Open Source License Information:
  8. #
  9. # PBS Pro is free software. You can redistribute it and/or modify it under the
  10. # terms of the GNU Affero General Public License as published by the Free
  11. # Software Foundation, either version 3 of the License, or (at your option) any
  12. # later version.
  13. #
  14. # PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
  15. # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  16. # FOR A PARTICULAR PURPOSE.
  17. # See the GNU Affero General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Affero General Public License
  20. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. #
  22. # Commercial License Information:
  23. #
  24. # For a copy of the commercial license terms and conditions,
  25. # go to: (http://www.pbspro.com/UserArea/agreement.html)
  26. # or contact the Altair Legal Department.
  27. #
  28. # Altair’s dual-license business model allows companies, individuals, and
  29. # organizations to create proprietary derivative works of PBS Pro and
  30. # distribute them - whether embedded or bundled with other software -
  31. # under a commercial license agreement.
  32. #
  33. # Use of Altair’s trademarks, including but not limited to "PBS™",
  34. # "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
  35. # trademark licensing policies.
  36. import os
  37. import time
  38. import tarfile
  39. import logging
  40. import socket
  41. from subprocess import STDOUT
  42. from ptl.lib.pbs_testlib import Server, Scheduler, SCHED
  43. from ptl.lib.pbs_ifl_mock import *
  44. from ptl.utils.pbs_dshutils import DshUtils
  45. from ptl.utils.pbs_logutils import PBSLogUtils
  46. from ptl.utils.pbs_anonutils import PBSAnonymizer
  47. # Define an enum which is used to label various pieces of information
  48. ( # qstat outputs
  49. QSTAT_B_OUT,
  50. QSTAT_BF_OUT,
  51. QSTAT_OUT,
  52. QSTAT_F_OUT,
  53. QSTAT_T_OUT,
  54. QSTAT_TF_OUT,
  55. QSTAT_X_OUT,
  56. QSTAT_XF_OUT,
  57. QSTAT_NS_OUT,
  58. QSTAT_FX_DSV_OUT,
  59. QSTAT_F_DSV_OUT,
  60. QSTAT_F_JSON_OUT,
  61. QSTAT_Q_OUT,
  62. QSTAT_QF_OUT,
  63. # qmgr outputs
  64. QMGR_PS_OUT,
  65. QMGR_PH_OUT,
  66. QMGR_LPBSHOOK_OUT,
  67. QMGR_LSCHED_OUT,
  68. QMGR_PN_OUT,
  69. QMGR_PR_OUT,
  70. # pbsnodes outputs
  71. PBSNODES_VA_OUT,
  72. PBSNODES_A_OUT,
  73. PBSNODES_AVSJ_OUT,
  74. PBSNODES_ASJ_OUT,
  75. PBSNODES_AVS_OUT,
  76. PBSNODES_AS_OUT,
  77. PBSNODES_AFDSV_OUT,
  78. PBSNODES_AVFDSV_OUT,
  79. PBSNODES_AVFJSON_OUT,
  80. # pbs_rstat outputs
  81. PBS_RSTAT_OUT,
  82. PBS_RSTAT_F_OUT,
  83. # PBS config related outputs
  84. PBS_CONF,
  85. PBS_PROBE_OUT,
  86. PBS_HOSTN_OUT,
  87. PBS_ENVIRONMENT,
  88. # System related outputs
  89. OS_INFO,
  90. PROCESS_INFO,
  91. LSOF_PBS_OUT,
  92. ETC_HOSTS,
  93. ETC_NSSWITCH_CONF,
  94. VMSTAT_OUT,
  95. DF_H_OUT,
  96. DMESG_OUT,
  97. PS_LEAF_OUT,
  98. # Logs
  99. ACCT_LOGS,
  100. SVR_LOGS,
  101. SCHED_LOGS,
  102. MOM_LOGS,
  103. PG_LOGS,
  104. COMM_LOGS,
  105. # Daemon priv directories
  106. SVR_PRIV,
  107. MOM_PRIV,
  108. SCHED_PRIV,
  109. # Core file information
  110. CORE_SCHED,
  111. CORE_SERVER,
  112. CORE_MOM,
  113. # Miscellaneous
  114. CTIME) = range(57)
  115. # Define paths to various files/directories with respect to the snapshot
  116. # server/
  117. SERVER_DIR = "server"
  118. QSTAT_B_PATH = os.path.join(SERVER_DIR, "qstat_B.out")
  119. QSTAT_BF_PATH = os.path.join(SERVER_DIR, "qstat_Bf.out")
  120. QMGR_PS_PATH = os.path.join(SERVER_DIR, "qmgr_ps.out")
  121. QSTAT_Q_PATH = os.path.join(SERVER_DIR, "qstat_Q.out")
  122. QSTAT_QF_PATH = os.path.join(SERVER_DIR, "qstat_Qf.out")
  123. QMGR_PR_PATH = os.path.join(SERVER_DIR, "qmgr_pr.out")
  124. # server_priv/
  125. SVR_PRIV_PATH = "server_priv"
  126. ACCT_LOGS_PATH = os.path.join("server_priv", "accounting")
  127. # server_logs/
  128. SVR_LOGS_PATH = "server_logs"
  129. # job/
  130. JOB_DIR = "job"
  131. QSTAT_PATH = os.path.join(JOB_DIR, "qstat.out")
  132. QSTAT_F_PATH = os.path.join(JOB_DIR, "qstat_f.out")
  133. QSTAT_T_PATH = os.path.join(JOB_DIR, "qstat_t.out")
  134. QSTAT_TF_PATH = os.path.join(JOB_DIR, "qstat_tf.out")
  135. QSTAT_X_PATH = os.path.join(JOB_DIR, "qstat_x.out")
  136. QSTAT_XF_PATH = os.path.join(JOB_DIR, "qstat_xf.out")
  137. QSTAT_NS_PATH = os.path.join(JOB_DIR, "qstat_ns.out")
  138. QSTAT_FX_DSV_PATH = os.path.join(JOB_DIR, "qstat_fx_F_dsv.out")
  139. QSTAT_F_DSV_PATH = os.path.join(JOB_DIR, "qstat_f_F_dsv.out")
  140. QSTAT_F_JSON_PATH = os.path.join(JOB_DIR, "qstat_f_F_json.out")
  141. # node/
  142. NODE_DIR = "node"
  143. PBSNODES_VA_PATH = os.path.join(NODE_DIR, "pbsnodes_va.out")
  144. PBSNODES_A_PATH = os.path.join(NODE_DIR, "pbsnodes_a.out")
  145. PBSNODES_AVSJ_PATH = os.path.join(NODE_DIR, "pbsnodes_avSj.out")
  146. PBSNODES_ASJ_PATH = os.path.join(NODE_DIR, "pbsnodes_aSj.out")
  147. PBSNODES_AVS_PATH = os.path.join(NODE_DIR, "pbsnodes_avS.out")
  148. PBSNODES_AS_PATH = os.path.join(NODE_DIR, "pbsnodes_aS.out")
  149. PBSNODES_AFDSV_PATH = os.path.join(NODE_DIR, "pbsnodes_aFdsv.out")
  150. PBSNODES_AVFDSV_PATH = os.path.join(NODE_DIR, "pbsnodes_avFdsv.out")
  151. PBSNODES_AVFJSON_PATH = os.path.join(NODE_DIR, "pbsnodes_avFjson.out")
  152. QMGR_PN_PATH = os.path.join(NODE_DIR, "qmgr_pn_default.out")
  153. # mom_priv/
  154. MOM_PRIV_PATH = "mom_priv"
  155. # mom_logs/
  156. MOM_LOGS_PATH = "mom_logs"
  157. # comm_logs/
  158. COMM_LOGS_PATH = "comm_logs"
  159. # hook/
  160. HOOK_DIR = "hook"
  161. QMGR_PH_PATH = os.path.join(HOOK_DIR, "qmgr_ph_default.out")
  162. QMGR_LPBSHOOK_PATH = os.path.join(HOOK_DIR, "qmgr_lpbshook.out")
  163. # scheduler/
  164. SCHED_DIR = "scheduler"
  165. QMGR_LSCHED_PATH = os.path.join(SCHED_DIR, "qmgr_lsched.out")
  166. # sched_priv/
  167. DFLT_SCHED_PRIV_PATH = "sched_priv"
  168. # sched_logs/
  169. DFLT_SCHED_LOGS_PATH = "sched_logs"
  170. # reservation/
  171. RESV_DIR = "reservation"
  172. PBS_RSTAT_PATH = os.path.join(RESV_DIR, "pbs_rstat.out")
  173. PBS_RSTAT_F_PATH = os.path.join(RESV_DIR, "pbs_rstat_f.out")
  174. # datastore/
  175. DATASTORE_DIR = "datastore"
  176. PG_LOGS_PATH = os.path.join(DATASTORE_DIR, "pg_log")
  177. # core_file_bt/
  178. CORE_DIR = "core_file_bt"
  179. CORE_SERVER_PATH = os.path.join(CORE_DIR, "server_priv")
  180. CORE_SCHED_PATH = os.path.join(CORE_DIR, "sched_priv")
  181. CORE_MOM_PATH = os.path.join(CORE_DIR, "mom_priv")
  182. # system/
  183. SYS_DIR = "system"
  184. PBS_PROBE_PATH = os.path.join(SYS_DIR, "pbs_probe_v.out")
  185. PBS_HOSTN_PATH = os.path.join(SYS_DIR, "pbs_hostn_v.out")
  186. PBS_ENV_PATH = os.path.join(SYS_DIR, "pbs_environment")
  187. OS_PATH = os.path.join(SYS_DIR, "os_info")
  188. PROCESS_PATH = os.path.join(SYS_DIR, "process_info")
  189. ETC_HOSTS_PATH = os.path.join(SYS_DIR, "etc_hosts")
  190. ETC_NSSWITCH_PATH = os.path.join(SYS_DIR, "etc_nsswitch_conf")
  191. LSOF_PBS_PATH = os.path.join(SYS_DIR, "lsof_pbs.out")
  192. VMSTAT_PATH = os.path.join(SYS_DIR, "vmstat.out")
  193. DF_H_PATH = os.path.join(SYS_DIR, "df_h.out")
  194. DMESG_PATH = os.path.join(SYS_DIR, "dmesg.out")
  195. PS_LEAF_PATH = os.path.join(SYS_DIR, "ps_leaf.out")
  196. # top-level
  197. PBS_CONF_PATH = "pbs.conf"
  198. CTIME_PATH = "ctime"
  199. # Define paths to PBS commands used to capture data with respect to PBS_EXEC
  200. QSTAT_CMD = os.path.join("bin", "qstat")
  201. PBSNODES_CMD = os.path.join("bin", "pbsnodes")
  202. QMGR_CMD = os.path.join("bin", "qmgr")
  203. PBS_RSTAT_CMD = os.path.join("bin", "pbs_rstat")
  204. PBS_PROBE_CMD = os.path.join("sbin", "pbs_probe")
  205. PBS_HOSTN_CMD = os.path.join("bin", "pbs_hostn")
  206. # A global list of files which contain data in tabular form
  207. FILE_TABULAR = ["qstat.out", "qstat_t.out", "qstat_x.out", "qstat_ns.out",
  208. "pbsnodes_aS.out", "pbsnodes_aSj.out", "pbsnodes_avS.out",
  209. "pbsnodes_avSj.out", "qstat_Q.out", "qstat_B.out",
  210. "pbs_rstat.out"]
  211. class PBSSnapUtils(object):
  212. """
  213. Wrapper class around _PBSSnapUtils
  214. This makes sure that we do necessay cleanup before destroying objects
  215. """
  216. def __init__(self, out_dir, primary_host=None, acct_logs=None,
  217. daemon_logs=None, map_file=None, anonymize=None,
  218. create_tar=False, log_path=None, with_sudo=False):
  219. self.out_dir = out_dir
  220. self.primary_host = primary_host
  221. self.acct_logs = acct_logs
  222. self.srvc_logs = daemon_logs
  223. self.map_file = map_file
  224. self.anonymize = anonymize
  225. self.create_tar = create_tar
  226. self.log_path = log_path
  227. self.with_sudo = with_sudo
  228. self.utils_obj = None
  229. def __enter__(self):
  230. self.utils_obj = _PBSSnapUtils(self.out_dir, self.primary_host,
  231. self.acct_logs, self.srvc_logs,
  232. self.map_file, self.anonymize,
  233. self.create_tar, self.log_path,
  234. self.with_sudo)
  235. return self.utils_obj
  236. def __exit__(self, exc_type, exc_value, traceback):
  237. # Do some cleanup
  238. self.utils_obj.finalize()
  239. return False
  240. class _PBSSnapUtils(object):
  241. """
  242. PBS snapshot utilities
  243. """
  244. def __init__(self, out_dir, primary_host=None, acct_logs=None,
  245. daemon_logs=None, map_file=None, anonymize=False,
  246. create_tar=False, log_path=None, with_sudo=False):
  247. """
  248. Initialize a PBSSnapUtils object with the arguments specified
  249. :param out_dir: path to the directory where snapshot will be created
  250. :type out_dir: str
  251. :param primary_host: Name of the primary host to capture
  252. :type primary_host: str or None
  253. :param acct_logs: number of accounting logs to capture
  254. :type acct_logs: int or None
  255. :param daemon_logs: number of daemon logs to capture
  256. :type daemon_logs: int or None
  257. :param map_file: Path to map file for anonymization map
  258. :type map_file str or None
  259. :param anonymize: anonymize data?
  260. :type anonymize: bool
  261. :param create_tar: Create a tarball of the output snapshot?
  262. :type create_tar: bool or None
  263. :param log_path: Path to pbs_snapshot's log file
  264. :type log_path: str or None
  265. :param with_sudo: Capture relevant information with sudo?
  266. :type with_sudo: bool
  267. """
  268. self.logger = logging.getLogger(__name__)
  269. self.du = DshUtils()
  270. self.server_info = {}
  271. self.job_info = {}
  272. self.node_info = {}
  273. self.comm_info = {}
  274. self.hook_info = {}
  275. self.sched_info = {}
  276. self.resv_info = {}
  277. self.sys_info = {}
  278. self.core_info = {}
  279. self.anon_obj = None
  280. self.all_hosts = []
  281. self.server = None
  282. self.mom = None
  283. self.comm = None
  284. self.scheduler = None
  285. self.log_utils = PBSLogUtils()
  286. self.outtar_path = None
  287. self.outtar_fd = None
  288. self.create_tar = create_tar
  289. self.snapshot_name = None
  290. self.with_sudo = with_sudo
  291. self.log_path = log_path
  292. self.server_up = False
  293. self.server_info_avail = False
  294. self.mom_info_avail = False
  295. self.comm_info_avail = False
  296. self.sched_info_avail = False
  297. if self.log_path is not None:
  298. self.log_filename = os.path.basename(self.log_path)
  299. else:
  300. self.log_filename = None
  301. # finalize() is called by the context's __exit__() automatically
  302. # however, finalize() is non-reenterant, so set a flag to keep
  303. # track of whether it has been called or not.
  304. self.finalized = False
  305. # Parse the input arguments
  306. timestamp_str = time.strftime("%Y%m%d_%H_%M_%S")
  307. self.snapshot_name = "snapshot_" + timestamp_str
  308. # Make sure that the target directory exists
  309. dir_path = os.path.abspath(out_dir)
  310. if not os.path.isdir(dir_path):
  311. raise ValueError("Target directory either doesn't exist" +
  312. "or not accessible. Quitting.")
  313. self.snapdir = os.path.join(dir_path, self.snapshot_name)
  314. self.num_acct_logs = int(acct_logs) if acct_logs is not None else 0
  315. if daemon_logs is not None:
  316. self.num_daemon_logs = int(daemon_logs)
  317. else:
  318. self.num_daemon_logs = 0
  319. self.mapfile = map_file
  320. if primary_host is None:
  321. primary_host = socket.gethostname()
  322. # Check which of the PBS daemons' information is available
  323. self.server = Server(primary_host)
  324. self.scheduler = Scheduler(server=self.server)
  325. daemon_status = self.server.pi.status()
  326. if len(daemon_status) > 0 and daemon_status['rc'] == 0 and \
  327. len(daemon_status['err']) == 0:
  328. for d_stat in daemon_status['out']:
  329. if d_stat.startswith("pbs_server"):
  330. self.server_info_avail = True
  331. if "not running" not in d_stat:
  332. self.server_up = True
  333. elif d_stat.startswith("pbs_sched"):
  334. self.sched_info_avail = True
  335. elif d_stat.startswith("pbs_mom"):
  336. self.mom_info_avail = True
  337. elif d_stat.startswith("pbs_comm"):
  338. self.comm_info_avail = True
  339. self.custom_rscs = None
  340. if self.server_up:
  341. self.custom_rscs = self.server.parse_resources()
  342. # Store paths to PBS_HOME and PBS_EXEC
  343. self.pbs_home = self.server.pbs_conf["PBS_HOME"]
  344. self.pbs_exec = self.server.pbs_conf["PBS_EXEC"]
  345. # Add self.primary_host to the list of hosts
  346. self.primary_host = self.server.hostname
  347. # If output needs to be a tarball, create the tarfile name
  348. # tarfile name = <output directory name>.tgz
  349. self.outtar_path = self.snapdir + ".tgz"
  350. # Set up some infrastructure
  351. self.__init_cmd_path_map()
  352. # Create the snapshot directory tree
  353. self.__initialize_snapshot()
  354. # Create a PBSAnonymizer object
  355. self.anonymize = anonymize
  356. if self.anonymize:
  357. del_attrs = [ATTR_v, ATTR_e, ATTR_mailfrom, ATTR_m, ATTR_name,
  358. ATTR_jobdir, ATTR_submit_arguments, ATTR_o, ATTR_S]
  359. obf_attrs = [ATTR_euser, ATTR_egroup, ATTR_project, ATTR_A,
  360. ATTR_operators, ATTR_managers, ATTR_g, ATTR_M,
  361. ATTR_u, ATTR_SvrHost, ATTR_aclgroup, ATTR_acluser,
  362. ATTR_aclResvgroup, ATTR_aclResvuser, ATTR_SchedHost,
  363. ATTR_aclResvhost, ATTR_aclhost, ATTR_owner,
  364. ATTR_exechost, ATTR_NODE_Host, ATTR_NODE_Mom,
  365. ATTR_rescavail + ".host", ATTR_rescavail + ".vnode",
  366. ATTR_auth_u, ATTR_auth_g, ATTR_resv_owner]
  367. obf_rsc_attrs = []
  368. if self.custom_rscs is not None:
  369. for rsc in self.custom_rscs.keys():
  370. obf_rsc_attrs.append(rsc)
  371. self.anon_obj = PBSAnonymizer(attr_delete=del_attrs,
  372. attr_val=obf_attrs,
  373. resc_key=obf_rsc_attrs)
  374. def __init_cmd_path_map(self):
  375. """
  376. Fill in various dicts which map the commands used for capturing
  377. various classes of outputs along with the paths to the files where
  378. they will be stored inside the snapshot as a tuple.
  379. """
  380. if self.server_up:
  381. # Server information
  382. value = (QSTAT_B_PATH, [QSTAT_CMD, "-B"])
  383. self.server_info[QSTAT_B_OUT] = value
  384. value = (QSTAT_BF_PATH, [QSTAT_CMD, "-Bf"])
  385. self.server_info[QSTAT_BF_OUT] = value
  386. value = (QMGR_PS_PATH, [QMGR_CMD, "-c", "p s"])
  387. self.server_info[QMGR_PS_OUT] = value
  388. value = (QSTAT_Q_PATH, [QSTAT_CMD, "-Q"])
  389. self.server_info[QSTAT_Q_OUT] = value
  390. value = (QSTAT_QF_PATH, [QSTAT_CMD, "-Qf"])
  391. self.server_info[QSTAT_QF_OUT] = value
  392. value = (QMGR_PR_PATH, [QMGR_CMD, "-c", "p r"])
  393. self.server_info[QMGR_PR_OUT] = value
  394. # Job information
  395. value = (QSTAT_PATH, [QSTAT_CMD])
  396. self.job_info[QSTAT_OUT] = value
  397. value = (QSTAT_F_PATH, [QSTAT_CMD, "-f"])
  398. self.job_info[QSTAT_F_OUT] = value
  399. value = (QSTAT_T_PATH, [QSTAT_CMD, "-t"])
  400. self.job_info[QSTAT_T_OUT] = value
  401. value = (QSTAT_TF_PATH, [QSTAT_CMD, "-tf"])
  402. self.job_info[QSTAT_TF_OUT] = value
  403. value = (QSTAT_X_PATH, [QSTAT_CMD, "-x"])
  404. self.job_info[QSTAT_X_OUT] = value
  405. value = (QSTAT_XF_PATH, [QSTAT_CMD, "-xf"])
  406. self.job_info[QSTAT_XF_OUT] = value
  407. value = (QSTAT_NS_PATH, [QSTAT_CMD, "-ns"])
  408. self.job_info[QSTAT_NS_OUT] = value
  409. value = (QSTAT_FX_DSV_PATH, [QSTAT_CMD, "-fx", "-F", "dsv"])
  410. self.job_info[QSTAT_FX_DSV_OUT] = value
  411. value = (QSTAT_F_DSV_PATH, [QSTAT_CMD, "-f", "-F", "dsv"])
  412. self.job_info[QSTAT_F_DSV_OUT] = value
  413. value = (QSTAT_F_JSON_PATH, [QSTAT_CMD, "-f", "-F", "json"])
  414. self.job_info[QSTAT_F_JSON_OUT] = value
  415. # Node information
  416. value = (PBSNODES_VA_PATH, [PBSNODES_CMD, "-va"])
  417. self.node_info[PBSNODES_VA_OUT] = value
  418. value = (PBSNODES_A_PATH, [PBSNODES_CMD, "-a"])
  419. self.node_info[PBSNODES_A_OUT] = value
  420. value = (PBSNODES_AVSJ_PATH, [PBSNODES_CMD, "-avSj"])
  421. self.node_info[PBSNODES_AVSJ_OUT] = value
  422. value = (PBSNODES_ASJ_PATH, [PBSNODES_CMD, "-aSj"])
  423. self.node_info[PBSNODES_ASJ_OUT] = value
  424. value = (PBSNODES_AVS_PATH, [PBSNODES_CMD, "-avS"])
  425. self.node_info[PBSNODES_AVS_OUT] = value
  426. value = (PBSNODES_AS_PATH, [PBSNODES_CMD, "-aS"])
  427. self.node_info[PBSNODES_AS_OUT] = value
  428. value = (PBSNODES_AFDSV_PATH, [PBSNODES_CMD, "-aFdsv"])
  429. self.node_info[PBSNODES_AFDSV_OUT] = value
  430. value = (PBSNODES_AVFDSV_PATH, [PBSNODES_CMD, "-avFdsv"])
  431. self.node_info[PBSNODES_AVFDSV_OUT] = value
  432. value = (PBSNODES_AVFJSON_PATH, [PBSNODES_CMD, "-avFjson"])
  433. self.node_info[PBSNODES_AVFJSON_OUT] = value
  434. value = (QMGR_PN_PATH, [QMGR_CMD, "-c", "p n @default"])
  435. self.node_info[QMGR_PN_OUT] = value
  436. # Hook information
  437. value = (QMGR_PH_PATH, [QMGR_CMD, "-c", "p h @default"])
  438. self.hook_info[QMGR_PH_OUT] = value
  439. value = (QMGR_LPBSHOOK_PATH, [QMGR_CMD, "-c", "l pbshook"])
  440. self.hook_info[QMGR_LPBSHOOK_OUT] = value
  441. # Reservation information
  442. value = (PBS_RSTAT_PATH, [PBS_RSTAT_CMD])
  443. self.resv_info[PBS_RSTAT_OUT] = value
  444. value = (PBS_RSTAT_F_PATH, [PBS_RSTAT_CMD, "-f"])
  445. self.resv_info[PBS_RSTAT_F_OUT] = value
  446. # Scheduler information
  447. value = (QMGR_LSCHED_PATH, [QMGR_CMD, "-c", "l sched"])
  448. self.sched_info[QMGR_LSCHED_OUT] = value
  449. if self.server_info_avail:
  450. # Server priv and logs
  451. value = (SVR_PRIV_PATH, None)
  452. self.server_info[SVR_PRIV] = value
  453. value = (SVR_LOGS_PATH, None)
  454. self.server_info[SVR_LOGS] = value
  455. value = (ACCT_LOGS_PATH, None)
  456. self.server_info[ACCT_LOGS] = value
  457. # Core file information
  458. value = (CORE_SERVER_PATH, None)
  459. self.core_info[CORE_SERVER] = value
  460. if self.mom_info_avail:
  461. # Mom priv and logs
  462. value = (MOM_PRIV_PATH, None)
  463. self.node_info[MOM_PRIV] = value
  464. value = (MOM_LOGS_PATH, None)
  465. self.node_info[MOM_LOGS] = value
  466. # Core file information
  467. value = (CORE_MOM_PATH, None)
  468. self.core_info[CORE_MOM] = value
  469. if self.comm_info_avail:
  470. # Comm information
  471. value = (COMM_LOGS_PATH, None)
  472. self.comm_info[COMM_LOGS] = value
  473. if self.sched_info_avail:
  474. # Scheduler logs and priv
  475. value = (DFLT_SCHED_PRIV_PATH, None)
  476. self.sched_info[SCHED_PRIV] = value
  477. value = (DFLT_SCHED_LOGS_PATH, None)
  478. self.sched_info[SCHED_LOGS] = value
  479. # Core file information
  480. value = (CORE_SCHED_PATH, None)
  481. self.core_info[CORE_SCHED] = value
  482. # System information
  483. value = (PBS_PROBE_PATH, [PBS_PROBE_CMD, "-v"])
  484. self.sys_info[PBS_PROBE_OUT] = value
  485. # We'll append hostname to this later (see capture_system_info)
  486. value = (PBS_HOSTN_PATH, [PBS_HOSTN_CMD, "-v"])
  487. self.sys_info[PBS_HOSTN_OUT] = value
  488. value = (PBS_ENV_PATH, None)
  489. self.sys_info[PBS_ENVIRONMENT] = value
  490. value = (OS_PATH, None)
  491. self.sys_info[OS_INFO] = value
  492. value = (PROCESS_PATH, ["ps", "aux", "|", "grep", "[p]bs"])
  493. self.sys_info[PROCESS_INFO] = value
  494. value = (ETC_HOSTS_PATH,
  495. ["cat", os.path.join(os.sep, "etc", "hosts")])
  496. self.sys_info[ETC_HOSTS] = value
  497. value = (ETC_NSSWITCH_PATH,
  498. ["cat", os.path.join(os.sep, "etc", "nsswitch.conf")])
  499. self.sys_info[ETC_NSSWITCH_CONF] = value
  500. value = (LSOF_PBS_PATH, ["lsof", "|", "grep", "[p]bs"])
  501. self.sys_info[LSOF_PBS_OUT] = value
  502. value = (VMSTAT_PATH, ["vmstat"])
  503. self.sys_info[VMSTAT_OUT] = value
  504. value = (DF_H_PATH, ["df", "-h"])
  505. self.sys_info[DF_H_OUT] = value
  506. value = (DMESG_PATH, ["dmesg"])
  507. self.sys_info[DMESG_OUT] = value
  508. value = (PS_LEAF_PATH, ["ps", "-leaf"])
  509. self.sys_info[PS_LEAF_OUT] = value
  510. def __initialize_snapshot(self):
  511. """
  512. Create a snapshot directory along with the directory structure
  513. Also create a tarfile and add the snapshot dir if create_tar is True
  514. """
  515. os.mkdir(self.snapdir)
  516. if self.create_tar:
  517. self.outtar_fd = tarfile.open(self.outtar_path, "w:gz")
  518. dirs_in_snapshot = [SYS_DIR, CORE_DIR]
  519. if self.server_up:
  520. dirs_in_snapshot.extend([SERVER_DIR, JOB_DIR, HOOK_DIR, RESV_DIR,
  521. NODE_DIR, SCHED_DIR])
  522. if self.server_info_avail:
  523. dirs_in_snapshot.extend([SVR_PRIV_PATH, SVR_LOGS_PATH,
  524. ACCT_LOGS_PATH, DATASTORE_DIR])
  525. if self.mom_info_avail:
  526. dirs_in_snapshot.extend([MOM_PRIV_PATH, MOM_LOGS_PATH])
  527. if self.comm_info_avail:
  528. dirs_in_snapshot.append(COMM_LOGS_PATH)
  529. if self.sched_info_avail:
  530. dirs_in_snapshot.extend([DFLT_SCHED_LOGS_PATH,
  531. DFLT_SCHED_PRIV_PATH])
  532. for item in dirs_in_snapshot:
  533. rel_path = os.path.join(self.snapdir, item)
  534. os.makedirs(rel_path, 0755)
  535. def __capture_cmd_output(self, out_path, cmd, skip_anon=False,
  536. as_script=False, ret_out=False, sudo=False):
  537. """
  538. Run a command and capture its output
  539. :param out_path: path of the output file for this command
  540. :type out_path: str
  541. :param cmd: The command to execute
  542. :type cmd: list
  543. :param skip_anon: Skip anonymization even though anonymize is True?
  544. :type skip_anon: bool
  545. :param as_script: Passed to run_cmd()
  546. :type as_Script: bool
  547. :param ret_out: Return output of the command?
  548. :type ret_out: bool
  549. """
  550. retstr = None
  551. with open(out_path, "a+") as out_fd:
  552. try:
  553. self.du.run_cmd(cmd=cmd, stdout=out_fd,
  554. sudo=sudo, as_script=as_script)
  555. if ret_out:
  556. out_fd.seek(0, 0)
  557. retstr = out_fd.read()
  558. except OSError as e:
  559. # This usually happens when the command is not found
  560. # Just log and return
  561. self.logger.error(str(e))
  562. return
  563. if self.anonymize and not skip_anon:
  564. self.__anonymize_file(out_path)
  565. if self.create_tar:
  566. self.__add_to_archive(out_path)
  567. if ret_out:
  568. return retstr
  569. @staticmethod
  570. def __convert_flag_to_numeric(flag):
  571. """
  572. Convert a resource's flag attribute to its numeric equivalent
  573. :param flag: the resource flag to convert
  574. :type flag: string
  575. :returns: numeric value of the resource flag
  576. """
  577. ATR_DFLAG_USRD = 0x01
  578. ATR_DFLAG_USWR = 0x02
  579. ATR_DFLAG_OPRD = 0x04
  580. ATR_DFLAG_OPWR = 0x08
  581. ATR_DFLAG_MGRD = 0x10
  582. ATR_DFLAG_MGWR = 0x20
  583. ATR_DFLAG_RASSN = 0x4000
  584. ATR_DFLAG_ANASSN = 0x8000
  585. ATR_DFLAG_FNASSN = 0x10000
  586. ATR_DFLAG_CVTSLT = 0x20000
  587. NO_USER_SET = (ATR_DFLAG_USRD | ATR_DFLAG_OPRD | ATR_DFLAG_MGRD |
  588. ATR_DFLAG_OPWR | ATR_DFLAG_MGWR)
  589. READ_WRITE = (ATR_DFLAG_USRD | ATR_DFLAG_OPRD | ATR_DFLAG_MGRD |
  590. ATR_DFLAG_USWR | ATR_DFLAG_OPWR | ATR_DFLAG_MGWR)
  591. resc_flag = READ_WRITE
  592. if "q" in flag:
  593. resc_flag |= ATR_DFLAG_RASSN
  594. if "f" in flag:
  595. resc_flag |= ATR_DFLAG_FNASSN
  596. if "n" in flag:
  597. resc_flag |= ATR_DFLAG_ANASSN
  598. if "h" in flag:
  599. resc_flag |= ATR_DFLAG_CVTSLT
  600. if "r" in flag:
  601. resc_flag &= ~READ_WRITE
  602. resc_flag |= NO_USER_SET
  603. if "i" in flag:
  604. resc_flag &= ~READ_WRITE
  605. resc_flag |= (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
  606. ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)
  607. return resc_flag
  608. @staticmethod
  609. def __convert_type_to_numeric(attr_type):
  610. """
  611. Convert a resource's type attribute to its numeric equivalent
  612. :param attr_type: the type to convert
  613. :type attr_type: string
  614. :returns: Numeric equivalent of attr_type
  615. """
  616. PBS_ATTR_TYPE_TO_INT = {
  617. "long": 1,
  618. "string": 3,
  619. "string_array": 4,
  620. "size": 5,
  621. "boolean": 11,
  622. "float": 14,
  623. }
  624. return PBS_ATTR_TYPE_TO_INT[attr_type.strip()]
  625. def __capture_trace_from_core(self, core_file_name, exec_path, out_path):
  626. """
  627. Capture stack strace from the core file specified
  628. :param core_file_name: name of the core file
  629. :type core_file_name: str
  630. :param exec_path: path to the executable which generated the core
  631. :type exec_path: str
  632. :param out_path: ofile to print the trace out to
  633. :type out_path: str
  634. """
  635. self.logger.info("capturing stack trace from core file " +
  636. core_file_name)
  637. # Create a gdb-python script to capture backtrace from core
  638. gdb_python = """
  639. import gdb
  640. gdb.execute("file %s")
  641. gdb.execute("core %s")
  642. o = gdb.execute("thread apply all bt", to_string=True)
  643. print(o)
  644. gdb.execute("quit")
  645. quit()
  646. """ % (exec_path, core_file_name)
  647. # Remove tabs from triple quoted strings
  648. gdb_python = gdb_python.replace("\t", "")
  649. # Write the gdb-python script in a temporary file
  650. fn = self.du.create_temp_file(body=gdb_python)
  651. # Catch the stack trace using gdb
  652. gdb_cmd = ["gdb", "-P", fn]
  653. with open(out_path, "w") as outfd:
  654. self.du.run_cmd(cmd=gdb_cmd, stdout=outfd, stderr=STDOUT,
  655. sudo=self.with_sudo)
  656. # Remove the temp file
  657. os.remove(fn)
  658. if self.create_tar:
  659. self.__add_to_archive(out_path)
  660. def __capture_logs(self, pbs_logdir, snap_logdir, num_days_logs,
  661. sudo=False):
  662. """
  663. Capture specific logs for the days mentioned
  664. :param pbs_logdir: path to the PBS logs directory (source)
  665. :type pbs_logdir: str
  666. :param snap_logdir: path to the snapshot logs directory (destination)
  667. :type snap_logdir: str
  668. :param num_days_logs: Number of days of logs to capture
  669. :type num_days_logs: int
  670. :param sudo: copy logs with sudo?
  671. :type sudo: bool
  672. """
  673. if num_days_logs < 1:
  674. self.logger.debug("Number of days of logs < 1, skipping")
  675. return
  676. end_time = self.server.ctime
  677. start_time = end_time - ((num_days_logs - 1) * 24 * 60 * 60)
  678. # Get the list of log file names to capture
  679. pbs_logfiles = self.log_utils.get_log_files(self.primary_host,
  680. path=pbs_logdir,
  681. start=start_time,
  682. end=end_time, sudo=sudo)
  683. if len(pbs_logfiles) == 0:
  684. self.logger.debug(pbs_logdir + "not found/accessible")
  685. return
  686. self.logger.debug("Capturing " + str(num_days_logs) +
  687. " days of logs from " + pbs_logdir)
  688. # Make sure that the target log dir exists
  689. if not os.path.isdir(snap_logdir):
  690. os.makedirs(snap_logdir)
  691. # Go over the list and copy over each log file
  692. for pbs_logfile in pbs_logfiles:
  693. snap_logfile = os.path.join(snap_logdir,
  694. os.path.basename(pbs_logfile))
  695. pbs_logfile = pbs_logfile
  696. self.du.run_copy(src=pbs_logfile, dest=snap_logfile,
  697. recursive=False,
  698. preserve_permission=False,
  699. sudo=sudo)
  700. if sudo:
  701. # Copying files with sudo makes root the owner, set it to the
  702. # current user
  703. self.du.chown(path=snap_logfile, uid=os.getuid(),
  704. gid=os.getgid(), sudo=self.with_sudo)
  705. # Anonymize accounting logs
  706. if self.anonymize and "accounting" in snap_logdir:
  707. anon = self.anon_obj.anonymize_accounting_log(snap_logfile)
  708. if anon is not None:
  709. anon_fd = open(snap_logfile, "w")
  710. anon_fd.write("\n".join(anon))
  711. anon_fd.close()
  712. if self.create_tar:
  713. self.__add_to_archive(snap_logfile)
  714. def __evaluate_core_file(self, file_path, core_dir):
  715. """
  716. Check whether the specified file is a core dump
  717. If yes, capture its stack trace and store it
  718. :param file_path: path to the file
  719. :type file_path: str
  720. :param core_dir: path to directory to store core information
  721. :type core_dir: str
  722. :returns: True if this was a valid core file, otherwise False
  723. """
  724. if not self.du.isfile(path=file_path, sudo=self.with_sudo):
  725. self.logger.debug("Could not find file path " + str(file_path))
  726. return False
  727. # Get the header of this file
  728. ret = self.du.run_cmd(cmd=["file", file_path], sudo=self.with_sudo)
  729. if ret['err'] is not None and len(ret['err']) != 0:
  730. self.logger.error(
  731. "\'file\' command failed with error: " + ret['err'] +
  732. " on file: " + str(file_path))
  733. return False
  734. file_header = ret["out"][0]
  735. if "core file" not in file_header:
  736. return False
  737. # Identify the program which created this core file
  738. header_list = file_header.split()
  739. if "from" not in header_list:
  740. return False
  741. exec_index = header_list.index("from") + 1
  742. exec_name = header_list[exec_index].replace("\'", "")
  743. exec_name = exec_name.replace(",", "")
  744. # Capture the stack trace from this core file
  745. filename = os.path.basename(file_path)
  746. core_dest = os.path.join(core_dir, filename)
  747. if not os.path.isdir(core_dir):
  748. os.makedirs(core_dir, 0755)
  749. self.__capture_trace_from_core(file_path, exec_name,
  750. core_dest)
  751. # Delete the core file itself
  752. if os.path.isfile(file_path):
  753. os.remove(file_path)
  754. return True
  755. def __anonymize_file(self, file_path):
  756. """
  757. Anonymize/obfuscate a file to remove sensitive information
  758. :param file_path: path to the file to anonymize
  759. :type file_path: str
  760. """
  761. if not self.anonymize or self.anon_obj is None:
  762. return
  763. self.logger.debug("Anonymizing " + file_path)
  764. # Anonymizing a file requires editing it, so make sure the user owns it
  765. self.du.chown(path=file_path, uid=os.getuid(),
  766. gid=os.getgid(), sudo=self.with_sudo)
  767. file_name = os.path.basename(file_path)
  768. if file_name == "sched_config":
  769. self.anon_obj.anonymize_sched_config(self.scheduler)
  770. self.scheduler.apply_config(path=file_path, validate=False)
  771. elif file_name == "resource_group":
  772. anon = self.anon_obj.anonymize_resource_group(file_path)
  773. if anon is not None:
  774. with open(file_path, "w") as rgfd:
  775. rgfd.write("\n".join(anon))
  776. elif file_name in FILE_TABULAR:
  777. self.anon_obj.anonymize_file_tabular(file_path, inplace=True)
  778. else:
  779. self.anon_obj.anonymize_file_kv(file_path, inplace=True)
  780. def __copy_dir_with_core(self, src_path, dest_path, core_dir,
  781. except_list=None, only_core=False, sudo=False):
  782. """
  783. Copy over a directory recursively which might have core files
  784. When a core file is found, capture the stack trace from it
  785. :param src_path: path of the source directory
  786. :type src_path: str
  787. :param dest_path: path of the destination directory
  788. :type dest_path: str
  789. :param core_dir: path to the directory to store core files' trace
  790. :type core_dir: str
  791. :param except_list: list of files/directories (basenames) to exclude
  792. :type except_list: list
  793. :param only_core: Copy over only core files?
  794. :type only_core: bool
  795. :param sudo: Copy with sudo?
  796. :type sudo: bool
  797. """
  798. if except_list is None:
  799. except_list = []
  800. # This can happen when -o is a path that we are capturing
  801. # Just return success
  802. if os.path.basename(src_path) == self.snapshot_name:
  803. self.logger.debug("src_path %s seems to be snapshot directory,"
  804. "ignoring" % src_path)
  805. return
  806. dir_list = self.du.listdir(path=src_path, fullpath=False,
  807. sudo=sudo)
  808. if dir_list is None:
  809. self.logger.info("Can't find/access " + src_path)
  810. return
  811. # Go over the list and copy over everything
  812. # If we find a core file, we'll store backtrace from it inside
  813. # core_file_bt
  814. for item in dir_list:
  815. if item in except_list:
  816. continue
  817. item_src_path = os.path.join(src_path, item)
  818. if not only_core:
  819. item_dest_path = os.path.join(dest_path, item)
  820. else:
  821. item_dest_path = core_dir
  822. # We can't directly use 'recursive' argument of run_copy
  823. # to copy the entire directory tree as we need to take care
  824. # of the 'except_list'. So, we recursively explore the whole
  825. # tree and copy over files individually.
  826. if self.du.isdir(path=item_src_path, sudo=sudo):
  827. # Make sure that the directory exists in the snapshot
  828. if not self.du.isdir(path=item_dest_path):
  829. # Create the directory
  830. os.makedirs(item_dest_path, 0755)
  831. # Recursive call to copy contents of the directory
  832. self.__copy_dir_with_core(item_src_path, item_dest_path,
  833. core_dir, except_list, only_core,
  834. sudo=sudo)
  835. else:
  836. # Copy the file over
  837. item_src_path = item_src_path
  838. try:
  839. self.du.run_copy(src=item_src_path, dest=item_dest_path,
  840. recursive=False,
  841. preserve_permission=False,
  842. level=logging.DEBUG, sudo=sudo)
  843. if sudo:
  844. # Copying files with sudo makes root the owner,
  845. # set it to the current user
  846. self.du.chown(path=item_dest_path, uid=os.getuid(),
  847. gid=os.getgid(), sudo=self.with_sudo)
  848. except OSError:
  849. self.logger.error("Could not copy %s" % item_src_path)
  850. continue
  851. # Check if this is a core file
  852. # If it is then this method will capture its stack trace
  853. is_core = self.__evaluate_core_file(item_dest_path, core_dir)
  854. # If it was a core file, then it's already been captured
  855. if is_core:
  856. continue
  857. # If only_core is True and this was not a core file, then we
  858. # should delete it
  859. if only_core:
  860. os.remove(item_dest_path)
  861. else:
  862. # This was not a core file, and 'only_core' is not True
  863. # So, we need to capture & anonymize this file
  864. self.__anonymize_file(item_dest_path)
  865. if self.create_tar:
  866. self.__add_to_archive(item_dest_path)
  867. def __capture_mom_priv(self):
  868. """
  869. Capture mom_priv information
  870. """
  871. pbs_home = self.pbs_home
  872. pbs_mom_priv = os.path.join(pbs_home, "mom_priv")
  873. snap_mom_priv = os.path.join(self.snapdir, MOM_PRIV_PATH)
  874. core_dir = os.path.join(self.snapdir, CORE_MOM_PATH)
  875. self.__copy_dir_with_core(pbs_mom_priv, snap_mom_priv, core_dir,
  876. sudo=self.with_sudo)
  877. def __add_to_archive(self, dest_path, src_path=None):
  878. """
  879. Add a file to the output tarball and delete the original file
  880. :param dest_path: path to the file inside the target tarball
  881. :type dest_path: str
  882. :param src_path: path to the file to add, if different than dest_path
  883. :type src_path: str
  884. """
  885. if src_path is None:
  886. src_path = dest_path
  887. self.logger.debug("Adding " + src_path + " to tarball " +
  888. self.outtar_path)
  889. # Add file to tar
  890. dest_relpath = os.path.relpath(dest_path, self.snapdir)
  891. path_in_tar = os.path.join(self.snapshot_name, dest_relpath)
  892. try:
  893. self.outtar_fd.add(src_path, arcname=path_in_tar)
  894. # Remove original file
  895. os.remove(src_path)
  896. except OSError:
  897. self.logger.error(
  898. "File %s could not be added to tarball" % (src_path))
  899. def __capture_svr_logs(self):
  900. """
  901. Capture server logs
  902. """
  903. pbs_logdir = os.path.join(self.pbs_home, "server_logs")
  904. snap_logdir = os.path.join(self.snapdir, SVR_LOGS_PATH)
  905. self.__capture_logs(pbs_logdir, snap_logdir, self.num_daemon_logs)
  906. def __capture_acct_logs(self):
  907. """
  908. Capture accounting logs
  909. """
  910. pbs_logdir = os.path.join(self.pbs_home, "server_priv", "accounting")
  911. snap_logdir = os.path.join(self.snapdir, ACCT_LOGS_PATH)
  912. self.__capture_logs(pbs_logdir, snap_logdir, self.num_acct_logs,
  913. sudo=self.with_sudo)
  914. def __capture_sched_logs(self, pbs_logdir, snap_logdir):
  915. """
  916. Capture scheduler logs
  917. """
  918. self.__capture_logs(pbs_logdir, snap_logdir, self.num_daemon_logs)
  919. def __capture_mom_logs(self):
  920. """
  921. Capture mom logs
  922. """
  923. pbs_home = self.pbs_home
  924. pbs_logdir = os.path.join(pbs_home, "mom_logs")
  925. snap_logdir = os.path.join(self.snapdir, MOM_LOGS_PATH)
  926. self.__capture_logs(pbs_logdir, snap_logdir, self.num_daemon_logs)
  927. def __capture_comm_logs(self):
  928. """
  929. Capture pbs_comm logs
  930. """
  931. pbs_home = self.pbs_home
  932. pbs_logdir = os.path.join(pbs_home, "comm_logs")
  933. snap_logdir = os.path.join(self.snapdir, COMM_LOGS_PATH)
  934. self.__capture_logs(pbs_logdir, snap_logdir, self.num_daemon_logs)
  935. def capture_server(self, with_svr_logs=False, with_acct_logs=False):
  936. """
  937. Capture PBS server specific information
  938. :param with_svr_logs: capture server logs as well?
  939. :type with_svr_logs: bool
  940. :param with_acct_logs: capture accounting logs as well?
  941. :type with_acct_logs: bool
  942. :returns: name of the output directory/tarfile containing the snapshot
  943. """
  944. self.logger.info("capturing server information")
  945. if self.server_up:
  946. # Go through 'server_info' and capture info that depends on
  947. # commands
  948. for (path, cmd_list) in self.server_info.values():
  949. if cmd_list is None:
  950. continue
  951. cmd_list_cpy = list(cmd_list)
  952. # Add the path to PBS_EXEC to the command path
  953. # The command path is the first entry in command list
  954. cmd_list_cpy[0] = os.path.join(self.pbs_exec, cmd_list[0])
  955. snap_path = os.path.join(self.snapdir, path)
  956. self.__capture_cmd_output(snap_path, cmd_list_cpy,
  957. sudo=self.with_sudo)
  958. if self.server_info_avail:
  959. # Copy over 'server_priv', everything except accounting logs
  960. snap_server_priv = os.path.join(self.snapdir, SVR_PRIV_PATH)
  961. pbs_server_priv = os.path.join(self.pbs_home, "server_priv")
  962. core_dir = os.path.join(self.snapdir, CORE_SERVER_PATH)
  963. exclude_list = ["accounting"]
  964. self.__copy_dir_with_core(pbs_server_priv,
  965. snap_server_priv, core_dir, exclude_list,
  966. sudo=self.with_sudo)
  967. if with_svr_logs and self.num_daemon_logs > 0:
  968. # Capture server logs
  969. self.__capture_svr_logs()
  970. if with_acct_logs and self.num_acct_logs > 0:
  971. # Capture accounting logs
  972. self.__capture_acct_logs()
  973. if self.create_tar:
  974. return self.outtar_path
  975. else:
  976. return self.snapdir
  977. def capture_jobs(self):
  978. """
  979. Capture information related to jobs
  980. :returns: name of the output directory/tarfile containing the snapshot
  981. """
  982. self.logger.info("capturing jobs information")
  983. if self.server_up:
  984. # Go through 'job_info' and capture info that depends on commands
  985. for (path, cmd_list) in self.job_info.values():
  986. cmd_list_cpy = list(cmd_list)
  987. # Add the path to PBS_EXEC to the command path
  988. # The command path is the first entry in command list
  989. cmd_list_cpy[0] = os.path.join(self.pbs_exec, cmd_list[0])
  990. snap_path = os.path.join(self.snapdir, path)
  991. self.__capture_cmd_output(snap_path, cmd_list_cpy,
  992. sudo=self.with_sudo)
  993. if self.create_tar:
  994. return self.outtar_path
  995. else:
  996. return self.snapdir
  997. def capture_nodes(self, with_mom_logs=False):
  998. """
  999. Capture information related to nodes & mom along with mom logs
  1000. :param with_mom_logs: Capture mom logs?
  1001. :type with_mom_logs: bool
  1002. :returns: name of the output directory/tarfile containing the snapshot
  1003. """
  1004. self.logger.info("capturing nodes & mom information")
  1005. if self.server_up:
  1006. # Go through 'node_info' and capture info that depends on commands
  1007. for (path, cmd_list) in self.node_info.values():
  1008. if cmd_list is None:
  1009. continue
  1010. cmd_list_cpy = list(cmd_list)
  1011. # Add the path to PBS_EXEC to the command path
  1012. # The command path is the first entry in command list
  1013. cmd_list_cpy[0] = os.path.join(self.pbs_exec, cmd_list[0])
  1014. snap_path = os.path.join(self.snapdir, path)
  1015. self.__capture_cmd_output(snap_path, cmd_list_cpy,
  1016. sudo=self.with_sudo)
  1017. # Collect mom logs and priv
  1018. if self.mom_info_avail:
  1019. # Capture mom_priv info
  1020. self.__capture_mom_priv()
  1021. if with_mom_logs and self.num_daemon_logs > 0:
  1022. # Capture mom_logs
  1023. self.__capture_mom_logs()
  1024. if self.create_tar:
  1025. return self.outtar_path
  1026. else:
  1027. return self.snapdir
  1028. def capture_comms(self, with_comm_logs=False):
  1029. """
  1030. Capture Comm related information
  1031. :returns: name of the output directory/tarfile containing the snapshot
  1032. """
  1033. self.logger.info("capturing comm information")
  1034. # Capture comm logs
  1035. if self.comm_info_avail:
  1036. if self.num_daemon_logs > 0 and with_comm_logs:
  1037. self.__capture_comm_logs()
  1038. # If not already capturing server information, copy over server_priv
  1039. # as pbs_comm runs out of it
  1040. if not self.server_info_avail:
  1041. pbs_server_priv = os.path.join(self.pbs_home, "server_priv")
  1042. snap_server_priv = os.path.join(self.snapdir, SVR_PRIV_PATH)
  1043. core_dir = os.path.join(self.snapdir, CORE_SERVER_PATH)
  1044. exclude_list = ["accounting"]
  1045. self.__copy_dir_with_core(pbs_server_priv,
  1046. snap_server_priv, core_dir, exclude_list,
  1047. sudo=self.with_sudo)
  1048. if self.create_tar:
  1049. return self.outtar_path
  1050. else:
  1051. return self.snapdir
  1052. def capture_scheduler(self, with_sched_logs=False):
  1053. """
  1054. Capture information related to the scheduler
  1055. :param with_sched_logs: Capture scheduler logs?
  1056. :type with_sched_logs: bool
  1057. :returns: name of the output directory/tarfile containing the snapshot
  1058. """
  1059. self.logger.info("capturing scheduler information")
  1060. qmgr_lsched = None
  1061. if self.server_up:
  1062. # Go through 'sched_info' and capture info that depends on commands
  1063. for (path, cmd_list) in self.sched_info.values():
  1064. if cmd_list is None:
  1065. continue
  1066. cmd_list_cpy = list(cmd_list)
  1067. # Add the path to PBS_EXEC to the command path
  1068. # The command path is the first entry in command list
  1069. cmd_list_cpy[0] = os.path.join(self.pbs_exec, cmd_list[0])
  1070. snap_path = os.path.join(self.snapdir, path)
  1071. if "l sched" in cmd_list_cpy:
  1072. qmgr_lsched = self.__capture_cmd_output(snap_path,
  1073. cmd_list_cpy,
  1074. ret_out=True)
  1075. else:
  1076. self.__capture_cmd_output(snap_path, cmd_list_cpy,
  1077. sudo=self.with_sudo)
  1078. # Capture sched_priv & sched_logs for all schedulers
  1079. if qmgr_lsched is not None and self.sched_info_avail:
  1080. sched_details = {}
  1081. sched_name = None
  1082. for line in qmgr_lsched.splitlines():
  1083. if line.startswith("Sched "):
  1084. sched_name = line.split("Sched ")[1]
  1085. sched_name = "".join(sched_name.split())
  1086. sched_details[sched_name] = {}
  1087. continue
  1088. if sched_name is not None:
  1089. line = "".join(line.split())
  1090. if line.startswith("sched_priv="):
  1091. sched_details[sched_name]["sched_priv"] = \
  1092. line.split("=")[1]
  1093. elif line.startswith("sched_log="):
  1094. sched_details[sched_name]["sched_log"] = \
  1095. line.split("=")[1]
  1096. for sched_name in sched_details:
  1097. # Capture sched_priv for the scheduler
  1098. if len(sched_details) == 1: # For pre-multisched outputs
  1099. pbs_sched_priv = os.path.join(self.pbs_home, "sched_priv")
  1100. else:
  1101. pbs_sched_priv = sched_details[sched_name]["sched_priv"]
  1102. if sched_name == "default" or len(sched_details) == 1:
  1103. snap_sched_priv = os.path.join(self.snapdir,
  1104. DFLT_SCHED_PRIV_PATH)
  1105. core_dir = os.path.join(self.snapdir, CORE_SCHED_PATH)
  1106. else:
  1107. dirname = DFLT_SCHED_PRIV_PATH + "_" + sched_name
  1108. coredirname = CORE_SCHED_PATH + "_" + sched_name
  1109. snap_sched_priv = os.path.join(self.snapdir, dirname)
  1110. os.makedirs(snap_sched_priv, 0755)
  1111. core_dir = os.path.join(self.snapdir, coredirname)
  1112. self.__copy_dir_with_core(pbs_sched_priv,
  1113. snap_sched_priv, core_dir,
  1114. sudo=self.with_sudo)
  1115. if with_sched_logs and self.num_daemon_logs > 0:
  1116. # Capture scheduler logs
  1117. if len(sched_details) == 1: # For pre-multisched outputs
  1118. pbs_sched_log = os.path.join(self.pbs_home,
  1119. "sched_logs")
  1120. else:
  1121. pbs_sched_log = sched_details[sched_name]["sched_log"]
  1122. if sched_name == "default" or len(sched_details) == 1:
  1123. snap_sched_log = os.path.join(self.snapdir,
  1124. DFLT_SCHED_LOGS_PATH)
  1125. else:
  1126. dirname = DFLT_SCHED_LOGS_PATH + "_" + sched_name
  1127. snap_sched_log = os.path.join(self.snapdir, dirname)
  1128. os.makedirs(snap_sched_log, 0755)
  1129. self.__capture_sched_logs(pbs_sched_log, snap_sched_log)
  1130. elif self.sched_info_avail:
  1131. # We don't know about other multi-scheds,
  1132. # but can still capture the default sched's logs & priv
  1133. pbs_sched_priv = os.path.join(self.pbs_home, "sched_priv")
  1134. snap_sched_priv = os.path.join(self.snapdir,
  1135. DFLT_SCHED_PRIV_PATH)
  1136. core_dir = os.path.join(self.snapdir, CORE_SCHED_PATH)
  1137. self.__copy_dir_with_core(pbs_sched_priv,
  1138. snap_sched_priv, core_dir,
  1139. sudo=self.with_sudo)
  1140. if with_sched_logs and self.num_daemon_logs > 0:
  1141. pbs_sched_log = os.path.join(self.pbs_home,
  1142. "sched_logs")
  1143. snap_sched_log = os.path.join(self.snapdir,
  1144. DFLT_SCHED_LOGS_PATH)
  1145. self.__capture_sched_logs(pbs_sched_log, snap_sched_log)
  1146. if self.create_tar:
  1147. return self.outtar_path
  1148. else:
  1149. return self.snapdir
  1150. def capture_hooks(self):
  1151. """
  1152. Capture information related to hooks
  1153. :returns: name of the output directory/tarfile containing the snapshot
  1154. """
  1155. self.logger.info("capturing hooks information")
  1156. # Go through 'hook_info' and capture info that depends on commands
  1157. for (path, cmd_list) in self.hook_info.values():
  1158. if cmd_list is None:
  1159. continue
  1160. cmd_list_cpy = list(cmd_list)
  1161. # Add the path to PBS_EXEC to the command path
  1162. # The command path is the first entry in command list
  1163. cmd_list_cpy[0] = os.path.join(self.pbs_exec, cmd_list[0])
  1164. snap_path = os.path.join(self.snapdir, path)
  1165. self.__capture_cmd_output(snap_path, cmd_list_cpy,
  1166. sudo=self.with_sudo)
  1167. if self.create_tar:
  1168. return self.outtar_path
  1169. else:
  1170. return self.snapdir
  1171. def capture_reservations(self):
  1172. """
  1173. Capture information related to reservations
  1174. :returns: name of the output directory/tarfile containing the snapshot
  1175. """
  1176. self.logger.info("capturing reservations information")
  1177. # Go through 'resv_info' and capture info that depends on commands
  1178. for (path, cmd_list) in self.resv_info.values():
  1179. if cmd_list is None:
  1180. continue
  1181. cmd_list_cpy = list(cmd_list)
  1182. # Add the path to PBS_EXEC to the command path
  1183. # The command path is the first entry in command list
  1184. cmd_list_cpy[0] = os.path.join(self.pbs_exec, cmd_list[0])
  1185. snap_path = os.path.join(self.snapdir, path)
  1186. self.__capture_cmd_output(snap_path, cmd_list_cpy,
  1187. sudo=self.with_sudo)
  1188. if self.create_tar:
  1189. return self.outtar_path
  1190. else:
  1191. return self.snapdir
  1192. def capture_datastore(self, with_db_logs=False):
  1193. """
  1194. Capture information related to datastore
  1195. :returns: name of the output directory/tarfile containing the snapshot
  1196. """
  1197. self.logger.info("capturing datastore information")
  1198. if with_db_logs and self.num_daemon_logs > 0:
  1199. # Capture database logs
  1200. pbs_logdir = os.path.join(self.pbs_home, PG_LOGS_PATH)
  1201. snap_logdir = os.path.join(self.snapdir, PG_LOGS_PATH)
  1202. self.__capture_logs(pbs_logdir, snap_logdir, self.num_daemon_logs,
  1203. sudo=self.with_sudo)
  1204. if self.create_tar:
  1205. return self.outtar_path
  1206. else:
  1207. return self.snapdir
  1208. def capture_pbs_conf(self):
  1209. """
  1210. Capture pbs.conf file
  1211. :returns: name of the output directory/tarfile containing the snapshot
  1212. """
  1213. # Capture pbs.conf
  1214. self.logger.info("capturing pbs.conf")
  1215. snap_confpath = os.path.join(self.snapdir, PBS_CONF_PATH)
  1216. with open(snap_confpath, "w") as fd:
  1217. for k, v in self.server.pbs_conf.items():
  1218. fd.write(k + "=" + str(v) + "\n")
  1219. if self.create_tar:
  1220. self.__add_to_archive(snap_confpath)
  1221. return self.outtar_path
  1222. else:
  1223. return self.snapdir
  1224. def capture_system_info(self):
  1225. """
  1226. Capture system related information
  1227. :returns: name of the output directory/tarfile containing the snapshot
  1228. """
  1229. self.logger.info("capturing system information")
  1230. sudo_cmds = [PBS_PROBE_OUT, LSOF_PBS_OUT, DMESG_OUT]
  1231. as_script_cmds = [PROCESS_INFO, LSOF_PBS_OUT]
  1232. pbs_cmds = [PBS_PROBE_OUT, PBS_HOSTN_OUT]
  1233. sudo = False
  1234. host_platform = self.du.get_platform(self.primary_host)
  1235. win_platform = False
  1236. if host_platform.startswith("win"):
  1237. win_platform = True
  1238. # Capture information that's dependent on commands
  1239. for (key, values) in self.sys_info.iteritems():
  1240. (path, cmd_list) = values
  1241. if cmd_list is None:
  1242. continue
  1243. # For Windows, only capture PBS commands
  1244. if win_platform and (key not in pbs_cmds):
  1245. continue
  1246. cmd_list_cpy = list(cmd_list)
  1247. # Find the full path to the command on the host
  1248. if key in pbs_cmds:
  1249. cmd_full = os.path.join(self.pbs_exec, cmd_list_cpy[0])
  1250. else:
  1251. cmd_full = self.du.which(self.primary_host, cmd_list_cpy[0])
  1252. # du.which() returns the name of the command passed if
  1253. # it can't find the command
  1254. if cmd_full is cmd_list_cpy[0]:
  1255. continue
  1256. cmd_list_cpy[0] = cmd_full
  1257. # Handle special commands
  1258. if "pbs_hostn" in cmd_list_cpy[0]:
  1259. # Append hostname to the command list
  1260. cmd_list_cpy.append(self.primary_host)
  1261. if key in as_script_cmds:
  1262. as_script = True
  1263. if key in sudo_cmds and self.with_sudo:
  1264. # Because this cmd needs to be run in a script,
  1265. # PTL run_cmd's sudo will try to run the script
  1266. # itself with sudo, not the cmd
  1267. # So, append sudo as a prefix to the cmd instead
  1268. cmd_list_cpy[0] = "sudo " + cmd_list_cpy[0]
  1269. else:
  1270. as_script = False
  1271. if key in sudo_cmds:
  1272. sudo = self.with_sudo
  1273. snap_path = os.path.join(self.snapdir, path)
  1274. self.__capture_cmd_output(snap_path, cmd_list_cpy,
  1275. skip_anon=True, as_script=as_script,
  1276. sudo=sudo)
  1277. # Capture platform dependent information
  1278. if win_platform:
  1279. # Capture process information using tasklist command
  1280. cmd = ["tasklist", ["/v"]]
  1281. snap_path = PROCESS_PATH
  1282. self.__capture_cmd_output(snap_path, cmd,
  1283. sudo=self.with_sudo)
  1284. # Capture OS/platform information
  1285. self.logger.info("capturing OS information")
  1286. snap_ospath = os.path.join(self.snapdir, OS_PATH)
  1287. with open(snap_ospath, "w") as osfd:
  1288. osinfo = self.du.get_os_info(self.primary_host)
  1289. osfd.write(osinfo)
  1290. if self.create_tar:
  1291. self.__add_to_archive(snap_ospath)
  1292. # Capture pbs_environment
  1293. self.logger.info("capturing pbs_environment")
  1294. snap_envpath = os.path.join(self.snapdir, PBS_ENV_PATH)
  1295. if self.server.pbs_env is not None:
  1296. with open(snap_envpath, "w") as envfd:
  1297. for k, v in self.server.pbs_env.iteritems():
  1298. envfd.write(k + "=" + v + "\n")
  1299. if self.create_tar:
  1300. self.__add_to_archive(snap_envpath)
  1301. if self.create_tar:
  1302. return self.outtar_path
  1303. else:
  1304. return self.snapdir
  1305. def capture_pbs_logs(self):
  1306. """
  1307. Capture PBSPro logs from all relevant hosts
  1308. :returns: name of the output directory/tarfile containing the snapshot
  1309. """
  1310. self.logger.info("capturing PBSPro logs")
  1311. if self.num_daemon_logs > 0:
  1312. # Capture server logs
  1313. if self.server_info_avail:
  1314. self.__capture_svr_logs()
  1315. # Capture sched logs for all schedulers
  1316. if self.sched_info_avail:
  1317. if self.server_up:
  1318. sched_info = self.server.status(SCHED)
  1319. for sched in sched_info:
  1320. sched_name = sched["id"]
  1321. pbs_sched_log = sched["sched_log"]
  1322. if sched_name != "default":
  1323. snap_sched_log = DFLT_SCHED_LOGS_PATH + \
  1324. "_" + sched["id"]
  1325. else:
  1326. snap_sched_log = DFLT_SCHED_LOGS_PATH
  1327. snap_sched_log = os.path.join(self.snapdir,
  1328. snap_sched_log)
  1329. self.__capture_sched_logs(pbs_sched_log,
  1330. snap_sched_log)
  1331. else:
  1332. # Capture the default sched's logs
  1333. pbs_sched_log = os.path.join(self.pbs_home,
  1334. "sched_logs")
  1335. snap_sched_log = os.path.join(self.snapdir,
  1336. DFLT_SCHED_LOGS_PATH)
  1337. self.__capture_sched_logs(pbs_sched_log, snap_sched_log)
  1338. # Capture mom & comm logs
  1339. if self.mom_info_avail:
  1340. self.__capture_mom_logs()
  1341. if self.comm_info_avail:
  1342. self.__capture_comm_logs()
  1343. if self.num_acct_logs > 0:
  1344. # Capture accounting logs
  1345. self.__capture_acct_logs()
  1346. if self.create_tar:
  1347. return self.outtar_path
  1348. else:
  1349. return self.snapdir
  1350. def capture_all(self):
  1351. """
  1352. Capture a snapshot from the PBS system
  1353. :returns: name of the output directory/tarfile containing the snapshot
  1354. """
  1355. # Capture Server related information
  1356. self.capture_server(with_svr_logs=True, with_acct_logs=True)
  1357. # Capture scheduler information
  1358. self.capture_scheduler(with_sched_logs=True)
  1359. # Capture jobs related information
  1360. self.capture_jobs()
  1361. # Capture nodes relateed information
  1362. self.capture_nodes(with_mom_logs=True)
  1363. # Capture comm related information
  1364. self.capture_comms(with_comm_logs=True)
  1365. # Capture hooks related information
  1366. self.capture_hooks()
  1367. # Capture reservations related information
  1368. self.capture_reservations()
  1369. # Capture datastore related information
  1370. self.capture_datastore(with_db_logs=True)
  1371. # Capture pbs.conf
  1372. self.capture_pbs_conf()
  1373. # Capture system related information
  1374. self.capture_system_info()
  1375. if self.create_tar:
  1376. return self.outtar_path
  1377. else:
  1378. return self.snapdir
  1379. def finalize(self):
  1380. """
  1381. Capture some common information and perform cleanup
  1382. """
  1383. if self.finalized:
  1384. # This function is non-reenterant
  1385. # So just return if it's already been called once
  1386. self.logger.debug("finalize() already called once, skipping it.")
  1387. return
  1388. self.finalized = True
  1389. if self.anonymize:
  1390. # Print out number of bad accounting records:
  1391. if self.anon_obj.num_bad_acct_records > 0:
  1392. self.logger.error("Number of bad accounting records found: " +
  1393. str(self.anon_obj.num_bad_acct_records))
  1394. if self.mapfile is not None:
  1395. # Print out obfuscation map
  1396. try:
  1397. with open(self.mapfile, "w") as mapfd:
  1398. mapfd.write(str(self.anon_obj))
  1399. except Exception:
  1400. self.logger.error("Error writing out the map file " +
  1401. self.mapfile)
  1402. # Record timestamp of the snapshot
  1403. snap_ctimepath = os.path.join(self.snapdir, CTIME_PATH)
  1404. with open(snap_ctimepath, "w") as ctimefd:
  1405. ctimefd.write(str(self.server.ctime) + "\n")
  1406. if self.create_tar:
  1407. self.__add_to_archive(snap_ctimepath)
  1408. # If the caller was pbs_snapshot, add its log file to the tarball
  1409. if self.create_tar and self.log_path is not None:
  1410. snap_logpath = os.path.join(self.snapdir, self.log_filename)
  1411. self.__add_to_archive(snap_logpath, self.log_path)
  1412. # Cleanup
  1413. if self.create_tar:
  1414. # Close the output tarfile
  1415. self.outtar_fd.close()
  1416. # Remove the snapshot directory
  1417. self.du.rm(path=self.snapdir, recursive=True, force=True)