- # coding: utf-8
- # Copyright (C) 1994-2018 Altair Engineering, Inc.
- # For more information, contact Altair at www.altair.com.
- #
- # This file is part of the PBS Professional ("PBS Pro") software.
- #
- # Open Source License Information:
- #
- # PBS Pro is free software. You can redistribute it and/or modify it under the
- # terms of the GNU Affero General Public License as published by the Free
- # Software Foundation, either version 3 of the License, or (at your option) any
- # later version.
- #
- # PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
- # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- # FOR A PARTICULAR PURPOSE.
- # See the GNU Affero General Public License for more details.
- #
- # You should have received a copy of the GNU Affero General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- #
- # Commercial License Information:
- #
- # For a copy of the commercial license terms and conditions,
- # go to: (http://www.pbspro.com/UserArea/agreement.html)
- # or contact the Altair Legal Department.
- #
- # Altair’s dual-license business model allows companies, individuals, and
- # organizations to create proprietary derivative works of PBS Pro and
- # distribute them - whether embedded or bundled with other software -
- # under a commercial license agreement.
- #
- # Use of Altair’s trademarks, including but not limited to "PBS™",
- # "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
- # trademark licensing policies.
- import os
- import time
- import tarfile
- import logging
- import socket
- from subprocess import STDOUT
- from ptl.lib.pbs_testlib import Server, Scheduler, SCHED
- from ptl.lib.pbs_ifl_mock import *
- from ptl.utils.pbs_dshutils import DshUtils
- from ptl.utils.pbs_logutils import PBSLogUtils
- from ptl.utils.pbs_anonutils import PBSAnonymizer
- # Define an enum which is used to label various pieces of information
- ( # qstat outputs
- QSTAT_B_OUT,
- QSTAT_BF_OUT,
- QSTAT_OUT,
- QSTAT_F_OUT,
- QSTAT_T_OUT,
- QSTAT_TF_OUT,
- QSTAT_X_OUT,
- QSTAT_XF_OUT,
- QSTAT_NS_OUT,
- QSTAT_FX_DSV_OUT,
- QSTAT_F_DSV_OUT,
- QSTAT_F_JSON_OUT,
- QSTAT_Q_OUT,
- QSTAT_QF_OUT,
- # qmgr outputs
- QMGR_PS_OUT,
- QMGR_PH_OUT,
- QMGR_LPBSHOOK_OUT,
- QMGR_LSCHED_OUT,
- QMGR_PN_OUT,
- QMGR_PR_OUT,
- # pbsnodes outputs
- PBSNODES_VA_OUT,
- PBSNODES_A_OUT,
- PBSNODES_AVSJ_OUT,
- PBSNODES_ASJ_OUT,
- PBSNODES_AVS_OUT,
- PBSNODES_AS_OUT,
- PBSNODES_AFDSV_OUT,
- PBSNODES_AVFDSV_OUT,
- PBSNODES_AVFJSON_OUT,
- # pbs_rstat outputs
- PBS_RSTAT_OUT,
- PBS_RSTAT_F_OUT,
- # PBS config related outputs
- PBS_CONF,
- PBS_PROBE_OUT,
- PBS_HOSTN_OUT,
- PBS_ENVIRONMENT,
- # System related outputs
- OS_INFO,
- PROCESS_INFO,
- LSOF_PBS_OUT,
- ETC_HOSTS,
- ETC_NSSWITCH_CONF,
- VMSTAT_OUT,
- DF_H_OUT,
- DMESG_OUT,
- PS_LEAF_OUT,
- # Logs
- ACCT_LOGS,
- SVR_LOGS,
- SCHED_LOGS,
- MOM_LOGS,
- PG_LOGS,
- COMM_LOGS,
- # Daemon priv directories
- SVR_PRIV,
- MOM_PRIV,
- SCHED_PRIV,
- # Core file information
- CORE_SCHED,
- CORE_SERVER,
- CORE_MOM,
- # Miscellaneous
- CTIME) = range(57)
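- # Example (illustrative): each label above later becomes a key in one of the
- # *_info dicts, mapping to a (path inside the snapshot, capture command)
- # tuple, e.g. server_info[QSTAT_B_OUT] = (QSTAT_B_PATH, [QSTAT_CMD, "-B"])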
- # Define paths to various files/directories with respect to the snapshot
- # server/
- SERVER_DIR = "server"
- QSTAT_B_PATH = os.path.join(SERVER_DIR, "qstat_B.out")
- QSTAT_BF_PATH = os.path.join(SERVER_DIR, "qstat_Bf.out")
- QMGR_PS_PATH = os.path.join(SERVER_DIR, "qmgr_ps.out")
- QSTAT_Q_PATH = os.path.join(SERVER_DIR, "qstat_Q.out")
- QSTAT_QF_PATH = os.path.join(SERVER_DIR, "qstat_Qf.out")
- QMGR_PR_PATH = os.path.join(SERVER_DIR, "qmgr_pr.out")
- # server_priv/
- SVR_PRIV_PATH = "server_priv"
- ACCT_LOGS_PATH = os.path.join("server_priv", "accounting")
- # server_logs/
- SVR_LOGS_PATH = "server_logs"
- # job/
- JOB_DIR = "job"
- QSTAT_PATH = os.path.join(JOB_DIR, "qstat.out")
- QSTAT_F_PATH = os.path.join(JOB_DIR, "qstat_f.out")
- QSTAT_T_PATH = os.path.join(JOB_DIR, "qstat_t.out")
- QSTAT_TF_PATH = os.path.join(JOB_DIR, "qstat_tf.out")
- QSTAT_X_PATH = os.path.join(JOB_DIR, "qstat_x.out")
- QSTAT_XF_PATH = os.path.join(JOB_DIR, "qstat_xf.out")
- QSTAT_NS_PATH = os.path.join(JOB_DIR, "qstat_ns.out")
- QSTAT_FX_DSV_PATH = os.path.join(JOB_DIR, "qstat_fx_F_dsv.out")
- QSTAT_F_DSV_PATH = os.path.join(JOB_DIR, "qstat_f_F_dsv.out")
- QSTAT_F_JSON_PATH = os.path.join(JOB_DIR, "qstat_f_F_json.out")
- # node/
- NODE_DIR = "node"
- PBSNODES_VA_PATH = os.path.join(NODE_DIR, "pbsnodes_va.out")
- PBSNODES_A_PATH = os.path.join(NODE_DIR, "pbsnodes_a.out")
- PBSNODES_AVSJ_PATH = os.path.join(NODE_DIR, "pbsnodes_avSj.out")
- PBSNODES_ASJ_PATH = os.path.join(NODE_DIR, "pbsnodes_aSj.out")
- PBSNODES_AVS_PATH = os.path.join(NODE_DIR, "pbsnodes_avS.out")
- PBSNODES_AS_PATH = os.path.join(NODE_DIR, "pbsnodes_aS.out")
- PBSNODES_AFDSV_PATH = os.path.join(NODE_DIR, "pbsnodes_aFdsv.out")
- PBSNODES_AVFDSV_PATH = os.path.join(NODE_DIR, "pbsnodes_avFdsv.out")
- PBSNODES_AVFJSON_PATH = os.path.join(NODE_DIR, "pbsnodes_avFjson.out")
- QMGR_PN_PATH = os.path.join(NODE_DIR, "qmgr_pn_default.out")
- # mom_priv/
- MOM_PRIV_PATH = "mom_priv"
- # mom_logs/
- MOM_LOGS_PATH = "mom_logs"
- # comm_logs/
- COMM_LOGS_PATH = "comm_logs"
- # hook/
- HOOK_DIR = "hook"
- QMGR_PH_PATH = os.path.join(HOOK_DIR, "qmgr_ph_default.out")
- QMGR_LPBSHOOK_PATH = os.path.join(HOOK_DIR, "qmgr_lpbshook.out")
- # scheduler/
- SCHED_DIR = "scheduler"
- QMGR_LSCHED_PATH = os.path.join(SCHED_DIR, "qmgr_lsched.out")
- # sched_priv/
- DFLT_SCHED_PRIV_PATH = "sched_priv"
- # sched_logs/
- DFLT_SCHED_LOGS_PATH = "sched_logs"
- # reservation/
- RESV_DIR = "reservation"
- PBS_RSTAT_PATH = os.path.join(RESV_DIR, "pbs_rstat.out")
- PBS_RSTAT_F_PATH = os.path.join(RESV_DIR, "pbs_rstat_f.out")
- # datastore/
- DATASTORE_DIR = "datastore"
- PG_LOGS_PATH = os.path.join(DATASTORE_DIR, "pg_log")
- # core_file_bt/
- CORE_DIR = "core_file_bt"
- CORE_SERVER_PATH = os.path.join(CORE_DIR, "server_priv")
- CORE_SCHED_PATH = os.path.join(CORE_DIR, "sched_priv")
- CORE_MOM_PATH = os.path.join(CORE_DIR, "mom_priv")
- # system/
- SYS_DIR = "system"
- PBS_PROBE_PATH = os.path.join(SYS_DIR, "pbs_probe_v.out")
- PBS_HOSTN_PATH = os.path.join(SYS_DIR, "pbs_hostn_v.out")
- PBS_ENV_PATH = os.path.join(SYS_DIR, "pbs_environment")
- OS_PATH = os.path.join(SYS_DIR, "os_info")
- PROCESS_PATH = os.path.join(SYS_DIR, "process_info")
- ETC_HOSTS_PATH = os.path.join(SYS_DIR, "etc_hosts")
- ETC_NSSWITCH_PATH = os.path.join(SYS_DIR, "etc_nsswitch_conf")
- LSOF_PBS_PATH = os.path.join(SYS_DIR, "lsof_pbs.out")
- VMSTAT_PATH = os.path.join(SYS_DIR, "vmstat.out")
- DF_H_PATH = os.path.join(SYS_DIR, "df_h.out")
- DMESG_PATH = os.path.join(SYS_DIR, "dmesg.out")
- PS_LEAF_PATH = os.path.join(SYS_DIR, "ps_leaf.out")
- # top-level
- PBS_CONF_PATH = "pbs.conf"
- CTIME_PATH = "ctime"
- # Define paths to PBS commands used to capture data with respect to PBS_EXEC
- QSTAT_CMD = os.path.join("bin", "qstat")
- PBSNODES_CMD = os.path.join("bin", "pbsnodes")
- QMGR_CMD = os.path.join("bin", "qmgr")
- PBS_RSTAT_CMD = os.path.join("bin", "pbs_rstat")
- PBS_PROBE_CMD = os.path.join("sbin", "pbs_probe")
- PBS_HOSTN_CMD = os.path.join("bin", "pbs_hostn")
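- # Note: the first element of each capture command list is joined with
- # self.pbs_exec at capture time, so these stay relative to PBS_EXEC here.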
- # A global list of files which contain data in tabular form
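- # Files named in this list are anonymized column-wise via
- # PBSAnonymizer.anonymize_file_tabular(); all other captured files go
- # through key-value anonymization (see _PBSSnapUtils.__anonymize_file()).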
- FILE_TABULAR = ["qstat.out", "qstat_t.out", "qstat_x.out", "qstat_ns.out",
- "pbsnodes_aS.out", "pbsnodes_aSj.out", "pbsnodes_avS.out",
- "pbsnodes_avSj.out", "qstat_Q.out", "qstat_B.out",
- "pbs_rstat.out"]
- class PBSSnapUtils(object):
- """
- Wrapper class around _PBSSnapUtils
- This makes sure that we do the necessary cleanup before destroying objects
- """
- def __init__(self, out_dir, primary_host=None, acct_logs=None,
- daemon_logs=None, map_file=None, anonymize=None,
- create_tar=False, log_path=None, with_sudo=False):
- self.out_dir = out_dir
- self.primary_host = primary_host
- self.acct_logs = acct_logs
- self.srvc_logs = daemon_logs
- self.map_file = map_file
- self.anonymize = anonymize
- self.create_tar = create_tar
- self.log_path = log_path
- self.with_sudo = with_sudo
- self.utils_obj = None
- def __enter__(self):
- self.utils_obj = _PBSSnapUtils(self.out_dir, self.primary_host,
- self.acct_logs, self.srvc_logs,
- self.map_file, self.anonymize,
- self.create_tar, self.log_path,
- self.with_sudo)
- return self.utils_obj
- def __exit__(self, exc_type, exc_value, traceback):
- # Do some cleanup
- self.utils_obj.finalize()
- return False
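- # A minimal usage sketch (illustrative values, assuming PBS is installed
- # and running on the local host):
- #
- #     with PBSSnapUtils(out_dir="/tmp", acct_logs=30, daemon_logs=5,
- #                       create_tar=True) as snap_utils:
- #         snap_path = snap_utils.capture_all()
- #
- # The context manager guarantees that finalize() runs even if a capture
- # step raises, so the output tarball is closed and the temporary snapshot
- # directory is removed.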
- class _PBSSnapUtils(object):
- """
- PBS snapshot utilities
- """
- def __init__(self, out_dir, primary_host=None, acct_logs=None,
- daemon_logs=None, map_file=None, anonymize=False,
- create_tar=False, log_path=None, with_sudo=False):
- """
- Initialize a PBSSnapUtils object with the arguments specified
- :param out_dir: path to the directory where snapshot will be created
- :type out_dir: str
- :param primary_host: Name of the primary host to capture
- :type primary_host: str or None
- :param acct_logs: number of accounting logs to capture
- :type acct_logs: int or None
- :param daemon_logs: number of daemon logs to capture
- :type daemon_logs: int or None
- :param map_file: Path to map file for anonymization map
- :type map_file: str or None
- :param anonymize: anonymize data?
- :type anonymize: bool
- :param create_tar: Create a tarball of the output snapshot?
- :type create_tar: bool or None
- :param log_path: Path to pbs_snapshot's log file
- :type log_path: str or None
- :param with_sudo: Capture relevant information with sudo?
- :type with_sudo: bool
- """
- self.logger = logging.getLogger(__name__)
- self.du = DshUtils()
- self.server_info = {}
- self.job_info = {}
- self.node_info = {}
- self.comm_info = {}
- self.hook_info = {}
- self.sched_info = {}
- self.resv_info = {}
- self.sys_info = {}
- self.core_info = {}
- self.anon_obj = None
- self.all_hosts = []
- self.server = None
- self.mom = None
- self.comm = None
- self.scheduler = None
- self.log_utils = PBSLogUtils()
- self.outtar_path = None
- self.outtar_fd = None
- self.create_tar = create_tar
- self.snapshot_name = None
- self.with_sudo = with_sudo
- self.log_path = log_path
- self.server_up = False
- self.server_info_avail = False
- self.mom_info_avail = False
- self.comm_info_avail = False
- self.sched_info_avail = False
- if self.log_path is not None:
- self.log_filename = os.path.basename(self.log_path)
- else:
- self.log_filename = None
- # finalize() is called by the context's __exit__() automatically
- # however, finalize() is non-reentrant, so set a flag to keep
- # track of whether it has been called or not.
- self.finalized = False
- # Parse the input arguments
- timestamp_str = time.strftime("%Y%m%d_%H_%M_%S")
- self.snapshot_name = "snapshot_" + timestamp_str
- # Make sure that the target directory exists
- dir_path = os.path.abspath(out_dir)
- if not os.path.isdir(dir_path):
- raise ValueError("Target directory either doesn't exist " +
- "or is not accessible. Quitting.")
- self.snapdir = os.path.join(dir_path, self.snapshot_name)
- self.num_acct_logs = int(acct_logs) if acct_logs is not None else 0
- if daemon_logs is not None:
- self.num_daemon_logs = int(daemon_logs)
- else:
- self.num_daemon_logs = 0
- self.mapfile = map_file
- if primary_host is None:
- primary_host = socket.gethostname()
- # Check which of the PBS daemons' information is available
- self.server = Server(primary_host)
- self.scheduler = Scheduler(server=self.server)
- daemon_status = self.server.pi.status()
- if len(daemon_status) > 0 and daemon_status['rc'] == 0 and \
- len(daemon_status['err']) == 0:
- for d_stat in daemon_status['out']:
- if d_stat.startswith("pbs_server"):
- self.server_info_avail = True
- if "not running" not in d_stat:
- self.server_up = True
- elif d_stat.startswith("pbs_sched"):
- self.sched_info_avail = True
- elif d_stat.startswith("pbs_mom"):
- self.mom_info_avail = True
- elif d_stat.startswith("pbs_comm"):
- self.comm_info_avail = True
- self.custom_rscs = None
- if self.server_up:
- self.custom_rscs = self.server.parse_resources()
- # Store paths to PBS_HOME and PBS_EXEC
- self.pbs_home = self.server.pbs_conf["PBS_HOME"]
- self.pbs_exec = self.server.pbs_conf["PBS_EXEC"]
- # Add self.primary_host to the list of hosts
- self.primary_host = self.server.hostname
- # If output needs to be a tarball, create the tarfile name
- # tarfile name = <output directory name>.tgz
- self.outtar_path = self.snapdir + ".tgz"
- # Set up some infrastructure
- self.__init_cmd_path_map()
- # Create the snapshot directory tree
- self.__initialize_snapshot()
- # Create a PBSAnonymizer object
- self.anonymize = anonymize
- if self.anonymize:
- del_attrs = [ATTR_v, ATTR_e, ATTR_mailfrom, ATTR_m, ATTR_name,
- ATTR_jobdir, ATTR_submit_arguments, ATTR_o, ATTR_S]
- obf_attrs = [ATTR_euser, ATTR_egroup, ATTR_project, ATTR_A,
- ATTR_operators, ATTR_managers, ATTR_g, ATTR_M,
- ATTR_u, ATTR_SvrHost, ATTR_aclgroup, ATTR_acluser,
- ATTR_aclResvgroup, ATTR_aclResvuser, ATTR_SchedHost,
- ATTR_aclResvhost, ATTR_aclhost, ATTR_owner,
- ATTR_exechost, ATTR_NODE_Host, ATTR_NODE_Mom,
- ATTR_rescavail + ".host", ATTR_rescavail + ".vnode",
- ATTR_auth_u, ATTR_auth_g, ATTR_resv_owner]
- obf_rsc_attrs = []
- if self.custom_rscs is not None:
- for rsc in self.custom_rscs.keys():
- obf_rsc_attrs.append(rsc)
- self.anon_obj = PBSAnonymizer(attr_delete=del_attrs,
- attr_val=obf_attrs,
- resc_key=obf_rsc_attrs)
- def __init_cmd_path_map(self):
- """
- Fill in the various *_info dicts, which map each output label to a
- tuple of (path of the output file inside the snapshot, command used
- to capture it).
- """
- if self.server_up:
- # Server information
- value = (QSTAT_B_PATH, [QSTAT_CMD, "-B"])
- self.server_info[QSTAT_B_OUT] = value
- value = (QSTAT_BF_PATH, [QSTAT_CMD, "-Bf"])
- self.server_info[QSTAT_BF_OUT] = value
- value = (QMGR_PS_PATH, [QMGR_CMD, "-c", "p s"])
- self.server_info[QMGR_PS_OUT] = value
- value = (QSTAT_Q_PATH, [QSTAT_CMD, "-Q"])
- self.server_info[QSTAT_Q_OUT] = value
- value = (QSTAT_QF_PATH, [QSTAT_CMD, "-Qf"])
- self.server_info[QSTAT_QF_OUT] = value
- value = (QMGR_PR_PATH, [QMGR_CMD, "-c", "p r"])
- self.server_info[QMGR_PR_OUT] = value
- # Job information
- value = (QSTAT_PATH, [QSTAT_CMD])
- self.job_info[QSTAT_OUT] = value
- value = (QSTAT_F_PATH, [QSTAT_CMD, "-f"])
- self.job_info[QSTAT_F_OUT] = value
- value = (QSTAT_T_PATH, [QSTAT_CMD, "-t"])
- self.job_info[QSTAT_T_OUT] = value
- value = (QSTAT_TF_PATH, [QSTAT_CMD, "-tf"])
- self.job_info[QSTAT_TF_OUT] = value
- value = (QSTAT_X_PATH, [QSTAT_CMD, "-x"])
- self.job_info[QSTAT_X_OUT] = value
- value = (QSTAT_XF_PATH, [QSTAT_CMD, "-xf"])
- self.job_info[QSTAT_XF_OUT] = value
- value = (QSTAT_NS_PATH, [QSTAT_CMD, "-ns"])
- self.job_info[QSTAT_NS_OUT] = value
- value = (QSTAT_FX_DSV_PATH, [QSTAT_CMD, "-fx", "-F", "dsv"])
- self.job_info[QSTAT_FX_DSV_OUT] = value
- value = (QSTAT_F_DSV_PATH, [QSTAT_CMD, "-f", "-F", "dsv"])
- self.job_info[QSTAT_F_DSV_OUT] = value
- value = (QSTAT_F_JSON_PATH, [QSTAT_CMD, "-f", "-F", "json"])
- self.job_info[QSTAT_F_JSON_OUT] = value
- # Node information
- value = (PBSNODES_VA_PATH, [PBSNODES_CMD, "-va"])
- self.node_info[PBSNODES_VA_OUT] = value
- value = (PBSNODES_A_PATH, [PBSNODES_CMD, "-a"])
- self.node_info[PBSNODES_A_OUT] = value
- value = (PBSNODES_AVSJ_PATH, [PBSNODES_CMD, "-avSj"])
- self.node_info[PBSNODES_AVSJ_OUT] = value
- value = (PBSNODES_ASJ_PATH, [PBSNODES_CMD, "-aSj"])
- self.node_info[PBSNODES_ASJ_OUT] = value
- value = (PBSNODES_AVS_PATH, [PBSNODES_CMD, "-avS"])
- self.node_info[PBSNODES_AVS_OUT] = value
- value = (PBSNODES_AS_PATH, [PBSNODES_CMD, "-aS"])
- self.node_info[PBSNODES_AS_OUT] = value
- value = (PBSNODES_AFDSV_PATH, [PBSNODES_CMD, "-aFdsv"])
- self.node_info[PBSNODES_AFDSV_OUT] = value
- value = (PBSNODES_AVFDSV_PATH, [PBSNODES_CMD, "-avFdsv"])
- self.node_info[PBSNODES_AVFDSV_OUT] = value
- value = (PBSNODES_AVFJSON_PATH, [PBSNODES_CMD, "-avFjson"])
- self.node_info[PBSNODES_AVFJSON_OUT] = value
- value = (QMGR_PN_PATH, [QMGR_CMD, "-c", "p n @default"])
- self.node_info[QMGR_PN_OUT] = value
- # Hook information
- value = (QMGR_PH_PATH, [QMGR_CMD, "-c", "p h @default"])
- self.hook_info[QMGR_PH_OUT] = value
- value = (QMGR_LPBSHOOK_PATH, [QMGR_CMD, "-c", "l pbshook"])
- self.hook_info[QMGR_LPBSHOOK_OUT] = value
- # Reservation information
- value = (PBS_RSTAT_PATH, [PBS_RSTAT_CMD])
- self.resv_info[PBS_RSTAT_OUT] = value
- value = (PBS_RSTAT_F_PATH, [PBS_RSTAT_CMD, "-f"])
- self.resv_info[PBS_RSTAT_F_OUT] = value
- # Scheduler information
- value = (QMGR_LSCHED_PATH, [QMGR_CMD, "-c", "l sched"])
- self.sched_info[QMGR_LSCHED_OUT] = value
- if self.server_info_avail:
- # Server priv and logs
- value = (SVR_PRIV_PATH, None)
- self.server_info[SVR_PRIV] = value
- value = (SVR_LOGS_PATH, None)
- self.server_info[SVR_LOGS] = value
- value = (ACCT_LOGS_PATH, None)
- self.server_info[ACCT_LOGS] = value
- # Core file information
- value = (CORE_SERVER_PATH, None)
- self.core_info[CORE_SERVER] = value
- if self.mom_info_avail:
- # Mom priv and logs
- value = (MOM_PRIV_PATH, None)
- self.node_info[MOM_PRIV] = value
- value = (MOM_LOGS_PATH, None)
- self.node_info[MOM_LOGS] = value
- # Core file information
- value = (CORE_MOM_PATH, None)
- self.core_info[CORE_MOM] = value
- if self.comm_info_avail:
- # Comm information
- value = (COMM_LOGS_PATH, None)
- self.comm_info[COMM_LOGS] = value
- if self.sched_info_avail:
- # Scheduler logs and priv
- value = (DFLT_SCHED_PRIV_PATH, None)
- self.sched_info[SCHED_PRIV] = value
- value = (DFLT_SCHED_LOGS_PATH, None)
- self.sched_info[SCHED_LOGS] = value
- # Core file information
- value = (CORE_SCHED_PATH, None)
- self.core_info[CORE_SCHED] = value
- # System information
- value = (PBS_PROBE_PATH, [PBS_PROBE_CMD, "-v"])
- self.sys_info[PBS_PROBE_OUT] = value
- # We'll append hostname to this later (see capture_system_info)
- value = (PBS_HOSTN_PATH, [PBS_HOSTN_CMD, "-v"])
- self.sys_info[PBS_HOSTN_OUT] = value
- value = (PBS_ENV_PATH, None)
- self.sys_info[PBS_ENVIRONMENT] = value
- value = (OS_PATH, None)
- self.sys_info[OS_INFO] = value
- value = (PROCESS_PATH, ["ps", "aux", "|", "grep", "[p]bs"])
- self.sys_info[PROCESS_INFO] = value
- value = (ETC_HOSTS_PATH,
- ["cat", os.path.join(os.sep, "etc", "hosts")])
- self.sys_info[ETC_HOSTS] = value
- value = (ETC_NSSWITCH_PATH,
- ["cat", os.path.join(os.sep, "etc", "nsswitch.conf")])
- self.sys_info[ETC_NSSWITCH_CONF] = value
- value = (LSOF_PBS_PATH, ["lsof", "|", "grep", "[p]bs"])
- self.sys_info[LSOF_PBS_OUT] = value
- value = (VMSTAT_PATH, ["vmstat"])
- self.sys_info[VMSTAT_OUT] = value
- value = (DF_H_PATH, ["df", "-h"])
- self.sys_info[DF_H_OUT] = value
- value = (DMESG_PATH, ["dmesg"])
- self.sys_info[DMESG_OUT] = value
- value = (PS_LEAF_PATH, ["ps", "-leaf"])
- self.sys_info[PS_LEAF_OUT] = value
- def __initialize_snapshot(self):
- """
- Create a snapshot directory along with the directory structure
- Also create a tarfile and add the snapshot dir if create_tar is True
- """
- os.mkdir(self.snapdir)
- if self.create_tar:
- self.outtar_fd = tarfile.open(self.outtar_path, "w:gz")
- dirs_in_snapshot = [SYS_DIR, CORE_DIR]
- if self.server_up:
- dirs_in_snapshot.extend([SERVER_DIR, JOB_DIR, HOOK_DIR, RESV_DIR,
- NODE_DIR, SCHED_DIR])
- if self.server_info_avail:
- dirs_in_snapshot.extend([SVR_PRIV_PATH, SVR_LOGS_PATH,
- ACCT_LOGS_PATH, DATASTORE_DIR])
- if self.mom_info_avail:
- dirs_in_snapshot.extend([MOM_PRIV_PATH, MOM_LOGS_PATH])
- if self.comm_info_avail:
- dirs_in_snapshot.append(COMM_LOGS_PATH)
- if self.sched_info_avail:
- dirs_in_snapshot.extend([DFLT_SCHED_LOGS_PATH,
- DFLT_SCHED_PRIV_PATH])
- for item in dirs_in_snapshot:
- rel_path = os.path.join(self.snapdir, item)
- os.makedirs(rel_path, 0755)
- def __capture_cmd_output(self, out_path, cmd, skip_anon=False,
- as_script=False, ret_out=False, sudo=False):
- """
- Run a command and capture its output
- :param out_path: path of the output file for this command
- :type out_path: str
- :param cmd: The command to execute
- :type cmd: list
- :param skip_anon: Skip anonymization even though anonymize is True?
- :type skip_anon: bool
- :param as_script: Passed to run_cmd()
- :type as_script: bool
- :param ret_out: Return output of the command?
- :type ret_out: bool
- :param sudo: Run the command with sudo?
- :type sudo: bool
- """
- retstr = None
- with open(out_path, "a+") as out_fd:
- try:
- self.du.run_cmd(cmd=cmd, stdout=out_fd,
- sudo=sudo, as_script=as_script)
- if ret_out:
- out_fd.seek(0, 0)
- retstr = out_fd.read()
- except OSError as e:
- # This usually happens when the command is not found
- # Just log and return
- self.logger.error(str(e))
- return
- if self.anonymize and not skip_anon:
- self.__anonymize_file(out_path)
- if self.create_tar:
- self.__add_to_archive(out_path)
- if ret_out:
- return retstr
- @staticmethod
- def __convert_flag_to_numeric(flag):
- """
- Convert a resource's flag attribute to its numeric equivalent
- :param flag: the resource flag to convert
- :type flag: string
- :returns: numeric value of the resource flag
- """
- ATR_DFLAG_USRD = 0x01
- ATR_DFLAG_USWR = 0x02
- ATR_DFLAG_OPRD = 0x04
- ATR_DFLAG_OPWR = 0x08
- ATR_DFLAG_MGRD = 0x10
- ATR_DFLAG_MGWR = 0x20
- ATR_DFLAG_RASSN = 0x4000
- ATR_DFLAG_ANASSN = 0x8000
- ATR_DFLAG_FNASSN = 0x10000
- ATR_DFLAG_CVTSLT = 0x20000
- NO_USER_SET = (ATR_DFLAG_USRD | ATR_DFLAG_OPRD | ATR_DFLAG_MGRD |
- ATR_DFLAG_OPWR | ATR_DFLAG_MGWR)
- READ_WRITE = (ATR_DFLAG_USRD | ATR_DFLAG_OPRD | ATR_DFLAG_MGRD |
- ATR_DFLAG_USWR | ATR_DFLAG_OPWR | ATR_DFLAG_MGWR)
- resc_flag = READ_WRITE
- if "q" in flag:
- resc_flag |= ATR_DFLAG_RASSN
- if "f" in flag:
- resc_flag |= ATR_DFLAG_FNASSN
- if "n" in flag:
- resc_flag |= ATR_DFLAG_ANASSN
- if "h" in flag:
- resc_flag |= ATR_DFLAG_CVTSLT
- if "r" in flag:
- resc_flag &= ~READ_WRITE
- resc_flag |= NO_USER_SET
- if "i" in flag:
- resc_flag &= ~READ_WRITE
- resc_flag |= (ATR_DFLAG_OPRD | ATR_DFLAG_OPWR |
- ATR_DFLAG_MGRD | ATR_DFLAG_MGWR)
- return resc_flag
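- # Worked example of the arithmetic above (illustrative): a resource with
- # flag "nh" yields READ_WRITE (0x3f) | ATR_DFLAG_ANASSN (0x8000) |
- # ATR_DFLAG_CVTSLT (0x20000) = 0x2803f.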
- @staticmethod
- def __convert_type_to_numeric(attr_type):
- """
- Convert a resource's type attribute to its numeric equivalent
- :param attr_type: the type to convert
- :type attr_type: string
- :returns: Numeric equivalent of attr_type
- """
- PBS_ATTR_TYPE_TO_INT = {
- "long": 1,
- "string": 3,
- "string_array": 4,
- "size": 5,
- "boolean": 11,
- "float": 14,
- }
- return PBS_ATTR_TYPE_TO_INT[attr_type.strip()]
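- # For example, "string_array" maps to 4 and "size" maps to 5; an
- # unrecognized type string raises KeyError.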
- def __capture_trace_from_core(self, core_file_name, exec_path, out_path):
- """
- Capture the stack trace from the core file specified
- :param core_file_name: name of the core file
- :type core_file_name: str
- :param exec_path: path to the executable which generated the core
- :type exec_path: str
- :param out_path: path of the file to write the trace to
- :type out_path: str
- """
- self.logger.info("capturing stack trace from core file " +
- core_file_name)
- # Create a gdb-python script to capture backtrace from core
- gdb_python = """
- import gdb
- gdb.execute("file %s")
- gdb.execute("core %s")
- o = gdb.execute("thread apply all bt", to_string=True)
- print(o)
- gdb.execute("quit")
- quit()
- """ % (exec_path, core_file_name)
- # Remove tabs from triple quoted strings
- gdb_python = gdb_python.replace("\t", "")
- # Write the gdb-python script in a temporary file
- fn = self.du.create_temp_file(body=gdb_python)
- # Catch the stack trace using gdb
- gdb_cmd = ["gdb", "-P", fn]
- with open(out_path, "w") as outfd:
- self.du.run_cmd(cmd=gdb_cmd, stdout=outfd, stderr=STDOUT,
- sudo=self.with_sudo)
- # Remove the temp file
- os.remove(fn)
- if self.create_tar:
- self.__add_to_archive(out_path)
- def __capture_logs(self, pbs_logdir, snap_logdir, num_days_logs,
- sudo=False):
- """
- Capture logs for the number of days specified
- :param pbs_logdir: path to the PBS logs directory (source)
- :type pbs_logdir: str
- :param snap_logdir: path to the snapshot logs directory (destination)
- :type snap_logdir: str
- :param num_days_logs: Number of days of logs to capture
- :type num_days_logs: int
- :param sudo: copy logs with sudo?
- :type sudo: bool
- """
- if num_days_logs < 1:
- self.logger.debug("Number of days of logs < 1, skipping")
- return
- end_time = self.server.ctime
- start_time = end_time - ((num_days_logs - 1) * 24 * 60 * 60)
- # Get the list of log file names to capture
- pbs_logfiles = self.log_utils.get_log_files(self.primary_host,
- path=pbs_logdir,
- start=start_time,
- end=end_time, sudo=sudo)
- if len(pbs_logfiles) == 0:
- self.logger.debug(pbs_logdir + " not found/accessible")
- return
- self.logger.debug("Capturing " + str(num_days_logs) +
- " days of logs from " + pbs_logdir)
- # Make sure that the target log dir exists
- if not os.path.isdir(snap_logdir):
- os.makedirs(snap_logdir)
- # Go over the list and copy over each log file
- for pbs_logfile in pbs_logfiles:
- snap_logfile = os.path.join(snap_logdir,
- os.path.basename(pbs_logfile))
- self.du.run_copy(src=pbs_logfile, dest=snap_logfile,
- recursive=False,
- preserve_permission=False,
- sudo=sudo)
- if sudo:
- # Copying files with sudo makes root the owner, set it to the
- # current user
- self.du.chown(path=snap_logfile, uid=os.getuid(),
- gid=os.getgid(), sudo=self.with_sudo)
- # Anonymize accounting logs
- if self.anonymize and "accounting" in snap_logdir:
- anon = self.anon_obj.anonymize_accounting_log(snap_logfile)
- if anon is not None:
- anon_fd = open(snap_logfile, "w")
- anon_fd.write("\n".join(anon))
- anon_fd.close()
- if self.create_tar:
- self.__add_to_archive(snap_logfile)
- def __evaluate_core_file(self, file_path, core_dir):
- """
- Check whether the specified file is a core dump
- If yes, capture its stack trace and store it
- :param file_path: path to the file
- :type file_path: str
- :param core_dir: path to directory to store core information
- :type core_dir: str
- :returns: True if this was a valid core file, otherwise False
- """
- if not self.du.isfile(path=file_path, sudo=self.with_sudo):
- self.logger.debug("Could not find file path " + str(file_path))
- return False
- # Get the header of this file
- ret = self.du.run_cmd(cmd=["file", file_path], sudo=self.with_sudo)
- if ret['err'] is not None and len(ret['err']) != 0:
- self.logger.error(
- "'file' command failed with error: " + str(ret['err']) +
- " on file: " + str(file_path))
- return False
- file_header = ret["out"][0]
- if "core file" not in file_header:
- return False
- # Identify the program which created this core file
- header_list = file_header.split()
- if "from" not in header_list:
- return False
- exec_index = header_list.index("from") + 1
- exec_name = header_list[exec_index].replace("\'", "")
- exec_name = exec_name.replace(",", "")
- # Capture the stack trace from this core file
- filename = os.path.basename(file_path)
- core_dest = os.path.join(core_dir, filename)
- if not os.path.isdir(core_dir):
- os.makedirs(core_dir, 0755)
- self.__capture_trace_from_core(file_path, exec_name,
- core_dest)
- # Delete the core file itself
- if os.path.isfile(file_path):
- os.remove(file_path)
- return True
- def __anonymize_file(self, file_path):
- """
- Anonymize/obfuscate a file to remove sensitive information
- :param file_path: path to the file to anonymize
- :type file_path: str
- """
- if not self.anonymize or self.anon_obj is None:
- return
- self.logger.debug("Anonymizing " + file_path)
- # Anonymizing a file requires editing it, so make sure the user owns it
- self.du.chown(path=file_path, uid=os.getuid(),
- gid=os.getgid(), sudo=self.with_sudo)
- file_name = os.path.basename(file_path)
- if file_name == "sched_config":
- self.anon_obj.anonymize_sched_config(self.scheduler)
- self.scheduler.apply_config(path=file_path, validate=False)
- elif file_name == "resource_group":
- anon = self.anon_obj.anonymize_resource_group(file_path)
- if anon is not None:
- with open(file_path, "w") as rgfd:
- rgfd.write("\n".join(anon))
- elif file_name in FILE_TABULAR:
- self.anon_obj.anonymize_file_tabular(file_path, inplace=True)
- else:
- self.anon_obj.anonymize_file_kv(file_path, inplace=True)
- def __copy_dir_with_core(self, src_path, dest_path, core_dir,
- except_list=None, only_core=False, sudo=False):
- """
- Copy over a directory recursively which might have core files
- When a core file is found, capture the stack trace from it
- :param src_path: path of the source directory
- :type src_path: str
- :param dest_path: path of the destination directory
- :type dest_path: str
- :param core_dir: path to the directory to store core files' trace
- :type core_dir: str
- :param except_list: list of files/directories (basenames) to exclude
- :type except_list: list
- :param only_core: Copy over only core files?
- :type only_core: bool
- :param sudo: Copy with sudo?
- :type sudo: bool
- """
- if except_list is None:
- except_list = []
- # This can happen when -o is a path that we are capturing
- # Just return success
- if os.path.basename(src_path) == self.snapshot_name:
- self.logger.debug("src_path %s seems to be the snapshot directory, "
- "ignoring" % src_path)
- return
- dir_list = self.du.listdir(path=src_path, fullpath=False,
- sudo=sudo)
- if dir_list is None:
- self.logger.info("Can't find/access " + src_path)
- return
- # Go over the list and copy over everything
- # If we find a core file, we'll store backtrace from it inside
- # core_file_bt
- for item in dir_list:
- if item in except_list:
- continue
- item_src_path = os.path.join(src_path, item)
- if not only_core:
- item_dest_path = os.path.join(dest_path, item)
- else:
- item_dest_path = core_dir
- # We can't directly use 'recursive' argument of run_copy
- # to copy the entire directory tree as we need to take care
- # of the 'except_list'. So, we recursively explore the whole
- # tree and copy over files individually.
- if self.du.isdir(path=item_src_path, sudo=sudo):
- # Make sure that the directory exists in the snapshot
- if not self.du.isdir(path=item_dest_path):
- # Create the directory
- os.makedirs(item_dest_path, 0755)
- # Recursive call to copy contents of the directory
- self.__copy_dir_with_core(item_src_path, item_dest_path,
- core_dir, except_list, only_core,
- sudo=sudo)
- else:
- # Copy the file over
- try:
- self.du.run_copy(src=item_src_path, dest=item_dest_path,
- recursive=False,
- preserve_permission=False,
- level=logging.DEBUG, sudo=sudo)
- if sudo:
- # Copying files with sudo makes root the owner,
- # set it to the current user
- self.du.chown(path=item_dest_path, uid=os.getuid(),
- gid=os.getgid(), sudo=self.with_sudo)
- except OSError:
- self.logger.error("Could not copy %s" % item_src_path)
- continue
- # Check if this is a core file
- # If it is then this method will capture its stack trace
- is_core = self.__evaluate_core_file(item_dest_path, core_dir)
- # If it was a core file, then it's already been captured
- if is_core:
- continue
- # If only_core is True and this was not a core file, then we
- # should delete it
- if only_core:
- os.remove(item_dest_path)
- else:
- # This was not a core file, and 'only_core' is not True
- # So, we need to capture & anonymize this file
- self.__anonymize_file(item_dest_path)
- if self.create_tar:
- self.__add_to_archive(item_dest_path)
- def __capture_mom_priv(self):
- """
- Capture mom_priv information
- """
- pbs_home = self.pbs_home
- pbs_mom_priv = os.path.join(pbs_home, "mom_priv")
- snap_mom_priv = os.path.join(self.snapdir, MOM_PRIV_PATH)
- core_dir = os.path.join(self.snapdir, CORE_MOM_PATH)
- self.__copy_dir_with_core(pbs_mom_priv, snap_mom_priv, core_dir,
- sudo=self.with_sudo)
- def __add_to_archive(self, dest_path, src_path=None):
- """
- Add a file to the output tarball and delete the original file
- :param dest_path: path to the file inside the target tarball
- :type dest_path: str
- :param src_path: path to the file to add, if different than dest_path
- :type src_path: str
- """
- if src_path is None:
- src_path = dest_path
- self.logger.debug("Adding " + src_path + " to tarball " +
- self.outtar_path)
- # Add file to tar
- dest_relpath = os.path.relpath(dest_path, self.snapdir)
- path_in_tar = os.path.join(self.snapshot_name, dest_relpath)
- try:
- self.outtar_fd.add(src_path, arcname=path_in_tar)
- # Remove original file
- os.remove(src_path)
- except OSError:
- self.logger.error(
- "File %s could not be added to tarball" % (src_path))
- def __capture_svr_logs(self):
- """
- Capture server logs
- """
- pbs_logdir = os.path.join(self.pbs_home, "server_logs")
- snap_logdir = os.path.join(self.snapdir, SVR_LOGS_PATH)
- self.__capture_logs(pbs_logdir, snap_logdir, self.num_daemon_logs)
- def __capture_acct_logs(self):
- """
- Capture accounting logs
- """
- pbs_logdir = os.path.join(self.pbs_home, "server_priv", "accounting")
- snap_logdir = os.path.join(self.snapdir, ACCT_LOGS_PATH)
- self.__capture_logs(pbs_logdir, snap_logdir, self.num_acct_logs,
- sudo=self.with_sudo)
- def __capture_sched_logs(self, pbs_logdir, snap_logdir):
- """
- Capture scheduler logs
- """
- self.__capture_logs(pbs_logdir, snap_logdir, self.num_daemon_logs)
- def __capture_mom_logs(self):
- """
- Capture mom logs
- """
- pbs_home = self.pbs_home
- pbs_logdir = os.path.join(pbs_home, "mom_logs")
- snap_logdir = os.path.join(self.snapdir, MOM_LOGS_PATH)
- self.__capture_logs(pbs_logdir, snap_logdir, self.num_daemon_logs)
- def __capture_comm_logs(self):
- """
- Capture pbs_comm logs
- """
- pbs_home = self.pbs_home
- pbs_logdir = os.path.join(pbs_home, "comm_logs")
- snap_logdir = os.path.join(self.snapdir, COMM_LOGS_PATH)
- self.__capture_logs(pbs_logdir, snap_logdir, self.num_daemon_logs)
- def capture_server(self, with_svr_logs=False, with_acct_logs=False):
- """
- Capture PBS server specific information
- :param with_svr_logs: capture server logs as well?
- :type with_svr_logs: bool
- :param with_acct_logs: capture accounting logs as well?
- :type with_acct_logs: bool
- :returns: name of the output directory/tarfile containing the snapshot
- """
- self.logger.info("capturing server information")
- if self.server_up:
- # Go through 'server_info' and capture info that depends on
- # commands
- for (path, cmd_list) in self.server_info.values():
- if cmd_list is None:
- continue
- cmd_list_cpy = list(cmd_list)
- # Add the path to PBS_EXEC to the command path
- # The command path is the first entry in command list
- cmd_list_cpy[0] = os.path.join(self.pbs_exec, cmd_list[0])
- snap_path = os.path.join(self.snapdir, path)
- self.__capture_cmd_output(snap_path, cmd_list_cpy,
- sudo=self.with_sudo)
- if self.server_info_avail:
- # Copy over 'server_priv', everything except accounting logs
- snap_server_priv = os.path.join(self.snapdir, SVR_PRIV_PATH)
- pbs_server_priv = os.path.join(self.pbs_home, "server_priv")
- core_dir = os.path.join(self.snapdir, CORE_SERVER_PATH)
- exclude_list = ["accounting"]
- self.__copy_dir_with_core(pbs_server_priv,
- snap_server_priv, core_dir, exclude_list,
- sudo=self.with_sudo)
- if with_svr_logs and self.num_daemon_logs > 0:
- # Capture server logs
- self.__capture_svr_logs()
- if with_acct_logs and self.num_acct_logs > 0:
- # Capture accounting logs
- self.__capture_acct_logs()
- if self.create_tar:
- return self.outtar_path
- else:
- return self.snapdir
- def capture_jobs(self):
- """
- Capture information related to jobs
- :returns: name of the output directory/tarfile containing the snapshot
- """
- self.logger.info("capturing jobs information")
- if self.server_up:
- # Go through 'job_info' and capture info that depends on commands
- for (path, cmd_list) in self.job_info.values():
- cmd_list_cpy = list(cmd_list)
- # Add the path to PBS_EXEC to the command path
- # The command path is the first entry in command list
- cmd_list_cpy[0] = os.path.join(self.pbs_exec, cmd_list[0])
- snap_path = os.path.join(self.snapdir, path)
- self.__capture_cmd_output(snap_path, cmd_list_cpy,
- sudo=self.with_sudo)
- if self.create_tar:
- return self.outtar_path
- else:
- return self.snapdir
- def capture_nodes(self, with_mom_logs=False):
- """
- Capture information related to nodes & mom along with mom logs
- :param with_mom_logs: Capture mom logs?
- :type with_mom_logs: bool
- :returns: name of the output directory/tarfile containing the snapshot
- """
- self.logger.info("capturing nodes & mom information")
- if self.server_up:
- # Go through 'node_info' and capture info that depends on commands
- for (path, cmd_list) in self.node_info.values():
- if cmd_list is None:
- continue
- cmd_list_cpy = list(cmd_list)
- # Add the path to PBS_EXEC to the command path
- # The command path is the first entry in command list
- cmd_list_cpy[0] = os.path.join(self.pbs_exec, cmd_list[0])
- snap_path = os.path.join(self.snapdir, path)
- self.__capture_cmd_output(snap_path, cmd_list_cpy,
- sudo=self.with_sudo)
- # Collect mom logs and priv
- if self.mom_info_avail:
- # Capture mom_priv info
- self.__capture_mom_priv()
- if with_mom_logs and self.num_daemon_logs > 0:
- # Capture mom_logs
- self.__capture_mom_logs()
- if self.create_tar:
- return self.outtar_path
- else:
- return self.snapdir
- def capture_comms(self, with_comm_logs=False):
- """
- Capture Comm related information
- :param with_comm_logs: Capture comm logs?
- :type with_comm_logs: bool
- :returns: name of the output directory/tarfile containing the snapshot
- """
- self.logger.info("capturing comm information")
- # Capture comm logs
- if self.comm_info_avail:
- if self.num_daemon_logs > 0 and with_comm_logs:
- self.__capture_comm_logs()
- # If not already capturing server information, copy over server_priv
- # as pbs_comm runs out of it
- if not self.server_info_avail:
- pbs_server_priv = os.path.join(self.pbs_home, "server_priv")
- snap_server_priv = os.path.join(self.snapdir, SVR_PRIV_PATH)
- core_dir = os.path.join(self.snapdir, CORE_SERVER_PATH)
- exclude_list = ["accounting"]
- self.__copy_dir_with_core(pbs_server_priv,
- snap_server_priv, core_dir, exclude_list,
- sudo=self.with_sudo)
- if self.create_tar:
- return self.outtar_path
- else:
- return self.snapdir
- def capture_scheduler(self, with_sched_logs=False):
- """
- Capture information related to the scheduler
- :param with_sched_logs: Capture scheduler logs?
- :type with_sched_logs: bool
- :returns: name of the output directory/tarfile containing the snapshot
- """
- self.logger.info("capturing scheduler information")
- qmgr_lsched = None
- if self.server_up:
- # Go through 'sched_info' and capture info that depends on commands
- for (path, cmd_list) in self.sched_info.values():
- if cmd_list is None:
- continue
- cmd_list_cpy = list(cmd_list)
- # Add the path to PBS_EXEC to the command path
- # The command path is the first entry in command list
- cmd_list_cpy[0] = os.path.join(self.pbs_exec, cmd_list[0])
- snap_path = os.path.join(self.snapdir, path)
- if "l sched" in cmd_list_cpy:
- qmgr_lsched = self.__capture_cmd_output(snap_path,
- cmd_list_cpy,
- ret_out=True)
- else:
- self.__capture_cmd_output(snap_path, cmd_list_cpy,
- sudo=self.with_sudo)
- # Capture sched_priv & sched_logs for all schedulers
- if qmgr_lsched is not None and self.sched_info_avail:
- sched_details = {}
- sched_name = None
- for line in qmgr_lsched.splitlines():
- if line.startswith("Sched "):
- sched_name = line.split("Sched ")[1]
- sched_name = "".join(sched_name.split())
- sched_details[sched_name] = {}
- continue
- if sched_name is not None:
- line = "".join(line.split())
- if line.startswith("sched_priv="):
- sched_details[sched_name]["sched_priv"] = \
- line.split("=")[1]
- elif line.startswith("sched_log="):
- sched_details[sched_name]["sched_log"] = \
- line.split("=")[1]
- for sched_name in sched_details:
- # Capture sched_priv for the scheduler
- if len(sched_details) == 1: # For pre-multisched outputs
- pbs_sched_priv = os.path.join(self.pbs_home, "sched_priv")
- else:
- pbs_sched_priv = sched_details[sched_name]["sched_priv"]
- if sched_name == "default" or len(sched_details) == 1:
- snap_sched_priv = os.path.join(self.snapdir,
- DFLT_SCHED_PRIV_PATH)
- core_dir = os.path.join(self.snapdir, CORE_SCHED_PATH)
- else:
- dirname = DFLT_SCHED_PRIV_PATH + "_" + sched_name
- coredirname = CORE_SCHED_PATH + "_" + sched_name
- snap_sched_priv = os.path.join(self.snapdir, dirname)
- os.makedirs(snap_sched_priv, 0755)
- core_dir = os.path.join(self.snapdir, coredirname)
- self.__copy_dir_with_core(pbs_sched_priv,
- snap_sched_priv, core_dir,
- sudo=self.with_sudo)
- if with_sched_logs and self.num_daemon_logs > 0:
- # Capture scheduler logs
- if len(sched_details) == 1: # For pre-multisched outputs
- pbs_sched_log = os.path.join(self.pbs_home,
- "sched_logs")
- else:
- pbs_sched_log = sched_details[sched_name]["sched_log"]
- if sched_name == "default" or len(sched_details) == 1:
- snap_sched_log = os.path.join(self.snapdir,
- DFLT_SCHED_LOGS_PATH)
- else:
- dirname = DFLT_SCHED_LOGS_PATH + "_" + sched_name
- snap_sched_log = os.path.join(self.snapdir, dirname)
- os.makedirs(snap_sched_log, 0755)
- self.__capture_sched_logs(pbs_sched_log, snap_sched_log)
- elif self.sched_info_avail:
- # We don't know about other multi-scheds,
- # but can still capture the default sched's logs & priv
- pbs_sched_priv = os.path.join(self.pbs_home, "sched_priv")
- snap_sched_priv = os.path.join(self.snapdir,
- DFLT_SCHED_PRIV_PATH)
- core_dir = os.path.join(self.snapdir, CORE_SCHED_PATH)
- self.__copy_dir_with_core(pbs_sched_priv,
- snap_sched_priv, core_dir,
- sudo=self.with_sudo)
- if with_sched_logs and self.num_daemon_logs > 0:
- pbs_sched_log = os.path.join(self.pbs_home,
- "sched_logs")
- snap_sched_log = os.path.join(self.snapdir,
- DFLT_SCHED_LOGS_PATH)
- self.__capture_sched_logs(pbs_sched_log, snap_sched_log)
- if self.create_tar:
- return self.outtar_path
- else:
- return self.snapdir
- def capture_hooks(self):
- """
- Capture information related to hooks
- :returns: name of the output directory/tarfile containing the snapshot
- """
- self.logger.info("capturing hooks information")
- # Go through 'hook_info' and capture info that depends on commands
- for (path, cmd_list) in self.hook_info.values():
- if cmd_list is None:
- continue
- cmd_list_cpy = list(cmd_list)
- # Add the path to PBS_EXEC to the command path
- # The command path is the first entry in command list
- cmd_list_cpy[0] = os.path.join(self.pbs_exec, cmd_list[0])
- snap_path = os.path.join(self.snapdir, path)
- self.__capture_cmd_output(snap_path, cmd_list_cpy,
- sudo=self.with_sudo)
- if self.create_tar:
- return self.outtar_path
- else:
- return self.snapdir
- def capture_reservations(self):
- """
- Capture information related to reservations
- :returns: name of the output directory/tarfile containing the snapshot
- """
- self.logger.info("capturing reservations information")
- # Go through 'resv_info' and capture info that depends on commands
- for (path, cmd_list) in self.resv_info.values():
- if cmd_list is None:
- continue
- cmd_list_cpy = list(cmd_list)
- # Add the path to PBS_EXEC to the command path
- # The command path is the first entry in command list
- cmd_list_cpy[0] = os.path.join(self.pbs_exec, cmd_list[0])
- snap_path = os.path.join(self.snapdir, path)
- self.__capture_cmd_output(snap_path, cmd_list_cpy,
- sudo=self.with_sudo)
- if self.create_tar:
- return self.outtar_path
- else:
- return self.snapdir
- def capture_datastore(self, with_db_logs=False):
- """
- Capture information related to the datastore
- :param with_db_logs: Capture database (PostgreSQL) logs?
- :type with_db_logs: bool
- :returns: name of the output directory/tarfile containing the snapshot
- """
- self.logger.info("capturing datastore information")
- if with_db_logs and self.num_daemon_logs > 0:
- # Capture database logs
- pbs_logdir = os.path.join(self.pbs_home, PG_LOGS_PATH)
- snap_logdir = os.path.join(self.snapdir, PG_LOGS_PATH)
- self.__capture_logs(pbs_logdir, snap_logdir, self.num_daemon_logs,
- sudo=self.with_sudo)
- if self.create_tar:
- return self.outtar_path
- else:
- return self.snapdir
- def capture_pbs_conf(self):
- """
- Capture pbs.conf file
- :returns: name of the output directory/tarfile containing the snapshot
- """
- # Capture pbs.conf
- self.logger.info("capturing pbs.conf")
- snap_confpath = os.path.join(self.snapdir, PBS_CONF_PATH)
- with open(snap_confpath, "w") as fd:
- for k, v in self.server.pbs_conf.items():
- fd.write(k + "=" + str(v) + "\n")
- if self.create_tar:
- self.__add_to_archive(snap_confpath)
- return self.outtar_path
- else:
- return self.snapdir
- def capture_system_info(self):
- """
- Capture system related information
- :returns: name of the output directory/tarfile containing the snapshot
- """
- self.logger.info("capturing system information")
- sudo_cmds = [PBS_PROBE_OUT, LSOF_PBS_OUT, DMESG_OUT]
- as_script_cmds = [PROCESS_INFO, LSOF_PBS_OUT]
- pbs_cmds = [PBS_PROBE_OUT, PBS_HOSTN_OUT]
- sudo = False
- host_platform = self.du.get_platform(self.primary_host)
- win_platform = False
- if host_platform.startswith("win"):
- win_platform = True
- # Capture information that's dependent on commands
- for (key, values) in self.sys_info.iteritems():
- (path, cmd_list) = values
- if cmd_list is None:
- continue
- # For Windows, only capture PBS commands
- if win_platform and (key not in pbs_cmds):
- continue
- cmd_list_cpy = list(cmd_list)
- # Find the full path to the command on the host
- if key in pbs_cmds:
- cmd_full = os.path.join(self.pbs_exec, cmd_list_cpy[0])
- else:
- cmd_full = self.du.which(self.primary_host, cmd_list_cpy[0])
- # du.which() returns the name of the command passed if
- # it can't find the command
- if cmd_full == cmd_list_cpy[0]:
- continue
- cmd_list_cpy[0] = cmd_full
- # Handle special commands
- if "pbs_hostn" in cmd_list_cpy[0]:
- # Append hostname to the command list
- cmd_list_cpy.append(self.primary_host)
- if key in as_script_cmds:
- as_script = True
- if key in sudo_cmds and self.with_sudo:
- # Because this cmd needs to be run in a script,
- # PTL run_cmd's sudo will try to run the script
- # itself with sudo, not the cmd
- # So, prefix the cmd with sudo instead
- cmd_list_cpy[0] = "sudo " + cmd_list_cpy[0]
- else:
- as_script = False
- if key in sudo_cmds:
- sudo = self.with_sudo
- snap_path = os.path.join(self.snapdir, path)
- self.__capture_cmd_output(snap_path, cmd_list_cpy,
- skip_anon=True, as_script=as_script,
- sudo=sudo)
- # Capture platform dependent information
- if win_platform:
- # Capture process information using tasklist command
- cmd = ["tasklist", "/v"]
- snap_path = os.path.join(self.snapdir, PROCESS_PATH)
- self.__capture_cmd_output(snap_path, cmd,
- sudo=self.with_sudo)
- # Capture OS/platform information
- self.logger.info("capturing OS information")
- snap_ospath = os.path.join(self.snapdir, OS_PATH)
- with open(snap_ospath, "w") as osfd:
- osinfo = self.du.get_os_info(self.primary_host)
- osfd.write(osinfo)
- if self.create_tar:
- self.__add_to_archive(snap_ospath)
- # Capture pbs_environment
- self.logger.info("capturing pbs_environment")
- snap_envpath = os.path.join(self.snapdir, PBS_ENV_PATH)
- if self.server.pbs_env is not None:
- with open(snap_envpath, "w") as envfd:
- for k, v in self.server.pbs_env.iteritems():
- envfd.write(k + "=" + v + "\n")
- if self.create_tar:
- self.__add_to_archive(snap_envpath)
- if self.create_tar:
- return self.outtar_path
- else:
- return self.snapdir
- def capture_pbs_logs(self):
- """
- Capture PBSPro logs from all relevant hosts
- :returns: name of the output directory/tarfile containing the snapshot
- """
- self.logger.info("capturing PBSPro logs")
- if self.num_daemon_logs > 0:
- # Capture server logs
- if self.server_info_avail:
- self.__capture_svr_logs()
- # Capture sched logs for all schedulers
- if self.sched_info_avail:
- if self.server_up:
- sched_info = self.server.status(SCHED)
- for sched in sched_info:
- sched_name = sched["id"]
- pbs_sched_log = sched["sched_log"]
- if sched_name != "default":
- snap_sched_log = DFLT_SCHED_LOGS_PATH + \
- "_" + sched["id"]
- else:
- snap_sched_log = DFLT_SCHED_LOGS_PATH
- snap_sched_log = os.path.join(self.snapdir,
- snap_sched_log)
- self.__capture_sched_logs(pbs_sched_log,
- snap_sched_log)
- else:
- # Capture the default sched's logs
- pbs_sched_log = os.path.join(self.pbs_home,
- "sched_logs")
- snap_sched_log = os.path.join(self.snapdir,
- DFLT_SCHED_LOGS_PATH)
- self.__capture_sched_logs(pbs_sched_log, snap_sched_log)
- # Capture mom & comm logs
- if self.mom_info_avail:
- self.__capture_mom_logs()
- if self.comm_info_avail:
- self.__capture_comm_logs()
- if self.num_acct_logs > 0:
- # Capture accounting logs
- self.__capture_acct_logs()
- if self.create_tar:
- return self.outtar_path
- else:
- return self.snapdir
- def capture_all(self):
- """
- Capture a snapshot from the PBS system
- :returns: name of the output directory/tarfile containing the snapshot
- """
- # Capture Server related information
- self.capture_server(with_svr_logs=True, with_acct_logs=True)
- # Capture scheduler information
- self.capture_scheduler(with_sched_logs=True)
- # Capture jobs related information
- self.capture_jobs()
- # Capture nodes related information
- self.capture_nodes(with_mom_logs=True)
- # Capture comm related information
- self.capture_comms(with_comm_logs=True)
- # Capture hooks related information
- self.capture_hooks()
- # Capture reservations related information
- self.capture_reservations()
- # Capture datastore related information
- self.capture_datastore(with_db_logs=True)
- # Capture pbs.conf
- self.capture_pbs_conf()
- # Capture system related information
- self.capture_system_info()
- if self.create_tar:
- return self.outtar_path
- else:
- return self.snapdir
- def finalize(self):
- """
- Capture some common information and perform cleanup
- """
- if self.finalized:
- # This function is non-reentrant
- # So just return if it's already been called once
- self.logger.debug("finalize() already called once, skipping it.")
- return
- self.finalized = True
- if self.anonymize:
- # Print out number of bad accounting records:
- if self.anon_obj.num_bad_acct_records > 0:
- self.logger.error("Number of bad accounting records found: " +
- str(self.anon_obj.num_bad_acct_records))
- if self.mapfile is not None:
- # Print out obfuscation map
- try:
- with open(self.mapfile, "w") as mapfd:
- mapfd.write(str(self.anon_obj))
- except Exception:
- self.logger.error("Error writing out the map file " +
- self.mapfile)
- # Record timestamp of the snapshot
- snap_ctimepath = os.path.join(self.snapdir, CTIME_PATH)
- with open(snap_ctimepath, "w") as ctimefd:
- ctimefd.write(str(self.server.ctime) + "\n")
- if self.create_tar:
- self.__add_to_archive(snap_ctimepath)
- # If the caller was pbs_snapshot, add its log file to the tarball
- if self.create_tar and self.log_path is not None:
- snap_logpath = os.path.join(self.snapdir, self.log_filename)
- self.__add_to_archive(snap_logpath, self.log_path)
- # Cleanup
- if self.create_tar:
- # Close the output tarfile
- self.outtar_fd.close()
- # Remove the snapshot directory
- self.du.rm(path=self.snapdir, recursive=True, force=True)
|