pbs_snapshot 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. # Copyright (C) 1994-2018 Altair Engineering, Inc.
  4. # For more information, contact Altair at www.altair.com.
  5. #
  6. # This file is part of the PBS Professional ("PBS Pro") software.
  7. #
  8. # Open Source License Information:
  9. #
  10. # PBS Pro is free software. You can redistribute it and/or modify it under the
  11. # terms of the GNU Affero General Public License as published by the Free
  12. # Software Foundation, either version 3 of the License, or (at your option) any
  13. # later version.
  14. #
  15. # PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
  16. # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17. # FOR A PARTICULAR PURPOSE.
  18. # See the GNU Affero General Public License for more details.
  19. #
  20. # You should have received a copy of the GNU Affero General Public License
  21. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  22. #
  23. # Commercial License Information:
  24. #
  25. # For a copy of the commercial license terms and conditions,
  26. # go to: (http://www.pbspro.com/UserArea/agreement.html)
  27. # or contact the Altair Legal Department.
  28. #
  29. # Altair’s dual-license business model allows companies, individuals, and
  30. # organizations to create proprietary derivative works of PBS Pro and
  31. # distribute them - whether embedded or bundled with other software -
  32. # under a commercial license agreement.
  33. #
  34. # Use of Altair’s trademarks, including but not limited to "PBS™",
  35. # "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
  36. # trademark licensing policies.
  37. import os
  38. import sys
  39. import getopt
  40. import errno
  41. import logging
  42. import ptl
  43. import time
  44. import tarfile
  45. from getopt import GetoptError
  46. from threading import Thread
  47. from ptl.lib.pbs_testlib import PtlConfig
  48. from ptl.utils.pbs_snaputils import PBSSnapUtils
  49. from ptl.utils.pbs_cliutils import CliUtils
  50. from ptl.utils.pbs_dshutils import DshUtils
  51. def trap_exceptions(etype, value, tb):
  52. """
  53. Trap SIGINT and SIGPIPE
  54. """
  55. # This is done so that any exceptions created by this method itself
  56. # are caught by the default excepthook to prevent endless recursion
  57. sys.excepthook = sys.__excepthook__
  58. if issubclass(etype, KeyboardInterrupt):
  59. pass
  60. elif issubclass(etype, IOError) and value.errno == errno.EPIPE:
  61. pass
  62. else:
  63. sys.__excepthook__(etype, value, tb)
  64. # Set sys.excepthook back to trap_exceptions to catch future exceptions
  65. sys.excepthook = trap_exceptions
  66. sys.excepthook = trap_exceptions
  67. def usage():
  68. msg = """
  69. Usage: pbs_snapshot -o <path to output tar file> [OPTION]
  70. Take snapshot of a PBS system and optionally capture logs for diagnostics
  71. -H <hostname> primary hostname to operate on
  72. Defaults to local host
  73. -l <loglevel> set log level to one of INFO, INFOCLI,
  74. INFOCLI2, DEBUG, DEBUG2, WARNING, ERROR
  75. or FATAL
  76. -h, --help display this usage message
  77. --daemon-logs=<num days> number of daemon logs to collect
  78. --accounting-logs=<num days> number of accounting logs to collect
  79. --additional-hosts=<hostname> collect data from additional hosts
  80. 'hostname' is a comma separated list
  81. --map=<file> file to store the map of obfuscated data
  82. --obfuscate obfuscates sensitive data
  83. --with-sudo Uses sudo to capture privileged data
  84. --version print version number and exit
  85. """
  86. print msg
  87. def childsnap_thread(logger, host):
  88. """
  89. Thread routine for each child snapshot being run on a remote host
  90. :param logger - Logging object
  91. :type logger - logging.Logger
  92. :param host - the hostname for remote host
  93. :type host - str
  94. """
  95. logger.info("Capturing snapshot from host %s" % (host))
  96. du = DshUtils()
  97. # Get path to pbs_snapshot on remote host
  98. host_pbsconf = du.parse_pbs_config(hostname=host)
  99. try:
  100. pbs_exec_path = host_pbsconf["PBS_EXEC"]
  101. except KeyError:
  102. logger.error("Couldn't find PBS_EXEC on host %s"
  103. ", won't capture snapshot on this host" % (
  104. host))
  105. return
  106. host_pbssnappath = os.path.join(pbs_exec_path, "sbin",
  107. "pbs_snapshot")
  108. # Create a directory on the remote host with a unique name
  109. # We will create the snapshot here
  110. timestamp = str(int(time.time()))
  111. snap_home = "host_" + timestamp
  112. du.mkdir(hostname=host, path=snap_home)
  113. # Run pbs_snapshot on the remote host
  114. cmd = [host_pbssnappath, "-o", snap_home,
  115. "--daemon-logs=" + str(daemon_logs),
  116. "--accounting-logs=" + str(acct_logs)]
  117. if anonymize:
  118. cmd.extend(["--obfuscate", "--map=" + map_file])
  119. if with_sudo:
  120. cmd.append("--with-sudo")
  121. ret = du.run_cmd(hosts=host, cmd=cmd, logerr=False)
  122. if ret['rc'] != 0:
  123. logger.error("Error capturing snapshot from host %s" % (host))
  124. print ret['err']
  125. return
  126. # Get the snapshot tar filename from stdout
  127. child_stdout = ret['out'][-1]
  128. snaptarname = child_stdout.split("Snapshot available at: ")[1]
  129. # Copy over the snapshot tar file as <hostname>_snapshot.tgz
  130. dest_path = os.path.join(out_dir, host + "_snapshot.tgz")
  131. src_path = host + ":" + snaptarname
  132. ret = du.run_copy(src=src_path, dest=dest_path)
  133. if ret['rc'] != 0:
  134. logger.error("Error copying child snapshot from host %s" % (host))
  135. # Copy over map file if any as 'host_<map filename>'
  136. if map_file is not None:
  137. dest_path = os.path.join(out_dir, host + "_" + map_file)
  138. src_path = os.path.join(snap_home, map_file)
  139. src_path = host + ":" + src_path
  140. ret = du.run_copy(src=src_path, dest=dest_path)
  141. if ret['rc'] != 0:
  142. logger.error("Error copying map file from host %s" % (host))
  143. # Delete the snapshot home from remote host
  144. du.rm(hostname=host, path=snap_home, recursive=True, force=True)
  145. if __name__ == '__main__':
  146. # Arguments to PBSSnapUtils
  147. out_dir = None
  148. primary_host = None
  149. log_level = "INFOCLI2"
  150. acct_logs = 30 # Capture 30 days of accounting logs by default
  151. daemon_logs = 5 # Capture 5 days of daemon logs by default
  152. additional_hosts = None
  153. map_file = None
  154. anonymize = False
  155. log_file = "pbs_snapshot.log"
  156. with_sudo = False
  157. PtlConfig()
  158. # Parse the options provided to pbs_snapshot
  159. try:
  160. sopt = "d:H:l:o:h"
  161. lopt = ["accounting-logs=", "daemon-logs=", "help",
  162. "additional-hosts=", "map=", "obfuscate", "with-sudo",
  163. "version"]
  164. opts, args = getopt.getopt(sys.argv[1:], sopt, lopt)
  165. except GetoptError:
  166. usage()
  167. sys.exit(1)
  168. for o, val in opts:
  169. if o == "-o":
  170. out_dir = val
  171. elif o == "-H":
  172. primary_host = val
  173. elif o == "-l":
  174. log_level = val
  175. elif o == "-h" or o == "--help":
  176. usage()
  177. sys.exit(0)
  178. elif o == "--accounting-logs":
  179. try:
  180. acct_logs = int(val)
  181. except ValueError:
  182. raise ValueError("Invalid value for --accounting-logs" +
  183. "option, should be an integer")
  184. elif o == "--daemon-logs":
  185. try:
  186. daemon_logs = int(val)
  187. except ValueError:
  188. raise ValueError("Invalid value for --daemon-logs" +
  189. "option, should be an integer")
  190. elif o == "--additional-hosts":
  191. additional_hosts = val
  192. elif o == "--map":
  193. map_file = val
  194. elif o == "--obfuscate":
  195. anonymize = True
  196. elif o == "--with-sudo":
  197. with_sudo = True
  198. elif o == "--version":
  199. print ptl.__version__
  200. sys.exit(0)
  201. else:
  202. sys.stderr.write("Unrecognized option")
  203. usage()
  204. sys.exit(1)
  205. # -o is a mandatory option, so make sure that it was provided
  206. if out_dir is None:
  207. sys.stderr.write("-o option not provided")
  208. usage()
  209. sys.exit(1)
  210. elif not os.path.isdir(out_dir):
  211. sys.stderr.write("-o path should exist,"
  212. " this is where the snapshot is captured")
  213. usage()
  214. sys.exit(1)
  215. fmt = '%(asctime)-15s %(levelname)-8s %(message)s'
  216. level_int = CliUtils.get_logging_level(log_level)
  217. log_path = os.path.join(out_dir, log_file)
  218. logging.basicConfig(filename=log_path, filemode='w+',
  219. level=level_int, format=fmt)
  220. stream_hdlr = logging.StreamHandler()
  221. stream_hdlr.setLevel(level_int)
  222. stream_hdlr.setFormatter(logging.Formatter(fmt))
  223. ptl_logger = logging.getLogger('ptl')
  224. ptl_logger.addHandler(stream_hdlr)
  225. ptl_logger.setLevel(level_int)
  226. if anonymize is True:
  227. # find the parent directory of the snapshot
  228. # This will be used to store the map file
  229. out_abspath = os.path.abspath(out_dir)
  230. if map_file is None:
  231. map_file = os.path.join(out_abspath, "obfuscate.map")
  232. if additional_hosts is not None:
  233. du = DshUtils()
  234. # Run child pbs_snapshot commands on those hosts
  235. hostnames = additional_hosts.split(",")
  236. childsnap_threads = {}
  237. for host in hostnames:
  238. thread = Thread(target=childsnap_thread, args=(
  239. ptl_logger, host))
  240. thread.start()
  241. childsnap_threads[host] = thread
  242. # Capture snapshot on the main host in the meantime
  243. with PBSSnapUtils(out_dir, primary_host=primary_host,
  244. acct_logs=acct_logs,
  245. daemon_logs=daemon_logs, map_file=map_file,
  246. anonymize=anonymize, create_tar=False,
  247. log_path=log_path,
  248. with_sudo=with_sudo) as snap_utils:
  249. main_snap = snap_utils.capture_all()
  250. # Let's reconcile the child snapshots
  251. for host, thread in childsnap_threads.iteritems():
  252. thread.join()
  253. host_snappath = os.path.join(out_dir, host + "_snapshot.tgz")
  254. if os.path.isfile(host_snappath):
  255. # Move the tar file to the main snapshot
  256. du.run_copy(src=host_snappath, dest=main_snap)
  257. # Remove the tar file
  258. du.rm(host_snappath, force=True)
  259. # Finally, create a tar of the whole snapshot
  260. outtar = main_snap + ".tgz"
  261. with tarfile.open(outtar, "w:gz") as tar:
  262. tar.add(main_snap, arcname=os.path.basename(main_snap))
  263. # Delete the snapshot directory itself
  264. du.rm(path=main_snap, recursive=True, force=True)
  265. else:
  266. # No additional hosts to capture
  267. with PBSSnapUtils(out_dir, primary_host=primary_host,
  268. acct_logs=acct_logs,
  269. daemon_logs=daemon_logs, map_file=map_file,
  270. anonymize=anonymize, create_tar=True,
  271. log_path=log_path,
  272. with_sudo=with_sudo) as snap_utils:
  273. outtar = snap_utils.capture_all()
  274. if outtar is not None:
  275. print "Snapshot available at: " + outtar