#!/usr/bin/env python
# coding: utf-8
# Copyright (C) 1994-2018 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.

import getopt
import sys
import os
import traceback
import logging
import logging.config
import time
import errno

import ptl
from ptl.utils.pbs_logutils import PBSLogUtils, PBSLogAnalyzer
from ptl.utils.pbs_cliutils import CliUtils
from ptl.utils.plugins.ptl_test_db import PTLTestDb
from ptl.lib.pbs_testlib import PtlConfig


# trap SIGINT and SIGPIPE
def trap_exceptions(etype, value, tb):
    sys.excepthook = sys.__excepthook__
    if issubclass(etype, KeyboardInterrupt):
        pass
    elif issubclass(etype, IOError) and value.errno == errno.EPIPE:
        pass
    else:
        sys.__excepthook__(etype, value, tb)


sys.excepthook = trap_exceptions


def usage():
    msg = []
    msg += ['Usage: ' + os.path.basename(sys.argv[0]).split('.pyc')[0]]
    msg += [' [OPTION]\n\n']
    msg += ['  Analyze PBS logs and return various throughput metrics\n\n']
    msg += ['-a <acctlog>: path to accounting log file/dir to analyze\n']
    msg += ['-b: process log from corresponding begin/start time\n']
    msg += ['    format: %m/%d/%Y %H:%M:%S\n']
    msg += ['-c: output cycle summary\n']
    msg += ['-d <diag>: path to a pbs_diag directory\n']
    msg += ['-e: process log up to corresponding end time\n']
    msg += ['    format: %m/%d/%Y %H:%M:%S\n']
    msg += ['-f <log>: generic log file for analysis\n']
    msg += ['-h: display usage information\n']
    msg += ['-t <hostname>: hostname. Defaults to FQDN local hostname\n']
    msg += ['-l <schedlog>: path to scheduler log file/dir to analyze\n']
    msg += ['-m <momlog>: path to mom log file/dir to analyze\n']
    msg += ['-s <serverlog>: path to server log file/dir to analyze\n']
    msg += ['-S: show per job scheduling details, time to '
            'run/discard/calendar\n']
    msg += ['-U: show utilization. Requires paths to jobs and nodes info\n']
    msg += ['--estimated-info: show job start time estimate info. '
            'Requires scheduler log(s)\n']
    msg += ['--estimated-info-only: write only estimated info to the DB.'
            ' Requires --db-out\n']
    msg += ['--last-week: analyze logs of the last 7 days\n']
    msg += ['--last-month: analyze logs of the last month\n']
    msg += ['--re-interval=<regexp>: report time interval between '
            'occurrences of regexp\n']
    msg += ['--re-frequency=<seconds>: report frequency of occurrences of '
            'the re-interval\n']
    msg += ['    expression for every <seconds>\n']
    msg += ['--silent: do not display progress bar. Defaults to False\n']
    msg += ['--log-conf=<file>: logging config file\n']
    msg += ['--nodes-file=<path>: path to file with output of pbsnodes -av\n']
    msg += ['--jobs-file=<path>: path to file with output of qstat -f\n']
    msg += ['--db-out=<file>: send results to db file\n']
    msg += ['--db-type=<type>: database type\n']
    msg += ['--db-access=<path>: Path to a file that defines db options '
            '(PostgreSQL only)\n']
    msg += ['--version: print version number and exit\n']

    print "".join(msg)
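

# Example invocations (the paths below are placeholders, not defaults):
#   pbs_loganalyzer -l /path/to/sched_logs -c
#   pbs_loganalyzer -a /path/to/acct_logs -U \
#       --nodes-file=/path/to/pbsnodes_va.out --jobs-file=/path/to/qstat_f.out
#   pbs_loganalyzer -l /path/to/sched_logs --estimated-info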


if __name__ == '__main__':

    if len(sys.argv) < 2:
        usage()
        sys.exit(0)

    diag = None
    schedulerlog = None
    serverlog = None
    momlog = None
    acctlog = None
    genericlog = None
    hostname = None
    sj = False
    compact = False
    begin = None
    end = None
    cyclesummary = False
    nodesfile = None
    jobsfile = None
    utilization = None
    silent = False
    logconf = None
    estimated_info = False
    estimated_info_only = False
    dbout = None
    dbtype = None
    dbaccess = None
    re_interval = None
    re_frequency = None
    re_conditional = None
    json_on = False
    level = logging.FATAL

    logutils = PBSLogUtils()
    dbutils = PTLTestDb()

    try:
        shortopt = "a:b:d:e:f:t:l:L:s:m:cShU"
        longopt = ["nodes-file=", "jobs-file=", "version", "log-conf=",
                   "estimated-info", "db-out=", "json", "re-interval=",
                   "re-frequency=", "last-week", "last-month",
                   "re-conditional=", "estimated-info-only", "silent",
                   "db-type=", "db-access="]
        opts, args = getopt.getopt(sys.argv[1:], shortopt, longopt)
    except:
        usage()
        sys.exit(1)
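
    # Map each command-line flag onto the corresponding analysis setting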
    for o, val in opts:
        if o == '-a':
            acctlog = CliUtils.expand_abs_path(val)
        elif o == '-b':
            try:
                begin = logutils.convert_date_time(val)
            except:
                print('Error converting time, expected format '
                      '%m/%d/%Y %H:%M:%S')
                sys.exit(1)
        elif o == '-e':
            try:
                end = logutils.convert_date_time(val)
            except:
                print('Error converting time, expected format '
                      '%m/%d/%Y %H:%M:%S')
                traceback.print_exc()
                sys.exit(1)
        elif o == '-d':
            diag = CliUtils.expand_abs_path(val)
        elif o == '-f':
            genericlog = CliUtils.expand_abs_path(val)
        elif o == '-t':
            hostname = val
        elif o == '-l':
            schedulerlog = CliUtils.expand_abs_path(val)
        elif o == '-s':
            serverlog = CliUtils.expand_abs_path(val)
        elif o == '-m':
            momlog = CliUtils.expand_abs_path(val)
        elif o == '-c':
            cyclesummary = True
        elif o == '-C':
            compact = True
        elif o == '-L':
            level = CliUtils.get_logging_level(val)
        elif o == '-S':
            sj = True
        elif o == '-U':
            utilization = True
        elif o == '--db-out':
            dbout = CliUtils.expand_abs_path(val)
        elif o == '--db-type':
            dbtype = val
        elif o == '--db-access':
            dbaccess = CliUtils.expand_abs_path(val)
        elif o == '--estimated-info':
            estimated_info = True
        elif o == '--estimated-info-only':
            estimated_info_only = True
        elif o == '--json':
            json_on = True
        elif o == '--last-week':
            s = time.localtime(time.time() - (7 * 24 * 3600))
            begin = int(time.mktime(time.strptime(
                time.strftime("%m/%d/%Y", s), "%m/%d/%Y")))
            end = int(time.time())
        elif o == '--last-month':
            s = time.localtime(time.time() - (30 * 24 * 3600))
            begin = int(time.mktime(time.strptime(
                time.strftime("%m/%d/%Y", s), "%m/%d/%Y")))
            end = int(time.time())
        elif o == '--log-conf':
            logconf = CliUtils.expand_abs_path(val)
        elif o == '--nodes-file':
            nodesfile = CliUtils.expand_abs_path(val)
        elif o == '--jobs-file':
            jobsfile = CliUtils.expand_abs_path(val)
        elif o == '--re-conditional':
            re_conditional = eval(val, {}, {})
        elif o == '--re-interval':
            re_interval = val
        elif o == '--silent':
            silent = True
        elif o == '--re-frequency':
            re_frequency = int(val)
        elif o == '--version':
            print ptl.__version__
            sys.exit(0)
        elif o == '-h':
            usage()
            sys.exit(0)
        else:
            sys.stderr.write("Unrecognized option " + o)
            usage()
            sys.exit(1)
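
    # Configure logging from a config file (--log-conf) when given, otherwise
    # at the level requested with -L (FATAL by default)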
    if logconf:
        logging.config.fileConfig(logconf)
    else:
        logging.basicConfig(level=level)

    PtlConfig()
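
    # A pbs_diag snapshot may already contain the pbsnodes and qstat output
    # needed for utilization reporting; pick those up unless they were
    # passed explicitly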
    if diag:
        if nodesfile is None:
            if os.path.isfile(os.path.join(diag, 'pbsnodes_va.out')):
                nodesfile = os.path.join(diag, 'pbsnodes_va.out')
        if jobsfile is None:
            if os.path.isfile(os.path.join(diag, 'qstat_f.out')):
                jobsfile = os.path.join(diag, 'qstat_f.out')
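
    # Custom regex matching (--re-interval/--re-conditional) operates on a
    # generic log: if none was given with -f, reuse whichever specific log
    # was supplied and analyze it as a generic log instead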
    if ((re_interval is not None or re_conditional is not None) and
            genericlog is None):
        if schedulerlog is not None:
            genericlog = schedulerlog
            schedulerlog = None
        elif serverlog is not None:
            genericlog = serverlog
            serverlog = None
        elif momlog is not None:
            genericlog = momlog
            momlog = None
        elif acctlog is not None:
            genericlog = acctlog
            acctlog = None
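
    # Build the analyzer over whichever logs were supplied; a progress bar
    # is shown unless --silent was given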
    show_progress = not silent
    pla = PBSLogAnalyzer(schedulerlog, serverlog, momlog, acctlog,
                         genericlog, hostname, show_progress)
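
    # Utilization is computed from the accounting log together with node and
    # job snapshots (pbsnodes -av / qstat -f output)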
    if utilization:
        if acctlog is None:
            logging.error("Accounting log is required to compute utilization")
            sys.exit(1)
        pla.accounting.enable_utilization_parsing(hostname, nodesfile,
                                                  jobsfile)

    if re_interval is not None:
        pla.set_custom_match(re_interval, re_frequency)

    if re_conditional is not None:
        pla.set_conditional_match(re_conditional)
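
    # Estimated start time information is parsed out of the scheduler log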
    if estimated_info or estimated_info_only:
        if schedulerlog is None:
            logging.error("Scheduler log is required for estimated start "
                          "time analysis")
            sys.exit(1)
        pla.scheduler.estimated_parsing_enabled = True
        if estimated_info_only:
            pla.scheduler.parse_estimated_only = True
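
    # Run the analysis over the requested time window and collect the
    # per-log metrics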
    info = pla.analyze_logs(start=begin, end=end, showjob=sj)

    if genericlog:
        dbutils.process_output(pla.info)

    # Drift analysis and custom regex matching require additional
    # post-processing and can't currently be passed through to JSON
    if json_on:
        if cyclesummary:
            info['scheduler'] = info['scheduler']['summary']
        print CliUtils.__json__(info)
        sys.exit(0)
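
    # Hand each analyzed log's metrics to the PTL test DB utilities;
    # --db-out, --db-type and --db-access control the destination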
    if acctlog:
        dbutils.process_output(info['accounting'], dbout, dbtype, dbaccess,
                               name=acctlog, logtype='accounting')

    if schedulerlog:
        dbutils.process_output(info['scheduler'], dbout, dbtype, dbaccess,
                               name=schedulerlog, logtype='scheduler',
                               summary=cyclesummary)

    if serverlog:
        dbutils.process_output(info['server'], dbout, dbtype, dbaccess,
                               name=serverlog, logtype='server')

    if momlog:
        dbutils.process_output(info['mom'], dbout, dbtype, dbaccess,
                               name=momlog, logtype='mom')