openshift_health_check.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. """
  2. Ansible action plugin to execute health checks in OpenShift clusters.
  3. """
  4. import sys
  5. import os
  6. import base64
  7. import traceback
  8. import errno
  9. import json
  10. from collections import defaultdict
  11. from ansible.plugins.action import ActionBase
  12. from ansible.module_utils.six import string_types
  13. try:
  14. from __main__ import display
  15. except ImportError:
  16. # pylint: disable=ungrouped-imports; this is the standard way how to import
  17. # the default display object in Ansible action plugins.
  18. from ansible.utils.display import Display
  19. display = Display()
  20. # Augment sys.path so that we can import checks from a directory relative to
  21. # this callback plugin.
  22. sys.path.insert(1, os.path.dirname(os.path.dirname(__file__)))
  23. # pylint: disable=wrong-import-position; the import statement must come after
  24. # the manipulation of sys.path.
  25. from openshift_checks import OpenShiftCheck, OpenShiftCheckException, load_checks # noqa: E402
class ActionModule(ActionBase):
    """Action plugin to execute health checks."""

    def run(self, tmp=None, task_vars=None):
        """Entry point called by Ansible for each host.

        Loads all known checks, resolves the user-requested check names/tags,
        runs each resolved check, and aggregates the per-check results into
        the task result dict that Ansible reports.

        tmp: temporary directory path supplied by Ansible (passed through to
            check instances).
        task_vars: the Ansible variables for this host; may be None.
        Returns the task result dict (keys: failed, msg, checks, changed, ...).
        """
        result = super(ActionModule, self).run(tmp, task_vars)
        task_vars = task_vars or {}

        # callback plugins cannot read Ansible vars, but we would like
        # zz_failure_summary to have access to certain values. We do so by
        # storing the information we need in the result.
        result['playbook_context'] = task_vars.get('r_openshift_health_checker_playbook_context')

        # if the user wants to write check results to files, they provide this directory:
        output_dir = task_vars.get("openshift_checks_output_dir")
        if output_dir:
            # each host gets its own subdirectory so results do not collide
            output_dir = os.path.join(output_dir, task_vars["ansible_host"])

        try:
            known_checks = self.load_known_checks(tmp, task_vars, output_dir)
            args = self._task.args
            requested_checks = normalize(args.get('checks', []))

            if not requested_checks:
                # no checks requested: fail with a message listing what exists
                result['failed'] = True
                result['msg'] = list_known_checks(known_checks)
                return result

            resolved_checks = resolve_checks(requested_checks, known_checks.values())
        except OpenShiftCheckException as exc:
            # load/resolve problems (duplicate names, unknown checks/tags)
            result["failed"] = True
            result["msg"] = str(exc)
            return result

        if "openshift" not in task_vars:
            # checks rely on facts gathered by the openshift_facts module
            result["failed"] = True
            result["msg"] = "'openshift' is undefined, did 'openshift_facts' run?"
            return result

        result["checks"] = check_results = {}

        user_disabled_checks = normalize(task_vars.get('openshift_disable_check', []))
        # Automatically add docker_storage if only CRIO is used, as docker service would be stopped
        if task_vars.get('openshift_use_crio_only'):
            user_disabled_checks.append('docker_storage')

        for name in resolved_checks:
            display.banner("CHECK [{} : {}]".format(name, task_vars["ansible_host"]))
            check_results[name] = run_check(name, known_checks[name], user_disabled_checks, output_dir)

        # roll per-check changed/failed flags up into the task-level result
        result["changed"] = any(r.get("changed") for r in check_results.values())
        if any(r.get("failed") for r in check_results.values()):
            result["failed"] = True
            result["msg"] = "One or more checks failed"

        write_result_to_output_dir(output_dir, result)
        return result

    def load_known_checks(self, tmp, task_vars, output_dir=None):
        """Find all existing checks and return a mapping of names to instances.

        Raises OpenShiftCheckException if two check classes declare the same
        name, since the name is the lookup key.
        """
        load_checks()
        # checks only gather/save full results when there is somewhere to write them
        want_full_results = bool(output_dir)

        known_checks = {}
        for cls in OpenShiftCheck.subclasses():
            name = cls.name
            if name in known_checks:
                other_cls = known_checks[name].__class__
                raise OpenShiftCheckException(
                    "duplicate check name '{}' in: '{}' and '{}'"
                    "".format(name, full_class_name(cls), full_class_name(other_cls))
                )
            known_checks[name] = cls(
                execute_module=self._execute_module,
                tmp=tmp,
                task_vars=task_vars,
                want_full_results=want_full_results,
                templar=self._templar
            )

        return known_checks
  91. def list_known_checks(known_checks):
  92. """Return text listing the existing checks and tags."""
  93. # TODO: we could include a description of each check by taking it from a
  94. # check class attribute (e.g., __doc__) when building the message below.
  95. msg = (
  96. 'This playbook is meant to run health checks, but no checks were '
  97. 'requested. Set the `openshift_checks` variable to a comma-separated '
  98. 'list of check names or a YAML list. Available checks:\n {}'
  99. ).format('\n '.join(sorted(known_checks)))
  100. tags = describe_tags(known_checks.values())
  101. msg += (
  102. '\n\nTags can be used as a shortcut to select multiple '
  103. 'checks. Available tags and the checks they select:\n {}'
  104. ).format('\n '.join(tags))
  105. return msg
  106. def describe_tags(check_classes):
  107. """Return a sorted list of strings describing tags and the checks they include."""
  108. tag_checks = defaultdict(list)
  109. for cls in check_classes:
  110. for tag in cls.tags:
  111. tag_checks[tag].append(cls.name)
  112. tags = [
  113. '@{} = {}'.format(tag, ','.join(sorted(checks)))
  114. for tag, checks in tag_checks.items()
  115. ]
  116. return sorted(tags)
  117. def resolve_checks(names, all_checks):
  118. """Returns a set of resolved check names.
  119. Resolving a check name expands tag references (e.g., "@tag") to all the
  120. checks that contain the given tag. OpenShiftCheckException is raised if
  121. names contains an unknown check or tag name.
  122. names should be a sequence of strings.
  123. all_checks should be a sequence of check classes/instances.
  124. """
  125. known_check_names = set(check.name for check in all_checks)
  126. known_tag_names = set(name for check in all_checks for name in check.tags)
  127. check_names = set(name for name in names if not name.startswith('@'))
  128. tag_names = set(name[1:] for name in names if name.startswith('@'))
  129. unknown_check_names = check_names - known_check_names
  130. unknown_tag_names = tag_names - known_tag_names
  131. if unknown_check_names or unknown_tag_names:
  132. msg = []
  133. if unknown_check_names:
  134. msg.append('Unknown check names: {}.'.format(', '.join(sorted(unknown_check_names))))
  135. if unknown_tag_names:
  136. msg.append('Unknown tag names: {}.'.format(', '.join(sorted(unknown_tag_names))))
  137. msg.append('Make sure there is no typo in the playbook and no files are missing.')
  138. # TODO: implement a "Did you mean ...?" when the input is similar to a
  139. # valid check or tag.
  140. msg.append('Known checks:')
  141. msg.append(' {}'.format('\n '.join(sorted(known_check_names))))
  142. msg.append('Known tags:')
  143. msg.append(' {}'.format('\n '.join(describe_tags(all_checks))))
  144. raise OpenShiftCheckException('\n'.join(msg))
  145. tag_to_checks = defaultdict(set)
  146. for check in all_checks:
  147. for tag in check.tags:
  148. tag_to_checks[tag].add(check.name)
  149. resolved = check_names.copy()
  150. for tag in tag_names:
  151. resolved.update(tag_to_checks[tag])
  152. return resolved
  153. def normalize(checks):
  154. """Return a clean list of check names.
  155. The input may be a comma-separated string or a sequence. Leading and
  156. trailing whitespace characters are removed. Empty items are discarded.
  157. """
  158. if isinstance(checks, string_types):
  159. checks = checks.split(',')
  160. return [name.strip() for name in checks if name.strip()]
def run_check(name, check, user_disabled_checks, output_dir=None):
    """Run a single check if enabled and return a result dict.

    name: the check's registered name (also used for output file names).
    check: an OpenShiftCheck instance.
    user_disabled_checks: names the user disabled via openshift_disable_check;
        the special entry '*' disables every check.
    output_dir: optional per-host directory for failure/log files.
    Returns a dict with some of: skipped, skipped_reason, changed, failed,
    failures, msg, plus whatever the check's run() returned.
    """
    # determine if we're going to run the check (not inactive or disabled)
    if name in user_disabled_checks or '*' in user_disabled_checks:
        return dict(skipped=True, skipped_reason="Disabled by user request")

    # pylint: disable=broad-except; capturing exceptions broadly is intentional,
    # to isolate arbitrary failures in one check from others.
    try:
        is_active = check.is_active()
    except Exception as exc:
        # an error while deciding activity is reported as a skip, not a failure
        reason = "Could not determine if check should be run, exception: {}".format(exc)
        return dict(skipped=True, skipped_reason=reason, exception=traceback.format_exc())

    if not is_active:
        return dict(skipped=True, skipped_reason="Not active for this host")

    # run the check
    result = {}
    try:
        result = check.run()
    except OpenShiftCheckException as exc:
        # checks raise this for expected failure conditions; record on the check
        check.register_failure(exc)
    except Exception as exc:
        # unexpected failure: keep the traceback alongside the message
        check.register_failure("\n".join([str(exc), traceback.format_exc()]))

    # process the check state; compose the result hash, write files as needed
    if check.changed:
        result["changed"] = True
    if check.failures or result.get("failed"):
        if "msg" in result:  # failure result has msg; combine with any registered failures
            check.register_failure(result.get("msg"))
        result["failures"] = [(fail.name, str(fail)) for fail in check.failures]
        result["failed"] = True
        result["msg"] = "\n".join(str(fail) for fail in check.failures)
        write_to_output_file(output_dir, name + ".failures.json", result["failures"])
    if check.logs:
        write_to_output_file(output_dir, name + ".log.json", check.logs)
    if check.files_to_save:
        write_files_to_save(output_dir, check)

    return result
  198. def prepare_output_dir(dirname):
  199. """Create the directory, including parents. Return bool for success/failure."""
  200. try:
  201. os.makedirs(dirname)
  202. return True
  203. except OSError as exc:
  204. # trying to create existing dir leads to error;
  205. # that error is fine, but for any other, assume the dir is not there
  206. return exc.errno == errno.EEXIST
def copy_remote_file_to_dir(check, file_to_save, output_dir, fname):
    """Copy file from remote host to local file in output_dir, if given.

    check: OpenShiftCheck instance whose execute_module is used to slurp the
        remote file contents.
    file_to_save: path of the file on the remote host.
    output_dir: local directory to write into; silently does nothing when
        falsy or when the directory cannot be created (best-effort).
    fname: name of the local file to create within output_dir.
    """
    if not output_dir or not prepare_output_dir(output_dir):
        return
    local_file = os.path.join(output_dir, fname)

    # pylint: disable=broad-except; do not need to do anything about failure to write dir/file
    # and do not want exceptions to break anything.
    try:
        # NOTE: it would have been nice to copy the file directly without loading it into
        # memory, but there does not seem to be a good way to do this via ansible.
        result = check.execute_module("slurp", dict(src=file_to_save), register=False)
        if result.get("failed"):
            # remote read failed; warn and give up rather than erroring the play
            display.warning("Could not retrieve file {}: {}".format(file_to_save, result.get("msg")))
            return

        content = result["content"]
        # slurp normally base64-encodes content; decode to raw bytes before writing
        if result.get("encoding") == "base64":
            content = base64.b64decode(content)
        with open(local_file, "wb") as outfile:
            outfile.write(content)
    except Exception as exc:
        display.warning("Failed writing remote {} to local {}: {}".format(file_to_save, local_file, exc))
    return
  229. def _no_fail(obj):
  230. # pylint: disable=broad-except; do not want serialization to fail for any reason
  231. try:
  232. return str(obj)
  233. except Exception:
  234. return "[not serializable]"
  235. def write_to_output_file(output_dir, filename, data):
  236. """If output_dir provided, write data to file. Serialize as JSON if data is not a string."""
  237. if not output_dir or not prepare_output_dir(output_dir):
  238. return
  239. filename = os.path.join(output_dir, filename)
  240. try:
  241. with open(filename, 'w') as outfile:
  242. if isinstance(data, string_types):
  243. outfile.write(data)
  244. else:
  245. json.dump(data, outfile, sort_keys=True, indent=4, default=_no_fail)
  246. # pylint: disable=broad-except; do not want serialization/write to break for any reason
  247. except Exception as exc:
  248. display.warning("Could not write output file {}: {}".format(filename, exc))
  249. def write_result_to_output_dir(output_dir, result):
  250. """If output_dir provided, write the result as json to result.json.
  251. Success/failure of the write is recorded as "output_files" in the result hash afterward.
  252. Otherwise this is much like write_to_output_file.
  253. """
  254. if not output_dir:
  255. return
  256. if not prepare_output_dir(output_dir):
  257. result["output_files"] = "Error creating output directory " + output_dir
  258. return
  259. filename = os.path.join(output_dir, "result.json")
  260. try:
  261. with open(filename, 'w') as outfile:
  262. json.dump(result, outfile, sort_keys=True, indent=4, default=_no_fail)
  263. result["output_files"] = "Check results for this host written to " + filename
  264. # pylint: disable=broad-except; do not want serialization/write to break for any reason
  265. except Exception as exc:
  266. result["output_files"] = "Error writing check results to {}:\n{}".format(filename, exc)
def write_files_to_save(output_dir, check):
    """Write files to check subdir in output dir.

    Each entry in check.files_to_save is either copied from the remote host
    (when it has a remote_filename) or written from its in-memory contents.
    """
    if not output_dir:
        return
    # group each check's files under a subdirectory named after the check
    output_dir = os.path.join(output_dir, check.name)
    seen_file = defaultdict(lambda: 0)
    for file_to_save in check.files_to_save:
        fname = file_to_save.filename
        while seen_file[fname]:  # just to be sure we never re-write a file, append numbers as needed
            # NOTE: the counter is bumped before it is appended, so the first
            # duplicate of "x" becomes "x.2" (there is never an "x.1").
            seen_file[fname] += 1
            fname = "{}.{}".format(fname, seen_file[fname])
        seen_file[fname] += 1
        if file_to_save.remote_filename:
            copy_remote_file_to_dir(check, file_to_save.remote_filename, output_dir, fname)
        else:
            write_to_output_file(output_dir, fname, file_to_save.contents)
  283. def full_class_name(cls):
  284. """Return the name of a class prefixed with its module name."""
  285. return '{}.{}'.format(cls.__module__, cls.__name__)