openshift_health_check.py 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
  1. """
  2. Ansible action plugin to execute health checks in OpenShift clusters.
  3. """
  4. import sys
  5. import os
  6. import traceback
  7. from collections import defaultdict
  8. from ansible.plugins.action import ActionBase
  9. from ansible.module_utils.six import string_types
  10. try:
  11. from __main__ import display
  12. except ImportError:
  13. # pylint: disable=ungrouped-imports; this is the standard way how to import
  14. # the default display object in Ansible action plugins.
  15. from ansible.utils.display import Display
  16. display = Display()
  17. # Augment sys.path so that we can import checks from a directory relative to
  18. # this callback plugin.
  19. sys.path.insert(1, os.path.dirname(os.path.dirname(__file__)))
  20. # pylint: disable=wrong-import-position; the import statement must come after
  21. # the manipulation of sys.path.
  22. from openshift_checks import OpenShiftCheck, OpenShiftCheckException, load_checks # noqa: E402
  23. class ActionModule(ActionBase):
  24. """Action plugin to execute health checks."""
  25. def run(self, tmp=None, task_vars=None):
  26. result = super(ActionModule, self).run(tmp, task_vars)
  27. task_vars = task_vars or {}
  28. # callback plugins cannot read Ansible vars, but we would like
  29. # zz_failure_summary to have access to certain values. We do so by
  30. # storing the information we need in the result.
  31. result['playbook_context'] = task_vars.get('r_openshift_health_checker_playbook_context')
  32. try:
  33. known_checks = self.load_known_checks(tmp, task_vars)
  34. args = self._task.args
  35. requested_checks = normalize(args.get('checks', []))
  36. if not requested_checks:
  37. result['failed'] = True
  38. result['msg'] = list_known_checks(known_checks)
  39. return result
  40. resolved_checks = resolve_checks(requested_checks, known_checks.values())
  41. except OpenShiftCheckException as exc:
  42. result["failed"] = True
  43. result["msg"] = str(exc)
  44. return result
  45. if "openshift" not in task_vars:
  46. result["failed"] = True
  47. result["msg"] = "'openshift' is undefined, did 'openshift_facts' run?"
  48. return result
  49. result["checks"] = check_results = {}
  50. user_disabled_checks = normalize(task_vars.get('openshift_disable_check', []))
  51. for name in resolved_checks:
  52. display.banner("CHECK [{} : {}]".format(name, task_vars["ansible_host"]))
  53. check = known_checks[name]
  54. check_results[name] = run_check(name, check, user_disabled_checks)
  55. if check.changed:
  56. check_results[name]["changed"] = True
  57. result["changed"] = any(r.get("changed") for r in check_results.values())
  58. if any(r.get("failed") for r in check_results.values()):
  59. result["failed"] = True
  60. result["msg"] = "One or more checks failed"
  61. return result
  62. def load_known_checks(self, tmp, task_vars):
  63. """Find all existing checks and return a mapping of names to instances."""
  64. load_checks()
  65. known_checks = {}
  66. for cls in OpenShiftCheck.subclasses():
  67. name = cls.name
  68. if name in known_checks:
  69. other_cls = known_checks[name].__class__
  70. raise OpenShiftCheckException(
  71. "duplicate check name '{}' in: '{}' and '{}'"
  72. "".format(name, full_class_name(cls), full_class_name(other_cls))
  73. )
  74. known_checks[name] = cls(execute_module=self._execute_module, tmp=tmp, task_vars=task_vars)
  75. return known_checks
  76. def list_known_checks(known_checks):
  77. """Return text listing the existing checks and tags."""
  78. # TODO: we could include a description of each check by taking it from a
  79. # check class attribute (e.g., __doc__) when building the message below.
  80. msg = (
  81. 'This playbook is meant to run health checks, but no checks were '
  82. 'requested. Set the `openshift_checks` variable to a comma-separated '
  83. 'list of check names or a YAML list. Available checks:\n {}'
  84. ).format('\n '.join(sorted(known_checks)))
  85. tags = describe_tags(known_checks.values())
  86. msg += (
  87. '\n\nTags can be used as a shortcut to select multiple '
  88. 'checks. Available tags and the checks they select:\n {}'
  89. ).format('\n '.join(tags))
  90. return msg
  91. def describe_tags(check_classes):
  92. """Return a sorted list of strings describing tags and the checks they include."""
  93. tag_checks = defaultdict(list)
  94. for cls in check_classes:
  95. for tag in cls.tags:
  96. tag_checks[tag].append(cls.name)
  97. tags = [
  98. '@{} = {}'.format(tag, ','.join(sorted(checks)))
  99. for tag, checks in tag_checks.items()
  100. ]
  101. return sorted(tags)
  102. def resolve_checks(names, all_checks):
  103. """Returns a set of resolved check names.
  104. Resolving a check name expands tag references (e.g., "@tag") to all the
  105. checks that contain the given tag. OpenShiftCheckException is raised if
  106. names contains an unknown check or tag name.
  107. names should be a sequence of strings.
  108. all_checks should be a sequence of check classes/instances.
  109. """
  110. known_check_names = set(check.name for check in all_checks)
  111. known_tag_names = set(name for check in all_checks for name in check.tags)
  112. check_names = set(name for name in names if not name.startswith('@'))
  113. tag_names = set(name[1:] for name in names if name.startswith('@'))
  114. unknown_check_names = check_names - known_check_names
  115. unknown_tag_names = tag_names - known_tag_names
  116. if unknown_check_names or unknown_tag_names:
  117. msg = []
  118. if unknown_check_names:
  119. msg.append('Unknown check names: {}.'.format(', '.join(sorted(unknown_check_names))))
  120. if unknown_tag_names:
  121. msg.append('Unknown tag names: {}.'.format(', '.join(sorted(unknown_tag_names))))
  122. msg.append('Make sure there is no typo in the playbook and no files are missing.')
  123. # TODO: implement a "Did you mean ...?" when the input is similar to a
  124. # valid check or tag.
  125. msg.append('Known checks:')
  126. msg.append(' {}'.format('\n '.join(sorted(known_check_names))))
  127. msg.append('Known tags:')
  128. msg.append(' {}'.format('\n '.join(describe_tags(all_checks))))
  129. raise OpenShiftCheckException('\n'.join(msg))
  130. tag_to_checks = defaultdict(set)
  131. for check in all_checks:
  132. for tag in check.tags:
  133. tag_to_checks[tag].add(check.name)
  134. resolved = check_names.copy()
  135. for tag in tag_names:
  136. resolved.update(tag_to_checks[tag])
  137. return resolved
  138. def normalize(checks):
  139. """Return a clean list of check names.
  140. The input may be a comma-separated string or a sequence. Leading and
  141. trailing whitespace characters are removed. Empty items are discarded.
  142. """
  143. if isinstance(checks, string_types):
  144. checks = checks.split(',')
  145. return [name.strip() for name in checks if name.strip()]
  146. def run_check(name, check, user_disabled_checks):
  147. """Run a single check if enabled and return a result dict."""
  148. if name in user_disabled_checks or '*' in user_disabled_checks:
  149. return dict(skipped=True, skipped_reason="Disabled by user request")
  150. # pylint: disable=broad-except; capturing exceptions broadly is intentional,
  151. # to isolate arbitrary failures in one check from others.
  152. try:
  153. is_active = check.is_active()
  154. except Exception as exc:
  155. reason = "Could not determine if check should be run, exception: {}".format(exc)
  156. return dict(skipped=True, skipped_reason=reason, exception=traceback.format_exc())
  157. if not is_active:
  158. return dict(skipped=True, skipped_reason="Not active for this host")
  159. try:
  160. return check.run()
  161. except OpenShiftCheckException as exc:
  162. return dict(failed=True, msg=str(exc))
  163. except Exception as exc:
  164. return dict(failed=True, msg=str(exc), exception=traceback.format_exc())
  165. def full_class_name(cls):
  166. """Return the name of a class prefixed with its module name."""
  167. return '{}.{}'.format(cls.__module__, cls.__name__)