Selaa lähdekoodia

health checks: add diagnostics check

Also move the is_first_master method into the superclass for reuse, and
look at oo_first_master and ansible_host instead of masters and
ansible_ssh_host.
Luke Meyer 7 vuotta sitten
vanhempi
commit
9698f76b64

+ 5 - 6
roles/openshift_health_checker/library/ocutil.py

@@ -40,18 +40,17 @@ def main():
 
     module = AnsibleModule(
         argument_spec=dict(
-            namespace=dict(type="str", required=True),
+            namespace=dict(type="str", required=False),
             config_file=dict(type="str", required=True),
             cmd=dict(type="str", required=True),
             extra_args=dict(type="list", default=[]),
         ),
     )
 
-    cmd = [
-        locate_oc_binary(),
-        '--config', module.params["config_file"],
-        '-n', module.params["namespace"],
-    ] + shlex.split(module.params["cmd"])
+    cmd = [locate_oc_binary(), '--config', module.params["config_file"]]
+    if module.params["namespace"]:
+        cmd += ['-n', module.params["namespace"]]
+    cmd += shlex.split(module.params["cmd"]) + module.params["extra_args"]
 
     failed = True
     try:

+ 17 - 0
roles/openshift_health_checker/openshift_checks/__init__.py

@@ -13,6 +13,7 @@ from importlib import import_module
 
 from ansible.module_utils import six
 from ansible.module_utils.six.moves import reduce  # pylint: disable=import-error,redefined-builtin
+from ansible.module_utils.six import string_types
 from ansible.plugins.filter.core import to_bool as ansible_to_bool
 
 
@@ -110,6 +111,11 @@ class OpenShiftCheck(object):
         """Returns true if this check applies to the ansible-playbook run."""
         return True
 
+    def is_first_master(self):
+        """Determine if running on first master. Returns: bool"""
+        masters = self.get_var("groups", "oo_first_master", default=None) or [None]  # [None] when lookup is falsy
+        return masters[0] == self.get_var("ansible_host")  # first group entry vs. the "ansible_host" var
+
     @abstractmethod
     def run(self):
         """Executes a check against a host and returns a result hash similar to Ansible modules.
@@ -283,6 +289,17 @@ class OpenShiftCheck(object):
                 ))
 
     @staticmethod
+    def normalize(name_list):
+        """Return a clean list of names.
+
+        The input may be a comma-separated string or a sequence. Leading and
+        trailing whitespace characters are removed. Empty items are discarded.
+        """
+        if isinstance(name_list, string_types):  # string input is split on commas first
+            name_list = name_list.split(',')
+        return [name.strip() for name in name_list if name.strip()]  # whitespace-only items count as empty
+
+    @staticmethod
     def get_major_minor_version(openshift_image_tag):
         """Parse and return the deployed version of OpenShift as a tuple."""
         if openshift_image_tag and openshift_image_tag[0] == 'v':

+ 62 - 0
roles/openshift_health_checker/openshift_checks/diagnostics.py

@@ -0,0 +1,62 @@
+"""
+A check to run relevant diagnostics via `oc adm diagnostics`.
+"""
+
+import os
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+
+
+DIAGNOSTIC_LIST = (  # default for "openshift_check_diagnostics"; split() turns the text into a list of names
+    "AggregatedLogging ClusterRegistry ClusterRoleBindings ClusterRoles "
+    "ClusterRouter DiagnosticPod NetworkCheck"
+).split()
+
+
+class DiagnosticCheck(OpenShiftCheck):
+    """A check to run relevant diagnostics via `oc adm diagnostics`."""
+
+    name = "diagnostics"
+    tags = ["health"]
+
+    def is_active(self):
+        return super(DiagnosticCheck, self).is_active() and self.is_first_master()  # first master only
+
+    def run(self):
+        if self.exec_diagnostic("ConfigContexts"):
+            # only run the other diagnostics if that one succeeds (otherwise, all will fail)
+            diagnostics = self.get_var("openshift_check_diagnostics", default=DIAGNOSTIC_LIST)
+            for diagnostic in self.normalize(diagnostics):  # accepts a comma-separated string or a sequence
+                self.exec_diagnostic(diagnostic)
+        return {}  # failures are recorded via register_failure in exec_diagnostic, not returned here
+
+    def exec_diagnostic(self, diagnostic):
+        """
+        Execute an 'oc adm diagnostics' command on the remote host.
+        Raises OcNotFound or registers OcDiagFailed.
+        Returns True on success or False on failure (non-zero rc).
+        """
+        config_base = self.get_var("openshift.common.config_base")
+        args = {  # no "namespace" key: the ocutil module treats it as optional
+            "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
+            "cmd": "adm diagnostics",
+            "extra_args": [diagnostic],  # ocutil appends these after the split "cmd" words
+        }
+
+        result = self.execute_module("ocutil", args, save_as_name=diagnostic + ".failure.json")
+        self.register_file(diagnostic + ".txt", result['result'])  # NOTE(review): KeyError if 'result' is absent
+        if result.get("failed"):
+            if result['result'] == '[Errno 2] No such file or directory':  # presumably the text when `oc` is missing
+                raise OpenShiftCheckException(
+                    "OcNotFound",
+                    "This host is supposed to be a master but does not have the `oc` command where expected.\n"
+                    "Has an installation been run on this host yet?"
+                )
+
+            self.register_failure(OpenShiftCheckException(  # registered, not raised, so the caller can continue
+                'OcDiagFailed',
+                'The {diag} diagnostic reported an error:\n'
+                '{error}'.format(diag=diagnostic, error=result['result'])
+            ))
+            return False
+        return True

+ 1 - 1
roles/openshift_health_checker/openshift_checks/etcd_volume.py

@@ -16,7 +16,7 @@ class EtcdVolume(OpenShiftCheck):
 
     def is_active(self):
         etcd_hosts = self.get_var("groups", "etcd", default=[]) or self.get_var("groups", "masters", default=[]) or []
-        is_etcd_host = self.get_var("ansible_ssh_host") in etcd_hosts
+        is_etcd_host = self.get_var("ansible_host") in etcd_hosts  # membership in etcd (or fallback masters) group
         return super(EtcdVolume, self).is_active() and is_etcd_host
 
     def run(self):

+ 0 - 8
roles/openshift_health_checker/openshift_checks/logging/logging.py

@@ -30,14 +30,6 @@ class LoggingCheck(OpenShiftCheck):
         logging_deployed = self.get_var("openshift_hosted_logging_deploy", convert=bool, default=False)
         return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master()
 
-    def is_first_master(self):
-        """Determine if running on first master. Returns: bool"""
-        # Note: It would be nice to use membership in oo_first_master group, however for now it
-        # seems best to avoid requiring that setup and just check this is the first master.
-        hostname = self.get_var("ansible_ssh_host") or [None]
-        masters = self.get_var("groups", "masters", default=None) or [None]
-        return masters[0] == hostname
-
     def run(self):
         return {}
 

+ 50 - 0
roles/openshift_health_checker/test/diagnostics_test.py

@@ -0,0 +1,50 @@
+import pytest
+
+from openshift_checks.diagnostics import DiagnosticCheck, OpenShiftCheckException
+
+
+@pytest.fixture()
+def task_vars():
+    return dict(
+        openshift=dict(
+            common=dict(config_base="/etc/origin/")  # exec_diagnostic reads "openshift.common.config_base"
+        )
+    )
+
+
+def test_module_succeeds(task_vars):
+    check = DiagnosticCheck(lambda *_: {"result": "success"}, task_vars)  # presumably the module-executor stand-in
+    check.is_first_master = lambda: True  # stub so is_active() needs no "groups"/"ansible_host" vars
+    assert check.is_active()
+    check.exec_diagnostic("spam")
+    assert not check.failures  # no "failed" key in the result, so nothing is registered
+
+
+def test_oc_not_there(task_vars):
+    def exec_module(*_):
+        return {"failed": True, "result": "[Errno 2] No such file or directory"}  # exact text exec_diagnostic matches
+
+    check = DiagnosticCheck(exec_module, task_vars)
+    with pytest.raises(OpenShiftCheckException) as excinfo:
+        check.exec_diagnostic("spam")
+    assert excinfo.value.name == "OcNotFound"
+
+
+def test_module_fails(task_vars):
+    def exec_module(*_):
+        return {"failed": True, "result": "something broke"}  # any other failure text is registered, not raised
+
+    check = DiagnosticCheck(exec_module, task_vars)
+    check.exec_diagnostic("spam")
+    assert check.failures and check.failures[0].name == "OcDiagFailed"
+
+
+def test_names_executed(task_vars):
+    task_vars["openshift_check_diagnostics"] = diagnostics = "ConfigContexts,spam,,eggs"  # ",," gives an empty item
+
+    def exec_module(module, args, *_):
+        assert "extra_args" in args
+        assert args["extra_args"][0] in diagnostics  # NOTE(review): substring test on a str; "" would also pass
+        return {"result": "success"}
+
+    DiagnosticCheck(exec_module, task_vars).run()

+ 3 - 5
roles/openshift_health_checker/test/logging_check_test.py

@@ -98,21 +98,19 @@ def test_oc_failure(problem, expect):
     assert expect in str(excinfo)
 
 
-groups_with_first_master = dict(masters=['this-host', 'other-host'])
-groups_with_second_master = dict(masters=['other-host', 'this-host'])
-groups_not_a_master = dict(masters=['other-host'])
+groups_with_first_master = dict(oo_first_master=['this-host'])  # same name as ansible_host in test_is_active
+groups_not_a_master = dict(oo_first_master=['other-host'], oo_masters=['other-host'])
 
 
 @pytest.mark.parametrize('groups, logging_deployed, is_active', [
     (groups_with_first_master, True, True),
     (groups_with_first_master, False, False),
     (groups_not_a_master, True, False),
-    (groups_with_second_master, True, False),
     (groups_not_a_master, True, False),
 ])
 def test_is_active(groups, logging_deployed, is_active):
     task_vars = dict(
-        ansible_ssh_host='this-host',
+        ansible_host='this-host',
         groups=groups,
         openshift_hosted_logging_deploy=logging_deployed,
     )