Browse Source

Merge pull request #5365 from sosiouxme/20170908-disconnected-image-check

Merged by openshift-bot
OpenShift Bot 7 years ago
parent
commit
da1b27cbc9

+ 1 - 1
roles/openshift_health_checker/action_plugins/openshift_health_check.py

@@ -187,7 +187,7 @@ def normalize(checks):
 
 def run_check(name, check, user_disabled_checks):
     """Run a single check if enabled and return a result dict."""
-    if name in user_disabled_checks:
+    if name in user_disabled_checks or '*' in user_disabled_checks:
         return dict(skipped=True, skipped_reason="Disabled by user request")
 
     # pylint: disable=broad-except; capturing exceptions broadly is intentional,

+ 17 - 0
roles/openshift_health_checker/openshift_checks/__init__.py

@@ -4,6 +4,7 @@ Health checks for OpenShift clusters.
 
 import operator
 import os
+import time
 
 from abc import ABCMeta, abstractmethod, abstractproperty
 from importlib import import_module
@@ -57,6 +58,9 @@ class OpenShiftCheck(object):
         self._execute_module = execute_module
         self.task_vars = task_vars or {}
         self.tmp = tmp
+        # mainly for testing purposes; see execute_module_with_retries
+        self._module_retries = 3
+        self._module_retry_interval = 5  # seconds
 
         # set to True when the check changes the host, for accurate total "changed" count
         self.changed = False
@@ -115,6 +119,19 @@ class OpenShiftCheck(object):
             )
         return self._execute_module(module_name, module_args, self.tmp, self.task_vars)
 
+    def execute_module_with_retries(self, module_name, module_args):
+        """Run execute_module and retry on failure."""
+        result = {}
+        tries = 0
+        while True:
+            res = self.execute_module(module_name, module_args)
+            if tries > self._module_retries or not res.get("failed"):
+                result.update(res)
+                return result
+            result["last_failed"] = res
+            tries += 1
+            time.sleep(self._module_retry_interval)
+
     def get_var(self, *keys, **kwargs):
         """Get deeply nested values from task_vars.
 

+ 59 - 30
roles/openshift_health_checker/openshift_checks/docker_image_availability.py

@@ -32,7 +32,12 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
     # we use python-docker-py to check local docker for images, and skopeo
     # to look for images available remotely without waiting to pull them.
     dependencies = ["python-docker-py", "skopeo"]
-    skopeo_img_check_command = "timeout 10 skopeo inspect --tls-verify=false"
+    skopeo_img_check_command = "timeout 10 skopeo inspect --tls-verify=false docker://{registry}/{image}"
+
+    def __init__(self, *args, **kwargs):
+        super(DockerImageAvailability, self).__init__(*args, **kwargs)
+        # record whether we could reach a registry or not (and remember results)
+        self.reachable_registries = {}
 
     def is_active(self):
         """Skip hosts with unsupported deployment types."""
@@ -64,15 +69,21 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
         unavailable_images = set(missing_images) - set(available_images)
 
         if unavailable_images:
-            return {
-                "failed": True,
-                "msg": (
-                    "One or more required Docker images are not available:\n    {}\n"
-                    "Configured registries: {}\n"
-                    "Checked by: {}"
-                ).format(",\n    ".join(sorted(unavailable_images)), ", ".join(registries),
-                         self.skopeo_img_check_command),
-            }
+            registries = [
+                reg if self.reachable_registries.get(reg, True) else reg + " (unreachable)"
+                for reg in registries
+            ]
+            msg = (
+                "One or more required Docker images are not available:\n    {}\n"
+                "Configured registries: {}\n"
+                "Checked by: {}"
+            ).format(
+                ",\n    ".join(sorted(unavailable_images)),
+                ", ".join(registries),
+                self.skopeo_img_check_command
+            )
+
+            return dict(failed=True, msg=msg)
 
         return {}
 
@@ -128,31 +139,31 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
 
     def local_images(self, images):
         """Filter a list of images and return those available locally."""
-        return [
-            image for image in images
-            if self.is_image_local(image)
-        ]
+        registries = self.known_docker_registries()
+        found_images = []
+        for image in images:
+            # docker could have the image name as-is or prefixed with any registry
+            imglist = [image] + [reg + "/" + image for reg in registries]
+            if self.is_image_local(imglist):
+                found_images.append(image)
+        return found_images
 
     def is_image_local(self, image):
         """Check if image is already in local docker index."""
         result = self.execute_module("docker_image_facts", {"name": image})
-        if result.get("failed", False):
-            return False
-
-        return bool(result.get("images", []))
+        return bool(result.get("images")) and not result.get("failed")
 
     def known_docker_registries(self):
         """Build a list of docker registries available according to inventory vars."""
-        docker_facts = self.get_var("openshift", "docker")
-        regs = set(docker_facts["additional_registries"])
+        regs = list(self.get_var("openshift.docker.additional_registries", default=[]))
 
         deployment_type = self.get_var("openshift_deployment_type")
-        if deployment_type == "origin":
-            regs.update(["docker.io"])
-        elif "enterprise" in deployment_type:
-            regs.update(["registry.access.redhat.com"])
+        if deployment_type == "origin" and "docker.io" not in regs:
+            regs.append("docker.io")
+        elif "enterprise" in deployment_type and "registry.access.redhat.com" not in regs:
+            regs.append("registry.access.redhat.com")
 
-        return list(regs)
+        return regs
 
     def available_images(self, images, default_registries):
         """Search remotely for images. Returns: list of images found."""
@@ -165,17 +176,35 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
         """Use Skopeo to determine if required image exists in known registry(s)."""
         registries = default_registries
 
-        # if image already includes a registry, only use that
+        # If image already includes a registry, only use that.
+        # NOTE: This logic would incorrectly identify images that do not use a namespace, e.g.
+        # registry.access.redhat.com/rhel7 as if the registry were a namespace.
+        # It's not clear that there's any way to distinguish them, but fortunately
+        # the current set of images all look like [registry/]namespace/name[:version].
         if image.count("/") > 1:
             registry, image = image.split("/", 1)
             registries = [registry]
 
         for registry in registries:
-            args = {
-                "_raw_params": self.skopeo_img_check_command + " docker://{}/{}".format(registry, image)
-            }
-            result = self.execute_module("command", args)
+            if registry not in self.reachable_registries:
+                self.reachable_registries[registry] = self.connect_to_registry(registry)
+            if not self.reachable_registries[registry]:
+                continue
+
+            args = {"_raw_params": self.skopeo_img_check_command.format(registry=registry, image=image)}
+            result = self.execute_module_with_retries("command", args)
             if result.get("rc", 0) == 0 and not result.get("failed"):
                 return True
+            if result.get("rc") == 124:  # RC 124 == timed out; mark unreachable
+                self.reachable_registries[registry] = False
 
         return False
+
+    def connect_to_registry(self, registry):
+        """Use ansible wait_for module to test connectivity from host to registry. Returns bool."""
+        # test a simple TCP connection
+        host, _, port = registry.partition(":")
+        port = port or 443
+        args = dict(host=host, port=port, state="started", timeout=30)
+        result = self.execute_module("wait_for", args)
+        return result.get("rc", 0) == 0 and not result.get("failed")

+ 1 - 1
roles/openshift_health_checker/openshift_checks/mixins.py

@@ -36,7 +36,7 @@ class DockerHostMixin(object):
 
         # NOTE: we would use the "package" module but it's actually an action plugin
         # and it's not clear how to invoke one of those. This is about the same anyway:
-        result = self.execute_module(
+        result = self.execute_module_with_retries(
             self.get_var("ansible_pkg_mgr", default="yum"),
             {"name": self.dependencies, "state": "present"},
         )

+ 1 - 1
roles/openshift_health_checker/openshift_checks/package_availability.py

@@ -26,7 +26,7 @@ class PackageAvailability(NotContainerizedMixin, OpenShiftCheck):
             packages.update(self.node_packages(rpm_prefix))
 
         args = {"packages": sorted(set(packages))}
-        return self.execute_module("check_yum_update", args)
+        return self.execute_module_with_retries("check_yum_update", args)
 
     @staticmethod
     def master_packages(rpm_prefix):

+ 1 - 1
roles/openshift_health_checker/openshift_checks/package_update.py

@@ -11,4 +11,4 @@ class PackageUpdate(NotContainerizedMixin, OpenShiftCheck):
 
     def run(self):
         args = {"packages": []}
-        return self.execute_module("check_yum_update", args)
+        return self.execute_module_with_retries("check_yum_update", args)

+ 1 - 1
roles/openshift_health_checker/openshift_checks/package_version.py

@@ -76,7 +76,7 @@ class PackageVersion(NotContainerizedMixin, OpenShiftCheck):
             ],
         }
 
-        return self.execute_module("aos_version", args)
+        return self.execute_module_with_retries("aos_version", args)
 
     def get_required_ovs_version(self):
         """Return the correct Open vSwitch version(s) for the current OpenShift version."""

+ 7 - 2
roles/openshift_health_checker/test/action_plugin_test.py

@@ -110,11 +110,16 @@ def test_action_plugin_skip_non_active_checks(plugin, task_vars, monkeypatch):
     assert not skipped(result)
 
 
-def test_action_plugin_skip_disabled_checks(plugin, task_vars, monkeypatch):
+@pytest.mark.parametrize('to_disable', [
+    'fake_check',
+    ['fake_check', 'spam'],
+    '*,spam,eggs',
+])
+def test_action_plugin_skip_disabled_checks(to_disable, plugin, task_vars, monkeypatch):
     checks = [fake_check('fake_check', is_active=True)]
     monkeypatch.setattr('openshift_checks.OpenShiftCheck.subclasses', classmethod(lambda cls: checks))
 
-    task_vars['openshift_disable_check'] = 'fake_check'
+    task_vars['openshift_disable_check'] = to_disable
     result = plugin.run(tmp=None, task_vars=task_vars)
 
     assert result['checks']['fake_check'] == dict(skipped=True, skipped_reason="Disabled by user request")

+ 88 - 99
roles/openshift_health_checker/test/docker_image_availability_test.py

@@ -3,6 +3,23 @@ import pytest
 from openshift_checks.docker_image_availability import DockerImageAvailability
 
 
+@pytest.fixture()
+def task_vars():
+    return dict(
+        openshift=dict(
+            common=dict(
+                service_type='origin',
+                is_containerized=False,
+                is_atomic=False,
+            ),
+            docker=dict(),
+        ),
+        openshift_deployment_type='origin',
+        openshift_image_tag='',
+        group_names=['nodes', 'masters'],
+    )
+
+
 @pytest.mark.parametrize('deployment_type, is_containerized, group_names, expect_active', [
     ("origin", True, [], True),
     ("openshift-enterprise", True, [], True),
@@ -15,12 +32,10 @@ from openshift_checks.docker_image_availability import DockerImageAvailability
     ("origin", False, ["nodes", "masters"], True),
     ("openshift-enterprise", False, ["etcd"], False),
 ])
-def test_is_active(deployment_type, is_containerized, group_names, expect_active):
-    task_vars = dict(
-        openshift=dict(common=dict(is_containerized=is_containerized)),
-        openshift_deployment_type=deployment_type,
-        group_names=group_names,
-    )
+def test_is_active(task_vars, deployment_type, is_containerized, group_names, expect_active):
+    task_vars['openshift_deployment_type'] = deployment_type
+    task_vars['openshift']['common']['is_containerized'] = is_containerized
+    task_vars['group_names'] = group_names
     assert DockerImageAvailability(None, task_vars).is_active() == expect_active
 
 
@@ -30,10 +45,10 @@ def test_is_active(deployment_type, is_containerized, group_names, expect_active
     (True, False),
     (False, True),
 ])
-def test_all_images_available_locally(is_containerized, is_atomic):
+def test_all_images_available_locally(task_vars, is_containerized, is_atomic):
     def execute_module(module_name, module_args, *_):
         if module_name == "yum":
-            return {"changed": True}
+            return {}
 
         assert module_name == "docker_image_facts"
         assert 'name' in module_args
@@ -42,19 +57,9 @@ def test_all_images_available_locally(is_containerized, is_atomic):
             'images': [module_args['name']],
         }
 
-    result = DockerImageAvailability(execute_module, task_vars=dict(
-        openshift=dict(
-            common=dict(
-                service_type='origin',
-                is_containerized=is_containerized,
-                is_atomic=is_atomic,
-            ),
-            docker=dict(additional_registries=["docker.io"]),
-        ),
-        openshift_deployment_type='origin',
-        openshift_image_tag='3.4',
-        group_names=['nodes', 'masters'],
-    )).run()
+    task_vars['openshift']['common']['is_containerized'] = is_containerized
+    task_vars['openshift']['common']['is_atomic'] = is_atomic
+    result = DockerImageAvailability(execute_module, task_vars).run()
 
     assert not result.get('failed', False)
 
@@ -63,53 +68,36 @@ def test_all_images_available_locally(is_containerized, is_atomic):
     False,
     True,
 ])
-def test_all_images_available_remotely(available_locally):
+def test_all_images_available_remotely(task_vars, available_locally):
     def execute_module(module_name, *_):
         if module_name == 'docker_image_facts':
             return {'images': [], 'failed': available_locally}
-        return {'changed': False}
+        return {}
 
-    result = DockerImageAvailability(execute_module, task_vars=dict(
-        openshift=dict(
-            common=dict(
-                service_type='origin',
-                is_containerized=False,
-                is_atomic=False,
-            ),
-            docker=dict(additional_registries=["docker.io", "registry.access.redhat.com"]),
-        ),
-        openshift_deployment_type='origin',
-        openshift_image_tag='v3.4',
-        group_names=['nodes', 'masters'],
-    )).run()
+    task_vars['openshift']['docker']['additional_registries'] = ["docker.io", "registry.access.redhat.com"]
+    task_vars['openshift_image_tag'] = 'v3.4'
+    check = DockerImageAvailability(execute_module, task_vars)
+    check._module_retry_interval = 0
+    result = check.run()
 
     assert not result.get('failed', False)
 
 
-def test_all_images_unavailable():
-    def execute_module(module_name=None, *_):
-        if module_name == "command":
-            return {
-                'failed': True,
-            }
+def test_all_images_unavailable(task_vars):
+    def execute_module(module_name=None, *args):
+        if module_name == "wait_for":
+            return {}
+        elif module_name == "command":
+            return {'failed': True}
 
-        return {
-            'changed': False,
-        }
+        return {}  # docker_image_facts failure
 
-    actual = DockerImageAvailability(execute_module, task_vars=dict(
-        openshift=dict(
-            common=dict(
-                service_type='origin',
-                is_containerized=False,
-                is_atomic=False,
-            ),
-            docker=dict(additional_registries=["docker.io"]),
-        ),
-        openshift_deployment_type="openshift-enterprise",
-        openshift_image_tag='latest',
-        group_names=['nodes', 'masters'],
-    )).run()
+    task_vars['openshift']['docker']['additional_registries'] = ["docker.io"]
+    task_vars['openshift_deployment_type'] = "openshift-enterprise"
+    task_vars['openshift_image_tag'] = 'latest'
+    check = DockerImageAvailability(execute_module, task_vars)
+    check._module_retry_interval = 0
+    actual = check.run()
 
     assert actual['failed']
     assert "required Docker images are not available" in actual['msg']
@@ -125,62 +113,63 @@ def test_all_images_unavailable():
         ["dependencies can be installed via `yum`"]
     ),
 ])
-def test_skopeo_update_failure(message, extra_words):
+def test_skopeo_update_failure(task_vars, message, extra_words):
     def execute_module(module_name=None, *_):
         if module_name == "yum":
             return {
                 "failed": True,
                 "msg": message,
-                "changed": False,
             }
 
-        return {'changed': False}
+        return {}
 
-    actual = DockerImageAvailability(execute_module, task_vars=dict(
-        openshift=dict(
-            common=dict(
-                service_type='origin',
-                is_containerized=False,
-                is_atomic=False,
-            ),
-            docker=dict(additional_registries=["unknown.io"]),
-        ),
-        openshift_deployment_type="openshift-enterprise",
-        openshift_image_tag='',
-        group_names=['nodes', 'masters'],
-    )).run()
+    task_vars['openshift']['docker']['additional_registries'] = ["unknown.io"]
+    task_vars['openshift_deployment_type'] = "openshift-enterprise"
+    check = DockerImageAvailability(execute_module, task_vars)
+    check._module_retry_interval = 0
+    actual = check.run()
 
     assert actual["failed"]
     for word in extra_words:
         assert word in actual["msg"]
 
 
-@pytest.mark.parametrize("deployment_type,registries", [
-    ("origin", ["unknown.io"]),
-    ("openshift-enterprise", ["registry.access.redhat.com"]),
-    ("openshift-enterprise", []),
-])
-def test_registry_availability(deployment_type, registries):
+@pytest.mark.parametrize(
+    "image, registries, connection_test_failed, skopeo_failed, "
+    "expect_success, expect_registries_reached", [
+        (
+            "spam/eggs:v1", ["test.reg"],
+            True, True,
+            False,
+            {"test.reg": False},
+        ),
+        (
+            "spam/eggs:v1", ["test.reg"],
+            False, True,
+            False,
+            {"test.reg": True},
+        ),
+        (
+            "eggs.reg/spam/eggs:v1", ["test.reg"],
+            False, False,
+            True,
+            {"eggs.reg": True},
+        ),
+    ])
+def test_registry_availability(image, registries, connection_test_failed, skopeo_failed,
+                               expect_success, expect_registries_reached):
     def execute_module(module_name=None, *_):
-        return {
-            'changed': False,
-        }
+        if module_name == "wait_for":
+            return dict(msg="msg", failed=connection_test_failed)
+        elif module_name == "command":
+            return dict(msg="msg", failed=skopeo_failed)
 
-    actual = DockerImageAvailability(execute_module, task_vars=dict(
-        openshift=dict(
-            common=dict(
-                service_type='origin',
-                is_containerized=False,
-                is_atomic=False,
-            ),
-            docker=dict(additional_registries=registries),
-        ),
-        openshift_deployment_type=deployment_type,
-        openshift_image_tag='',
-        group_names=['nodes', 'masters'],
-    )).run()
+    check = DockerImageAvailability(execute_module, task_vars())
+    check._module_retry_interval = 0
 
-    assert not actual.get("failed", False)
+    available = check.is_available_skopeo_image(image, registries)
+    assert available == expect_success
+    assert expect_registries_reached == check.reachable_registries
 
 
 @pytest.mark.parametrize("deployment_type, is_containerized, groups, oreg_url, expected", [
@@ -257,7 +246,7 @@ def test_required_images(deployment_type, is_containerized, groups, oreg_url, ex
         openshift_image_tag='vtest',
     )
 
-    assert expected == DockerImageAvailability("DUMMY", task_vars).required_images()
+    assert expected == DockerImageAvailability(task_vars=task_vars).required_images()
 
 
 def test_containerized_etcd():
@@ -271,4 +260,4 @@ def test_containerized_etcd():
         group_names=['etcd'],
     )
     expected = set(['registry.access.redhat.com/rhel7/etcd'])
-    assert expected == DockerImageAvailability("DUMMY", task_vars).required_images()
+    assert expected == DockerImageAvailability(task_vars=task_vars).required_images()

+ 2 - 2
roles/openshift_health_checker/test/package_availability_test.py

@@ -56,7 +56,7 @@ def test_package_availability(task_vars, must_have_packages, must_not_have_packa
         assert 'packages' in module_args
         assert set(module_args['packages']).issuperset(must_have_packages)
         assert not set(module_args['packages']).intersection(must_not_have_packages)
-        return return_value
+        return {'foo': return_value}
 
     result = PackageAvailability(execute_module, task_vars).run()
-    assert result is return_value
+    assert result['foo'] is return_value

+ 2 - 2
roles/openshift_health_checker/test/package_update_test.py

@@ -9,7 +9,7 @@ def test_package_update():
         assert 'packages' in module_args
         # empty list of packages means "generic check if 'yum update' will work"
         assert module_args['packages'] == []
-        return return_value
+        return {'foo': return_value}
 
     result = PackageUpdate(execute_module).run()
-    assert result is return_value
+    assert result['foo'] is return_value

+ 4 - 4
roles/openshift_health_checker/test/package_version_test.py

@@ -52,7 +52,7 @@ def test_invalid_openshift_release_format():
 ])
 def test_package_version(openshift_release):
 
-    return_value = object()
+    return_value = {"foo": object()}
 
     def execute_module(module_name=None, module_args=None, tmp=None, task_vars=None, *_):
         assert module_name == 'aos_version'
@@ -66,7 +66,7 @@ def test_package_version(openshift_release):
 
     check = PackageVersion(execute_module, task_vars_for(openshift_release, 'origin'))
     result = check.run()
-    assert result is return_value
+    assert result == return_value
 
 
 @pytest.mark.parametrize('deployment_type,openshift_release,expected_docker_version', [
@@ -79,7 +79,7 @@ def test_package_version(openshift_release):
 ])
 def test_docker_package_version(deployment_type, openshift_release, expected_docker_version):
 
-    return_value = object()
+    return_value = {"foo": object()}
 
     def execute_module(module_name=None, module_args=None, *_):
         assert module_name == 'aos_version'
@@ -93,7 +93,7 @@ def test_docker_package_version(deployment_type, openshift_release, expected_doc
 
     check = PackageVersion(execute_module, task_vars_for(openshift_release, deployment_type))
     result = check.run()
-    assert result is return_value
+    assert result == return_value
 
 
 @pytest.mark.parametrize('group_names,is_containerized,is_active', [