
Merge pull request #3787 from juanvallejo/jvallejo/docker-storage-check

Merged by openshift-bot (OpenShift Bot)
parent commit 2f0b5b9b6b

+ 3 - 19
roles/openshift_health_checker/openshift_checks/docker_image_availability.py

@@ -1,8 +1,9 @@
 # pylint: disable=missing-docstring
 from openshift_checks import OpenShiftCheck, get_var
+from openshift_checks.mixins import DockerHostMixin
 
 
-class DockerImageAvailability(OpenShiftCheck):
+class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):
     """Check that required Docker images are available.
 
     This check attempts to ensure that required docker images are
@@ -36,19 +37,11 @@ class DockerImageAvailability(OpenShiftCheck):
 
     def run(self, tmp, task_vars):
         msg, failed, changed = self.ensure_dependencies(task_vars)
-
-        # exit early if Skopeo update fails
         if failed:
-            if "No package matching" in msg:
-                msg = "Ensure that all required dependencies can be installed via `yum`.\n"
             return {
                 "failed": True,
                 "changed": changed,
-                "msg": (
-                    "Unable to update or install required dependency packages on this host;\n"
-                    "These are required in order to check Docker image availability:"
-                    "\n    {deps}\n{msg}"
-                ).format(deps=',\n    '.join(self.dependencies), msg=msg),
+                "msg": "Some dependencies are required in order to check Docker image availability.\n" + msg
             }
 
         required_images = self.required_images(task_vars)
@@ -168,12 +161,3 @@ class DockerImageAvailability(OpenShiftCheck):
         args = {"_raw_params": cmd_str}
         result = self.module_executor("command", args, task_vars)
         return not result.get("failed", False) and result.get("rc", 0) == 0
-
-    # ensures that the skopeo and python-docker-py packages exist
-    # check is skipped on atomic installations
-    def ensure_dependencies(self, task_vars):
-        if get_var(task_vars, "openshift", "common", "is_atomic"):
-            return "", False, False
-
-        result = self.module_executor("yum", {"name": self.dependencies, "state": "latest"}, task_vars)
-        return result.get("msg", ""), result.get("failed", False) or result.get("rc", 0) != 0, result.get("changed")

+ 185 - 0
roles/openshift_health_checker/openshift_checks/docker_storage.py

@@ -0,0 +1,185 @@
+"""Check Docker storage driver and usage."""
+import json
+import re
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+from openshift_checks.mixins import DockerHostMixin
+
+
+class DockerStorage(DockerHostMixin, OpenShiftCheck):
+    """Check Docker storage driver compatibility.
+
+    This check ensures that Docker is using a supported storage driver,
+    and that loopback is not being used (if using devicemapper).
+    It also checks that storage usage is not above a configurable threshold.
+    """
+
+    name = "docker_storage"
+    tags = ["pre-install", "health", "preflight"]
+
+    dependencies = ["python-docker-py"]
+    storage_drivers = ["devicemapper", "overlay2"]
+    max_thinpool_data_usage_percent = 90.0
+    max_thinpool_meta_usage_percent = 90.0
+
+    # pylint: disable=too-many-return-statements
+    # Reason: permanent stylistic exception;
+    #         it is clearer to return early on failure, and this check simply has many failure modes.
+    def run(self, tmp, task_vars):
+        msg, failed, changed = self.ensure_dependencies(task_vars)
+        if failed:
+            return {
+                "failed": True,
+                "changed": changed,
+                "msg": "Some dependencies are required in order to query Docker storage on this host:\n" + msg
+            }
+
+        # attempt to get the docker info hash from the API
+        info = self.execute_module("docker_info", {}, task_vars)
+        if info.get("failed"):
+            return {"failed": True, "changed": changed,
+                    "msg": "Failed to query Docker API. Is docker running on this host?"}
+        if not info.get("info"):  # this would be very strange
+            return {"failed": True, "changed": changed,
+                    "msg": "Docker API query missing info:\n{}".format(json.dumps(info))}
+        info = info["info"]
+
+        # check if the storage driver we saw is valid
+        driver = info.get("Driver", "[NONE]")
+        if driver not in self.storage_drivers:
+            msg = (
+                "Detected unsupported Docker storage driver '{driver}'.\n"
+                "Supported storage drivers are: {drivers}"
+            ).format(driver=driver, drivers=', '.join(self.storage_drivers))
+            return {"failed": True, "changed": changed, "msg": msg}
+
+        # driver status info is a list of tuples; convert to dict and validate based on driver
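+        # e.g. [("Pool Name", "docker--vg-docker--pool")] becomes {"Pool Name": "docker--vg-docker--pool"}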
+        driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])}
+        if driver == "devicemapper":
+            if driver_status.get("Data loop file"):
+                msg = (
+                    "Use of loopback devices with the Docker devicemapper storage driver\n"
+                    "(the default storage configuration) is unsupported in production.\n"
+                    "Please use docker-storage-setup to configure a backing storage volume.\n"
+                    "See http://red.ht/2rNperO for further information."
+                )
+                return {"failed": True, "changed": changed, "msg": msg}
+            result = self._check_dm_usage(driver_status, task_vars)
+            result['changed'] = result.get('changed', False) or changed
+            return result
+
+        # TODO(lmeyer): determine how to check usage for overlay2
+
+        return {"changed": changed}
+
+    def _check_dm_usage(self, driver_status, task_vars):
+        """
+        Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
+        implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
+        devicemapper storage. The LV is "thin" because it does not use all available storage
+        from its VG, instead expanding as needed; so to determine available space, we gather
+        current usage as the Docker API reports for the driver as well as space available for
+        expansion in the pool's VG.
+        Usage within the LV is divided into pools allocated to data and metadata, either of which
+        could run out of space first; so we check both.
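+        For example, with 19.92 MB of data used, an 8.535 GB data pool, and 12 GB free
+        in the VG, data usage comes to 100 * 19.92MB / (8.535GB + 12GB), about 0.1%.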
+        """
+        vals = dict(
+            vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars),
+            data_used=driver_status.get("Data Space Used"),
+            data_total=driver_status.get("Data Space Total"),
+            metadata_used=driver_status.get("Metadata Space Used"),
+            metadata_total=driver_status.get("Metadata Space Total"),
+        )
+
+        # convert all human-readable strings to bytes
+        for key, value in vals.copy().items():
+            try:
+                vals[key + "_bytes"] = self._convert_to_bytes(value)
+            except ValueError as err:  # unlikely to hit this from API info, but just to be safe
+                return {
+                    "failed": True,
+                    "values": vals,
+                    "msg": "Could not interpret {} value '{}' as bytes: {}".format(key, value, str(err))
+                }
+
+        # determine the threshold percentages which usage should not exceed
+        for name, default in [("data", self.max_thinpool_data_usage_percent),
+                              ("metadata", self.max_thinpool_meta_usage_percent)]:
+            percent = get_var(task_vars, "max_thinpool_" + name + "_usage_percent", default=default)
+            try:
+                vals[name + "_threshold"] = float(percent)
+            except ValueError:
+                return {
+                    "failed": True,
+                    "msg": "Specified thinpool {} usage limit '{}' is not a percentage".format(name, percent)
+                }
+
+        # test whether the thresholds are exceeded
+        messages = []
+        for name in ["data", "metadata"]:
+            vals[name + "_pct_used"] = 100 * vals[name + "_used_bytes"] / (
+                vals[name + "_total_bytes"] + vals["vg_free_bytes"])
+            if vals[name + "_pct_used"] > vals[name + "_threshold"]:
+                messages.append(
+                    "Docker thinpool {name} usage percentage {pct:.1f} "
+                    "is higher than threshold {thresh:.1f}.".format(
+                        name=name,
+                        pct=vals[name + "_pct_used"],
+                        thresh=vals[name + "_threshold"],
+                    ))
+                vals["failed"] = True
+
+        vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
+        return vals
+
+    def _get_vg_free(self, pool, task_vars):
+        # Determine which VG to examine according to the pool name, the only indicator currently
+        # available from the Docker API driver info. We assume a name that looks like
+        # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen.
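+        # For example, the pool name "docker--vg-docker--pool" splits at its first
+        # single hyphen into "docker--vg" and "docker--pool", which un-double to
+        # VG "docker-vg" and LV "docker-pool".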
+        match = re.match(r'((?:[^-]|--)+)-(?!-)', pool)  # matches up to the first single hyphen
+        if not match:  # unlikely, but... be clear if we assumed wrong
+            raise OpenShiftCheckException(
+                "This host's Docker reports it is using a storage pool named '{}'.\n"
+                "However this name does not have the expected format of 'vgname-lvname'\n"
+                "so the available storage in the VG cannot be determined.".format(pool)
+            )
+        vg_name = match.groups()[0].replace("--", "-")
+        vgs_cmd = "/sbin/vgs --noheadings -o vg_free --select vg_name=" + vg_name
+        # should return free space like "  12.00g" if the VG exists; empty if it does not
+
+        ret = self.execute_module("command", {"_raw_params": vgs_cmd}, task_vars)
+        if ret.get("failed") or ret.get("rc", 0) != 0:
+            raise OpenShiftCheckException(
+                "Is LVM installed? Failed to run /sbin/vgs "
+                "to determine docker storage usage:\n" + ret.get("msg", "")
+            )
+        size = ret.get("stdout", "").strip()
+        if not size:
+            raise OpenShiftCheckException(
+                "This host's Docker reports it is using a storage pool named '{pool}',\n"
+                "which we expect to come from local VG '{vg}'.\n"
+                "However, /sbin/vgs did not find this VG. Is Docker for this host\n"
+                "running and using the storage on the host?".format(pool=pool, vg=vg_name)
+            )
+        return size
+
+    @staticmethod
+    def _convert_to_bytes(string):
+        units = dict(
+            b=1,
+            k=1024,
+            m=1024**2,
+            g=1024**3,
+            t=1024**4,
+            p=1024**5,
+        )
+        string = string or ""
+        match = re.match(r'(\d+(?:\.\d+)?)\s*(\w)?', string)  # float followed by optional unit
+        if not match:
+            raise ValueError("Cannot convert to a byte size: " + string)
+
+        number, unit = match.groups()
+        multiplier = 1 if not unit else units.get(unit.lower())
+        if not multiplier:
+            raise ValueError("Cannot convert to a byte size: " + string)
+
+        return float(number) * multiplier
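
Reviewer note: a minimal standalone sketch of the threshold math in _check_dm_usage,
using the fixture values from the tests below ("19.92 MB", "8.535 GB", "12.00g").
parse_size here is a hypothetical stand-in for DockerStorage._convert_to_bytes:

    import re

    def parse_size(string):
        """Convert strings like '8.535 GB' or '12.00g' to a float byte count."""
        units = {"b": 1, "k": 1024, "m": 1024**2, "g": 1024**3, "t": 1024**4, "p": 1024**5}
        number, unit = re.match(r'(\d+(?:\.\d+)?)\s*(\w)?', string).groups()
        return float(number) * (1 if not unit else units[unit.lower()])

    data_used = parse_size("19.92 MB")   # from the Docker API driver status
    data_total = parse_size("8.535 GB")
    vg_free = parse_size("12.00g")       # from /sbin/vgs
    pct_used = 100 * data_used / (data_total + vg_free)
    print("%.2f%%" % pct_used)           # 0.09%, well under the 90.0 default threshold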

+ 41 - 1
roles/openshift_health_checker/openshift_checks/mixins.py

@@ -1,4 +1,3 @@
-# pylint: disable=missing-docstring,too-few-public-methods
 """
 Mixin classes meant to be used with subclasses of OpenShiftCheck.
 """
@@ -8,8 +7,49 @@ from openshift_checks import get_var
 
 class NotContainerizedMixin(object):
     """Mixin for checks that are only active when not in containerized mode."""
+    # permanent # pylint: disable=too-few-public-methods
+    # Reason: The mixin is not intended to stand on its own as a class.
 
     @classmethod
     def is_active(cls, task_vars):
+        """Only run on non-containerized hosts."""
         is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
         return super(NotContainerizedMixin, cls).is_active(task_vars) and not is_containerized
+
+
+class DockerHostMixin(object):
+    """Mixin for checks that are only active on hosts that require Docker."""
+
+    dependencies = []
+
+    @classmethod
+    def is_active(cls, task_vars):
+        """Only run on hosts that depend on Docker."""
+        is_containerized = get_var(task_vars, "openshift", "common", "is_containerized")
+        is_node = "nodes" in get_var(task_vars, "group_names", default=[])
+        return super(DockerHostMixin, cls).is_active(task_vars) and (is_containerized or is_node)
+
+    def ensure_dependencies(self, task_vars):
+        """
+        Ensure that docker-related packages exist, but not on atomic hosts
+        (which would not be able to install but should already have them).
+        Returns: msg, failed, changed
+        """
+        if get_var(task_vars, "openshift", "common", "is_atomic"):
+            return "", False, False
+
+        # NOTE: we would use the "package" module but it's actually an action plugin
+        # and it's not clear how to invoke one of those. This is about the same anyway:
+        pkg_manager = get_var(task_vars, "ansible_pkg_mgr", default="yum")
+        result = self.module_executor(pkg_manager, {"name": self.dependencies, "state": "present"}, task_vars)
+        msg = result.get("msg", "")
+        if result.get("failed"):
+            if "No package matching" in msg:
+                msg = "Ensure that all required dependencies can be installed via `{}`.\n".format(pkg_manager)
+            msg = (
+                "Unable to install required packages on this host:\n"
+                "    {deps}\n{msg}"
+            ).format(deps=',\n    '.join(self.dependencies), msg=msg)
+        failed = result.get("failed", False) or result.get("rc", 0) != 0
+        changed = result.get("changed", False)
+        return msg, failed, changed
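
Reviewer note: a minimal sketch of how a future check would adopt DockerHostMixin
(ExampleDockerCheck is hypothetical; DockerStorage above is the real in-tree example).
Because is_active chains through super(), the mixin's Docker-host gate composes with
OpenShiftCheck's own activity logic:

    from openshift_checks import OpenShiftCheck
    from openshift_checks.mixins import DockerHostMixin

    class ExampleDockerCheck(DockerHostMixin, OpenShiftCheck):
        """Hypothetical check that runs only on hosts that require Docker."""
        name = "example_docker_check"
        tags = ["preflight"]
        dependencies = ["python-docker-py"]  # installed by ensure_dependencies

        def run(self, tmp, task_vars):
            msg, failed, changed = self.ensure_dependencies(task_vars)
            if failed:
                return {"failed": True, "changed": changed, "msg": msg}
            # ... query Docker here ...
            return {"changed": changed}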

+ 15 - 9
roles/openshift_health_checker/test/docker_image_availability_test.py

@@ -3,19 +3,25 @@ import pytest
 from openshift_checks.docker_image_availability import DockerImageAvailability
 
 
-@pytest.mark.parametrize('deployment_type,is_active', [
-    ("origin", True),
-    ("openshift-enterprise", True),
-    ("enterprise", False),
-    ("online", False),
-    ("invalid", False),
-    ("", False),
+@pytest.mark.parametrize('deployment_type, is_containerized, group_names, expect_active', [
+    ("origin", True, [], True),
+    ("openshift-enterprise", True, [], True),
+    ("enterprise", True, [], False),
+    ("online", True, [], False),
+    ("invalid", True, [], False),
+    ("", True, [], False),
+    ("origin", False, [], False),
+    ("openshift-enterprise", False, [], False),
+    ("origin", False, ["nodes", "masters"], True),
+    ("openshift-enterprise", False, ["etcd"], False),
 ])
-def test_is_active(deployment_type, is_active):
+def test_is_active(deployment_type, is_containerized, group_names, expect_active):
     task_vars = dict(
+        openshift=dict(common=dict(is_containerized=is_containerized)),
         openshift_deployment_type=deployment_type,
+        group_names=group_names,
     )
-    assert DockerImageAvailability.is_active(task_vars=task_vars) == is_active
+    assert DockerImageAvailability.is_active(task_vars=task_vars) == expect_active
 
 
 @pytest.mark.parametrize("is_containerized,is_atomic", [

+ 224 - 0
roles/openshift_health_checker/test/docker_storage_test.py

@@ -0,0 +1,224 @@
+import pytest
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.docker_storage import DockerStorage
+
+
+def dummy_check(execute_module=None):
+    def dummy_exec(self, status, task_vars):
+        raise Exception("dummy executor called")
+    return DockerStorage(execute_module=execute_module or dummy_exec)
+
+
+@pytest.mark.parametrize('is_containerized, group_names, is_active', [
+    (False, ["masters", "etcd"], False),
+    (False, ["masters", "nodes"], True),
+    (True, ["etcd"], True),
+])
+def test_is_active(is_containerized, group_names, is_active):
+    task_vars = dict(
+        openshift=dict(common=dict(is_containerized=is_containerized)),
+        group_names=group_names,
+    )
+    assert DockerStorage.is_active(task_vars=task_vars) == is_active
+
+
+non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
+
+
+@pytest.mark.parametrize('docker_info, failed, expect_msg', [
+    (
+        dict(failed=True, msg="Error connecting: Error while fetching server API version"),
+        True,
+        ["Is docker running on this host?"],
+    ),
+    (
+        dict(msg="I have no info"),
+        True,
+        ["missing info"],
+    ),
+    (
+        dict(info={
+            "Driver": "devicemapper",
+            "DriverStatus": [("Pool Name", "docker-docker--pool")],
+        }),
+        False,
+        [],
+    ),
+    (
+        dict(info={
+            "Driver": "devicemapper",
+            "DriverStatus": [("Data loop file", "true")],
+        }),
+        True,
+        ["loopback devices with the Docker devicemapper storage driver"],
+    ),
+    (
+        dict(info={
+            "Driver": "overlay2",
+            "DriverStatus": []
+        }),
+        False,
+        [],
+    ),
+    (
+        dict(info={
+            "Driver": "overlay",
+        }),
+        True,
+        ["unsupported Docker storage driver"],
+    ),
+    (
+        dict(info={
+            "Driver": "unsupported",
+        }),
+        True,
+        ["unsupported Docker storage driver"],
+    ),
+])
+def test_check_storage_driver(docker_info, failed, expect_msg):
+    def execute_module(module_name, args, tmp=None, task_vars=None):
+        if module_name == "yum":
+            return {}
+        if module_name != "docker_info":
+            raise ValueError("not expecting module " + module_name)
+        return docker_info
+
+    check = dummy_check(execute_module=execute_module)
+    check._check_dm_usage = lambda status, task_vars: dict()  # stub out for this test
+    result = check.run(tmp=None, task_vars=non_atomic_task_vars)
+
+    if failed:
+        assert result["failed"]
+    else:
+        assert not result.get("failed", False)
+
+    for word in expect_msg:
+        assert word in result["msg"]
+
+
+enough_space = {
+    "Pool Name": "docker--vg-docker--pool",
+    "Data Space Used": "19.92 MB",
+    "Data Space Total": "8.535 GB",
+    "Metadata Space Used": "40.96 kB",
+    "Metadata Space Total": "25.17 MB",
+}
+
+not_enough_space = {
+    "Pool Name": "docker--vg-docker--pool",
+    "Data Space Used": "10 GB",
+    "Data Space Total": "10 GB",
+    "Metadata Space Used": "42 kB",
+    "Metadata Space Total": "43 kB",
+}
+
+
+@pytest.mark.parametrize('task_vars, driver_status, vg_free, success, expect_msg', [
+    (
+        {"max_thinpool_data_usage_percent": "not a float"},
+        enough_space,
+        "12g",
+        False,
+        ["is not a percentage"],
+    ),
+    (
+        {},
+        {},  # empty values from driver status
+        "bogus",  # also does not parse as bytes
+        False,
+        ["Could not interpret", "as bytes"],
+    ),
+    (
+        {},
+        enough_space,
+        "12.00g",
+        True,
+        [],
+    ),
+    (
+        {},
+        not_enough_space,
+        "0.00",
+        False,
+        ["data usage", "metadata usage", "higher than threshold"],
+    ),
+])
+def test_dm_usage(task_vars, driver_status, vg_free, success, expect_msg):
+    check = dummy_check()
+    check._get_vg_free = lambda pool, task_vars: vg_free
+    result = check._check_dm_usage(driver_status, task_vars)
+    result_success = not result.get("failed")
+
+    assert result_success is success
+    for msg in expect_msg:
+        assert msg in result["msg"]
+
+
+@pytest.mark.parametrize('pool, command_returns, raises, returns', [
+    (
+        "foo-bar",
+        {  # vgs missing
+            "msg": "[Errno 2] No such file or directory",
+            "failed": True,
+            "cmd": "/sbin/vgs",
+            "rc": 2,
+        },
+        "Failed to run /sbin/vgs",
+        None,
+    ),
+    (
+        "foo",  # no hyphen in name - should not happen
+        {},
+        "name does not have the expected format",
+        None,
+    ),
+    (
+        "foo-bar",
+        dict(stdout="  4.00g\n"),
+        None,
+        "4.00g",
+    ),
+    (
+        "foo-bar",
+        dict(stdout="\n"),  # no matching VG
+        "vgs did not find this VG",
+        None,
+    )
+])
+def test_vg_free(pool, command_returns, raises, returns):
+    def execute_module(module_name, args, tmp=None, task_vars=None):
+        if module_name != "command":
+            raise ValueError("not expecting module " + module_name)
+        return command_returns
+
+    check = dummy_check(execute_module=execute_module)
+    if raises:
+        with pytest.raises(OpenShiftCheckException) as err:
+            check._get_vg_free(pool, {})
+        assert raises in str(err.value)
+    else:
+        ret = check._get_vg_free(pool, {})
+        assert ret == returns
+
+
+@pytest.mark.parametrize('string, expect_bytes', [
+    ("12", 12.0),
+    ("12 k", 12.0 * 1024),
+    ("42.42 MB", 42.42 * 1024**2),
+    ("12g", 12.0 * 1024**3),
+])
+def test_convert_to_bytes(string, expect_bytes):
+    got = DockerStorage._convert_to_bytes(string)
+    assert got == expect_bytes
+
+
+@pytest.mark.parametrize('string', [
+    "bork",
+    "42 Qs",
+])
+def test_convert_to_bytes_error(string):
+    with pytest.raises(ValueError) as err:
+        DockerStorage._convert_to_bytes(string)
+    assert "Cannot convert" in str(err.value)
+    assert string in str(err.value)
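
Reviewer note: assuming the repo's usual pytest setup, the new tests should run with
something like:

    pytest roles/openshift_health_checker/test/docker_storage_test.py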