Parcourir la source

openshift_checks/docker_storage: overlay/2 support

fix bug 1469197
https://bugzilla.redhat.com/show_bug.cgi?id=1469197

When Docker is configured with the overlay or overlay2 storage driver,
check that it is supported and usage is below threshold.
Luke Meyer il y a 7 ans
Parent
commit
2b1c7492c3

+ 145 - 32
roles/openshift_health_checker/openshift_checks/docker_storage.py

@@ -1,5 +1,6 @@
 """Check Docker storage driver and usage."""
 """Check Docker storage driver and usage."""
 import json
 import json
+import os.path
 import re
 import re
 from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
 from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
 from openshift_checks.mixins import DockerHostMixin
 from openshift_checks.mixins import DockerHostMixin
@@ -20,10 +21,27 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
     storage_drivers = ["devicemapper", "overlay", "overlay2"]
     storage_drivers = ["devicemapper", "overlay", "overlay2"]
     max_thinpool_data_usage_percent = 90.0
     max_thinpool_data_usage_percent = 90.0
     max_thinpool_meta_usage_percent = 90.0
     max_thinpool_meta_usage_percent = 90.0
+    max_overlay_usage_percent = 90.0
+
+    # TODO(lmeyer): mention these in the output when check fails
+    configuration_variables = [
+        (
+            "max_thinpool_data_usage_percent",
+            "For 'devicemapper' storage driver, usage threshold percentage for data. "
+            "Format: float. Default: {:.1f}".format(max_thinpool_data_usage_percent),
+        ),
+        (
+            "max_thinpool_meta_usage_percent",
+            "For 'devicemapper' storage driver, usage threshold percentage for metadata. "
+            "Format: float. Default: {:.1f}".format(max_thinpool_meta_usage_percent),
+        ),
+        (
+            "max_overlay_usage_percent",
+            "For 'overlay' or 'overlay2' storage driver, usage threshold percentage. "
+            "Format: float. Default: {:.1f}".format(max_overlay_usage_percent),
+        ),
+    ]
 
 
-    # pylint: disable=too-many-return-statements
-    # Reason: permanent stylistic exception;
-    #         it is clearer to return on failures and there are just many ways to fail here.
     def run(self, tmp, task_vars):
     def run(self, tmp, task_vars):
         msg, failed, changed = self.ensure_dependencies(task_vars)
         msg, failed, changed = self.ensure_dependencies(task_vars)
         if failed:
         if failed:
@@ -34,17 +52,17 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
             }
             }
 
 
         # attempt to get the docker info hash from the API
         # attempt to get the docker info hash from the API
-        info = self.execute_module("docker_info", {}, task_vars=task_vars)
-        if info.get("failed"):
+        docker_info = self.execute_module("docker_info", {}, task_vars=task_vars)
+        if docker_info.get("failed"):
             return {"failed": True, "changed": changed,
             return {"failed": True, "changed": changed,
                     "msg": "Failed to query Docker API. Is docker running on this host?"}
                     "msg": "Failed to query Docker API. Is docker running on this host?"}
-        if not info.get("info"):  # this would be very strange
+        if not docker_info.get("info"):  # this would be very strange
             return {"failed": True, "changed": changed,
             return {"failed": True, "changed": changed,
-                    "msg": "Docker API query missing info:\n{}".format(json.dumps(info))}
-        info = info["info"]
+                    "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))}
+        docker_info = docker_info["info"]
 
 
         # check if the storage driver we saw is valid
         # check if the storage driver we saw is valid
-        driver = info.get("Driver", "[NONE]")
+        driver = docker_info.get("Driver", "[NONE]")
         if driver not in self.storage_drivers:
         if driver not in self.storage_drivers:
             msg = (
             msg = (
                 "Detected unsupported Docker storage driver '{driver}'.\n"
                 "Detected unsupported Docker storage driver '{driver}'.\n"
@@ -53,26 +71,34 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
             return {"failed": True, "changed": changed, "msg": msg}
             return {"failed": True, "changed": changed, "msg": msg}
 
 
         # driver status info is a list of tuples; convert to dict and validate based on driver
         # driver status info is a list of tuples; convert to dict and validate based on driver
-        driver_status = {item[0]: item[1] for item in info.get("DriverStatus", [])}
+        driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])}
+
+        result = {}
+
         if driver == "devicemapper":
         if driver == "devicemapper":
-            if driver_status.get("Data loop file"):
-                msg = (
-                    "Use of loopback devices with the Docker devicemapper storage driver\n"
-                    "(the default storage configuration) is unsupported in production.\n"
-                    "Please use docker-storage-setup to configure a backing storage volume.\n"
-                    "See http://red.ht/2rNperO for further information."
-                )
-                return {"failed": True, "changed": changed, "msg": msg}
-            result = self._check_dm_usage(driver_status, task_vars)
-            result['changed'] = result.get('changed', False) or changed
-            return result
+            result = self.check_devicemapper_support(driver_status, task_vars)
 
 
-        # TODO(lmeyer): determine how to check usage for overlay2
+        if driver in ['overlay', 'overlay2']:
+            result = self.check_overlay_support(docker_info, driver_status, task_vars)
 
 
-        return {"changed": changed}
+        result['changed'] = result.get('changed', False) or changed
+        return result
 
 
-    def _check_dm_usage(self, driver_status, task_vars):
-        """
+    def check_devicemapper_support(self, driver_status, task_vars):
+        """Check if dm storage driver is supported as configured. Return: result dict."""
+        if driver_status.get("Data loop file"):
+            msg = (
+                "Use of loopback devices with the Docker devicemapper storage driver\n"
+                "(the default storage configuration) is unsupported in production.\n"
+                "Please use docker-storage-setup to configure a backing storage volume.\n"
+                "See http://red.ht/2rNperO for further information."
+            )
+            return {"failed": True, "msg": msg}
+        result = self.check_dm_usage(driver_status, task_vars)
+        return result
+
+    def check_dm_usage(self, driver_status, task_vars):
+        """Check usage thresholds for Docker dm storage driver. Return: result dict.
         Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
         Backing assumptions: We expect devicemapper to be backed by an auto-expanding thin pool
         implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
         implemented as an LV in an LVM2 VG. This is how docker-storage-setup currently configures
         devicemapper storage. The LV is "thin" because it does not use all available storage
         devicemapper storage. The LV is "thin" because it does not use all available storage
@@ -83,7 +109,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
         could run out of space first; so we check both.
         could run out of space first; so we check both.
         """
         """
         vals = dict(
         vals = dict(
-            vg_free=self._get_vg_free(driver_status.get("Pool Name"), task_vars),
+            vg_free=self.get_vg_free(driver_status.get("Pool Name"), task_vars),
             data_used=driver_status.get("Data Space Used"),
             data_used=driver_status.get("Data Space Used"),
             data_total=driver_status.get("Data Space Total"),
             data_total=driver_status.get("Data Space Total"),
             metadata_used=driver_status.get("Metadata Space Used"),
             metadata_used=driver_status.get("Metadata Space Used"),
@@ -93,7 +119,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
         # convert all human-readable strings to bytes
         # convert all human-readable strings to bytes
         for key, value in vals.copy().items():
         for key, value in vals.copy().items():
             try:
             try:
-                vals[key + "_bytes"] = self._convert_to_bytes(value)
+                vals[key + "_bytes"] = self.convert_to_bytes(value)
             except ValueError as err:  # unlikely to hit this from API info, but just to be safe
             except ValueError as err:  # unlikely to hit this from API info, but just to be safe
                 return {
                 return {
                     "failed": True,
                     "failed": True,
@@ -131,10 +157,12 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
         vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
         vals["msg"] = "\n".join(messages or ["Thinpool usage is within thresholds."])
         return vals
         return vals
 
 
-    def _get_vg_free(self, pool, task_vars):
-        # Determine which VG to examine according to the pool name, the only indicator currently
-        # available from the Docker API driver info. We assume a name that looks like
-        # "vg--name-docker--pool"; vg and lv names with inner hyphens doubled, joined by a hyphen.
+    def get_vg_free(self, pool, task_vars):
+        """Determine which VG to examine according to the pool name. Return: size vgs reports.
+        Pool name is the only indicator currently available from the Docker API driver info.
+        We assume a name that looks like "vg--name-docker--pool";
+        vg and lv names with inner hyphens doubled, joined by a hyphen.
+        """
         match = re.match(r'((?:[^-]|--)+)-(?!-)', pool)  # matches up to the first single hyphen
         match = re.match(r'((?:[^-]|--)+)-(?!-)', pool)  # matches up to the first single hyphen
         if not match:  # unlikely, but... be clear if we assumed wrong
         if not match:  # unlikely, but... be clear if we assumed wrong
             raise OpenShiftCheckException(
             raise OpenShiftCheckException(
@@ -163,7 +191,8 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
         return size
         return size
 
 
     @staticmethod
     @staticmethod
-    def _convert_to_bytes(string):
+    def convert_to_bytes(string):
+        """Convert string like "10.3 G" to bytes (binary units assumed). Return: float bytes."""
         units = dict(
         units = dict(
             b=1,
             b=1,
             k=1024,
             k=1024,
@@ -183,3 +212,87 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):
             raise ValueError("Cannot convert to a byte size: " + string)
             raise ValueError("Cannot convert to a byte size: " + string)
 
 
         return float(number) * multiplier
         return float(number) * multiplier
+
+    def check_overlay_support(self, docker_info, driver_status, task_vars):
+        """Check if overlay storage driver is supported for this host. Return: result dict."""
+        # check for xfs as backing store
+        backing_fs = driver_status.get("Backing Filesystem", "[NONE]")
+        if backing_fs != "xfs":
+            msg = (
+                "Docker storage drivers 'overlay' and 'overlay2' are only supported with\n"
+                "'xfs' as the backing storage, but this host's storage is type '{fs}'."
+            ).format(fs=backing_fs)
+            return {"failed": True, "msg": msg}
+
+        # check support for OS and kernel version
+        o_s = docker_info.get("OperatingSystem", "[NONE]")
+        if "Red Hat Enterprise Linux" in o_s or "CentOS" in o_s:
+            # keep it simple, only check enterprise kernel versions; assume everyone else is good
+            kernel = docker_info.get("KernelVersion", "[NONE]")
+            kernel_arr = [int(num) for num in re.findall(r'\d+', kernel)]
+            if kernel_arr < [3, 10, 0, 514]:  # rhel < 7.3
+                msg = (
+                    "Docker storage drivers 'overlay' and 'overlay2' are only supported beginning with\n"
+                    "kernel version 3.10.0-514; but Docker reports kernel version {version}."
+                ).format(version=kernel)
+                return {"failed": True, "msg": msg}
+            # NOTE: we could check for --selinux-enabled here but docker won't even start with
+            # that option until it's supported in the kernel so we don't need to.
+
+        return self.check_overlay_usage(docker_info, task_vars)
+
+    def check_overlay_usage(self, docker_info, task_vars):
+        """Check disk usage on OverlayFS backing store volume. Return: result dict."""
+        path = docker_info.get("DockerRootDir", "/var/lib/docker") + "/" + docker_info["Driver"]
+
+        threshold = get_var(task_vars, "max_overlay_usage_percent", default=self.max_overlay_usage_percent)
+        try:
+            threshold = float(threshold)
+        except ValueError:
+            return {
+                "failed": True,
+                "msg": "Specified 'max_overlay_usage_percent' is not a percentage: {}".format(threshold),
+            }
+
+        mount = self.find_ansible_mount(path, get_var(task_vars, "ansible_mounts"))
+        try:
+            free_bytes = mount['size_available']
+            total_bytes = mount['size_total']
+            usage = 100.0 * (total_bytes - free_bytes) / total_bytes
+        except (KeyError, ZeroDivisionError):
+            return {
+                "failed": True,
+                "msg": "The ansible_mount found for path {} is invalid.\n"
+                       "This is likely to be an Ansible bug. The record was:\n"
+                       "{}".format(path, json.dumps(mount, indent=2)),
+            }
+
+        if usage > threshold:
+            return {
+                "failed": True,
+                "msg": (
+                    "For Docker OverlayFS mount point {path},\n"
+                    "usage percentage {pct:.1f} is higher than threshold {thresh:.1f}."
+                ).format(path=mount["mount"], pct=usage, thresh=threshold)
+            }
+
+        return {}
+
+    # TODO(lmeyer): migrate to base class
+    @staticmethod
+    def find_ansible_mount(path, ansible_mounts):
+        """Return the mount point for path from ansible_mounts."""
+
+        mount_for_path = {mount['mount']: mount for mount in ansible_mounts}
+        mount_point = path
+        while mount_point not in mount_for_path:
+            if mount_point in ["/", ""]:  # "/" not in ansible_mounts???
+                break
+            mount_point = os.path.dirname(mount_point)
+
+        try:
+            return mount_for_path[mount_point]
+        except KeyError:
+            known_mounts = ', '.join('"{}"'.format(mount) for mount in sorted(mount_for_path)) or 'none'
+            msg = 'Unable to determine mount point for path "{}". Known mount points: {}.'
+            raise OpenShiftCheckException(msg.format(path, known_mounts))

+ 97 - 10
roles/openshift_health_checker/test/docker_storage_test.py

@@ -23,7 +23,8 @@ def test_is_active(is_containerized, group_names, is_active):
     assert DockerStorage.is_active(task_vars=task_vars) == is_active
     assert DockerStorage.is_active(task_vars=task_vars) == is_active
 
 
 
 
-non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
+def non_atomic_task_vars():
+    return {"openshift": {"common": {"is_atomic": False}}}
 
 
 
 
 @pytest.mark.parametrize('docker_info, failed, expect_msg', [
 @pytest.mark.parametrize('docker_info, failed, expect_msg', [
@@ -56,7 +57,7 @@ non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
     (
     (
         dict(info={
         dict(info={
             "Driver": "overlay2",
             "Driver": "overlay2",
-            "DriverStatus": []
+            "DriverStatus": [("Backing Filesystem", "xfs")],
         }),
         }),
         False,
         False,
         [],
         [],
@@ -64,6 +65,27 @@ non_atomic_task_vars = {"openshift": {"common": {"is_atomic": False}}}
     (
     (
         dict(info={
         dict(info={
             "Driver": "overlay",
             "Driver": "overlay",
+            "DriverStatus": [("Backing Filesystem", "btrfs")],
+        }),
+        True,
+        ["storage is type 'btrfs'", "only supported with\n'xfs'"],
+    ),
+    (
+        dict(info={
+            "Driver": "overlay2",
+            "DriverStatus": [("Backing Filesystem", "xfs")],
+            "OperatingSystem": "Red Hat Enterprise Linux Server release 7.2 (Maipo)",
+            "KernelVersion": "3.10.0-327.22.2.el7.x86_64",
+        }),
+        True,
+        ["Docker reports kernel version 3.10.0-327"],
+    ),
+    (
+        dict(info={
+            "Driver": "overlay",
+            "DriverStatus": [("Backing Filesystem", "xfs")],
+            "OperatingSystem": "CentOS",
+            "KernelVersion": "3.10.0-514",
         }),
         }),
         False,
         False,
         [],
         [],
@@ -85,8 +107,9 @@ def test_check_storage_driver(docker_info, failed, expect_msg):
         return docker_info
         return docker_info
 
 
     check = dummy_check(execute_module=execute_module)
     check = dummy_check(execute_module=execute_module)
-    check._check_dm_usage = lambda status, task_vars: dict()  # stub out for this test
-    result = check.run(tmp=None, task_vars=non_atomic_task_vars)
+    check.check_dm_usage = lambda status, task_vars: dict()  # stub out for this test
+    check.check_overlay_usage = lambda info, task_vars: dict()  # stub out for this test
+    result = check.run(tmp=None, task_vars=non_atomic_task_vars())
 
 
     if failed:
     if failed:
         assert result["failed"]
         assert result["failed"]
@@ -146,8 +169,8 @@ not_enough_space = {
 ])
 ])
 def test_dm_usage(task_vars, driver_status, vg_free, success, expect_msg):
 def test_dm_usage(task_vars, driver_status, vg_free, success, expect_msg):
     check = dummy_check()
     check = dummy_check()
-    check._get_vg_free = lambda pool, task_vars: vg_free
-    result = check._check_dm_usage(driver_status, task_vars)
+    check.get_vg_free = lambda pool, task_vars: vg_free
+    result = check.check_dm_usage(driver_status, task_vars)
     result_success = not result.get("failed")
     result_success = not result.get("failed")
 
 
     assert result_success is success
     assert result_success is success
@@ -195,10 +218,10 @@ def test_vg_free(pool, command_returns, raises, returns):
     check = dummy_check(execute_module=execute_module)
     check = dummy_check(execute_module=execute_module)
     if raises:
     if raises:
         with pytest.raises(OpenShiftCheckException) as err:
         with pytest.raises(OpenShiftCheckException) as err:
-            check._get_vg_free(pool, {})
+            check.get_vg_free(pool, {})
         assert raises in str(err.value)
         assert raises in str(err.value)
     else:
     else:
-        ret = check._get_vg_free(pool, {})
+        ret = check.get_vg_free(pool, {})
         assert ret == returns
         assert ret == returns
 
 
 
 
@@ -209,7 +232,7 @@ def test_vg_free(pool, command_returns, raises, returns):
     ("12g", 12.0 * 1024**3),
     ("12g", 12.0 * 1024**3),
 ])
 ])
 def test_convert_to_bytes(string, expect_bytes):
 def test_convert_to_bytes(string, expect_bytes):
-    got = DockerStorage._convert_to_bytes(string)
+    got = DockerStorage.convert_to_bytes(string)
     assert got == expect_bytes
     assert got == expect_bytes
 
 
 
 
@@ -219,6 +242,70 @@ def test_convert_to_bytes(string, expect_bytes):
 ])
 ])
 def test_convert_to_bytes_error(string):
 def test_convert_to_bytes_error(string):
     with pytest.raises(ValueError) as err:
     with pytest.raises(ValueError) as err:
-        DockerStorage._convert_to_bytes(string)
+        DockerStorage.convert_to_bytes(string)
     assert "Cannot convert" in str(err.value)
     assert "Cannot convert" in str(err.value)
     assert string in str(err.value)
     assert string in str(err.value)
+
+
+ansible_mounts_enough = [{
+    'mount': '/var/lib/docker',
+    'size_available': 50 * 10**9,
+    'size_total': 50 * 10**9,
+}]
+ansible_mounts_not_enough = [{
+    'mount': '/var/lib/docker',
+    'size_available': 0,
+    'size_total': 50 * 10**9,
+}]
+ansible_mounts_missing_fields = [dict(mount='/var/lib/docker')]
+ansible_mounts_zero_size = [{
+    'mount': '/var/lib/docker',
+    'size_available': 0,
+    'size_total': 0,
+}]
+
+
+@pytest.mark.parametrize('ansible_mounts, threshold, expect_fail, expect_msg', [
+    (
+        ansible_mounts_enough,
+        None,
+        False,
+        [],
+    ),
+    (
+        ansible_mounts_not_enough,
+        None,
+        True,
+        ["usage percentage", "higher than threshold"],
+    ),
+    (
+        ansible_mounts_not_enough,
+        "bogus percent",
+        True,
+        ["is not a percentage"],
+    ),
+    (
+        ansible_mounts_missing_fields,
+        None,
+        True,
+        ["Ansible bug"],
+    ),
+    (
+        ansible_mounts_zero_size,
+        None,
+        True,
+        ["Ansible bug"],
+    ),
+])
+def test_overlay_usage(ansible_mounts, threshold, expect_fail, expect_msg):
+    check = dummy_check()
+    task_vars = non_atomic_task_vars()
+    task_vars["ansible_mounts"] = ansible_mounts
+    if threshold is not None:
+        task_vars["max_overlay_usage_percent"] = threshold
+    docker_info = dict(DockerRootDir="/var/lib/docker", Driver="overlay")
+    result = check.check_overlay_usage(docker_info, task_vars)
+
+    assert expect_fail == bool(result.get("failed"))
+    for msg in expect_msg:
+        assert msg in result["msg"]