Ver código fonte

add etcd cluster size check

juanvallejo 8 anos atrás
pai
commit
064bc167e3

+ 122 - 0
roles/openshift_health_checker/library/etcdkeysize.py

@@ -0,0 +1,122 @@
+#!/usr/bin/python
+"""Ansible module that recursively determines if the size of a key in an etcd cluster exceeds a given limit."""
+
+from ansible.module_utils.basic import AnsibleModule
+
+
+# python-etcd may be missing on the target host. Record the import error and
+# substitute a minimal stand-in exposing ``EtcdKeyNotFound`` (aliased to
+# KeyError) so that ``except etcd.EtcdKeyNotFound`` clauses in this file still
+# resolve. The stand-in has no ``Client`` attribute.
+try:
+    import etcd
+
+    IMPORT_EXCEPTION_MSG = None
+except ImportError as err:
+    IMPORT_EXCEPTION_MSG = str(err)
+
+    from collections import namedtuple
+    EtcdMock = namedtuple("etcd", ["EtcdKeyNotFound"])
+    etcd = EtcdMock(KeyError)
+
+
+# pylint: disable=too-many-arguments
+def check_etcd_key_size(client, key, size_limit, total_size=0, depth=0, depth_limit=1000, visited=None):
+    """Recursively add up the length of the values stored under an etcd key.
+
+    Args:
+        client: object whose ``read(key, recursive=False)`` returns a result
+            with a ``leaves`` iterable of nodes (``dir``, ``key``, ``value``).
+        key: etcd path at which to start.
+        size_limit: maximum cumulative size allowed. A falsy limit only
+            disables the early exit inside the loop; the final comparison
+            against the limit is still made.
+        total_size: size already accumulated by the caller; it counts
+            against ``size_limit``.
+        depth: current recursion depth.
+        depth_limit: depth at which inspecting a node raises.
+        visited: set of keys already read, shared down the recursion so a
+            key is never read twice.
+
+    Returns:
+        Tuple ``(size, limit_exceeded)`` where ``size`` is an int and
+        ``limit_exceeded`` a bool. ``size`` may be partial when the walk
+        stops early because the limit was exceeded.
+
+    Raises:
+        Exception: when a node is inspected at ``depth >= depth_limit``.
+    """
+    visited = set() if visited is None else visited
+
+    # A key seen before contributes nothing; this also breaks cycles.
+    if key in visited:
+        return 0, False
+    visited.add(key)
+
+    try:
+        result = client.read(key, recursive=False)
+    except etcd.EtcdKeyNotFound:
+        return 0, False
+
+    accumulated = 0
+    subtree_exceeded = False
+
+    for node in result.leaves:
+        if depth >= depth_limit:
+            raise Exception("Maximum recursive stack depth ({}) exceeded.".format(depth_limit))
+
+        running_total = total_size + accumulated
+        if size_limit and running_total > size_limit:
+            return accumulated, True
+
+        if node.dir:
+            # Directories are walked with the running total carried along.
+            subtree_size, subtree_exceeded = check_etcd_key_size(
+                client, node.key, size_limit, running_total,
+                depth + 1, depth_limit, visited)
+            accumulated += subtree_size
+        else:
+            accumulated += len(node.value)
+
+    return accumulated, subtree_exceeded or (total_size + accumulated > size_limit)
+
+
+def main():  # pylint: disable=missing-docstring,too-many-branches
+    """Module entry point: report whether the given etcd paths exceed ``size_limit_bytes``.
+
+    Every parameter other than ``size_limit_bytes`` and ``paths`` is passed
+    straight to ``etcd.Client``. The module exits with
+    ``size_limit_exceeded`` set, or fails with a message when the etcd
+    client cannot be created.
+    """
+    module = AnsibleModule(
+        argument_spec=dict(
+            size_limit_bytes=dict(type="int", default=0),
+            paths=dict(type="list", default=["/openshift.io/images"]),
+            host=dict(type="str", default="127.0.0.1"),
+            port=dict(type="int", default=4001),
+            protocol=dict(type="str", default="http"),
+            version_prefix=dict(type="str", default=""),
+            allow_redirect=dict(type="bool", default=False),
+            # An empty string is not a usable value for a dict-typed option;
+            # None means "no client certificate".
+            cert=dict(type="dict", default=None),
+            ca_cert=dict(type="str", default=None),
+        ),
+        supports_check_mode=True
+    )
+
+    # Turn the {"cert": ..., "key": ...} option into a (cert, key) pair, and
+    # leave it as None when no certificate was supplied.
+    cert = module.params["cert"]
+    module.params["cert"] = (cert["cert"], cert["key"]) if cert else None
+
+    size_limit = module.params.pop("size_limit_bytes")
+    paths = module.params.pop("paths")
+
+    limit_exceeded = False
+
+    try:
+        # pylint: disable=no-member
+        client = etcd.Client(**module.params)
+    except AttributeError as attrerr:
+        # The import fallback object has no ``Client`` attribute, so a failed
+        # ``import etcd`` surfaces here.
+        msg = str(attrerr)
+        if IMPORT_EXCEPTION_MSG:
+            msg = IMPORT_EXCEPTION_MSG
+            # Python 2 reports "No module named etcd"; Python 3 puts the
+            # module name in quotes. Match both spellings.
+            if "No module named etcd" in IMPORT_EXCEPTION_MSG.replace("'", ""):
+                # pylint: disable=redefined-variable-type
+                msg = ('Unable to import the python "etcd" dependency. '
+                       'Make sure python-etcd is installed on the host.')
+
+        # fail_json marks the result as failed and terminates the module.
+        module.fail_json(
+            msg=msg,
+            changed=False,
+            size_limit_exceeded=limit_exceeded,
+        )
+
+    size = 0
+    for path in paths:
+        # Each path is checked against whatever remains of the overall limit.
+        path_size, limit_exceeded = check_etcd_key_size(client, path, size_limit - size)
+        size += path_size
+
+        if limit_exceeded:
+            break
+
+    module.exit_json(
+        changed=False,
+        size_limit_exceeded=limit_exceeded,
+    )
+
+
+# Run the module only when this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()

+ 84 - 0
roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py

@@ -0,0 +1,84 @@
+"""
+Ansible module for determining if the size of OpenShift image data exceeds a specified limit in an etcd cluster.
+"""
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var
+
+
+class EtcdImageDataSize(OpenShiftCheck):
+    """Fail when the OpenShift image data held in etcd is larger than the recommended limit."""
+
+    name = "etcd_imagedata_size"
+    tags = ["etcd"]
+
+    def run(self, tmp, task_vars):
+        """Run the etcdkeysize module against each etcd host, returning on the first problem found."""
+        mount = self._get_etcd_mountpath(get_var(task_vars, "ansible_mounts"))
+        available_bytes = mount["size_available"]
+        total_bytes = mount["size_total"]
+
+        # Unless overridden, the limit is half of the space currently in use on the mount.
+        default_limit = int(0.5 * float(total_bytes - available_bytes))
+        size_limit = get_var(task_vars, "etcd_max_image_data_size_bytes", default=default_limit)
+
+        use_ssl = get_var(task_vars, "openshift", "master", "etcd_use_ssl", default=False)
+        port = get_var(task_vars, "openshift", "master", "etcd_port", default=2379)
+        hosts = get_var(task_vars, "openshift", "master", "etcd_hosts")
+
+        config_base = get_var(task_vars, "openshift", "common", "config_base")
+
+        client_cert = task_vars.get("etcd_client_cert", config_base + "/master/master.etcd-client.crt")
+        client_key = task_vars.get("etcd_client_key", config_base + "/master/master.etcd-client.key")
+        ca_cert = task_vars.get("etcd_client_ca_cert", config_base + "/master/master.etcd-ca.crt")
+
+        protocol = "https" if use_ssl else "http"
+
+        for etcd_host in list(hosts):
+            module_args = dict(
+                size_limit_bytes=size_limit,
+                paths=["/openshift.io/images", "/openshift.io/imagestreams"],
+                host=etcd_host,
+                port=port,
+                protocol=protocol,
+                version_prefix="/v2",
+                allow_redirect=True,
+                ca_cert=ca_cert,
+                cert=dict(cert=client_cert, key=client_key),
+            )
+
+            result = self.module_executor("etcdkeysize", module_args, task_vars)
+
+            if result.get("rc", 0) != 0 or result.get("failed"):
+                # Prefer the module's stderr over its msg when both are present.
+                reason = result["module_stderr"] if result.get("module_stderr") else result.get("msg")
+                msg = 'Failed to retrieve stats for etcd host "{host}": {reason}'.format(
+                    host=etcd_host, reason=reason)
+                return {"failed": True, "changed": False, "msg": msg}
+
+            if not result["size_limit_exceeded"]:
+                continue
+
+            msg = ("The size of OpenShift image data stored in etcd host "
+                   "\"{host}\" exceeds the maximum recommended limit of {limit:.2f} GB. "
+                   "Use the `oadm prune images` command to cleanup unused Docker images.")
+            return {"failed": True, "msg": msg.format(host=etcd_host, limit=self._to_gigabytes(size_limit))}
+
+        return {"changed": False}
+
+    @staticmethod
+    def _get_etcd_mountpath(ansible_mounts):
+        """Return the mount entry for the first candidate path found, most specific first.
+
+        Raises OpenShiftCheckException when none of the candidates is mounted.
+        """
+        mounts_by_path = {}
+        for mount in ansible_mounts:
+            mounts_by_path[mount.get("mount")] = mount
+
+        for candidate in ("/var/lib/etcd", "/var/lib", "/var", "/"):
+            if candidate in mounts_by_path:
+                return mounts_by_path[candidate]
+
+        mounted = ', '.join(sorted(mounts_by_path)) or 'none'
+        raise OpenShiftCheckException(
+            "Unable to determine a valid etcd mountpath. Paths mounted: {}.".format(mounted))
+
+    @staticmethod
+    def _to_gigabytes(byte_size):
+        """Convert a byte count to decimal gigabytes (10**9 bytes)."""
+        return float(byte_size) / 1e9

+ 328 - 0
roles/openshift_health_checker/test/etcd_imagedata_size_test.py

@@ -0,0 +1,328 @@
+import pytest
+
+from collections import namedtuple
+from openshift_checks.etcd_imagedata_size import EtcdImageDataSize, OpenShiftCheckException
+from etcdkeysize import check_etcd_key_size
+
+
+def fake_etcd_client(root):
+    """Build a stand-in etcd client whose ``read`` serves nodes from the given tree dict."""
+    nodes_by_key = {}
+    fake_etcd_node(root, nodes_by_key)
+
+    def read(key, recursive):  # pylint: disable=unused-argument
+        # An unknown key raises KeyError from the lookup.
+        return fake_etcd_result(nodes_by_key[key])
+
+    return namedtuple("client", ["read"])(read)
+
+
+def fake_etcd_result(fake_node):
+    """Wrap a fake node in a result object exposing ``leaves``.
+
+    A directory yields its own leaves; any other node yields itself.
+    """
+    leaves = fake_node.leaves if fake_node.dir else [fake_node]
+    return namedtuple("result", ["leaves"])(leaves)
+
+
+def fake_etcd_node(node, visited):
+    """Convert a nested dict describing an etcd node into a namedtuple tree.
+
+    Args:
+        node: dict with at least ``dir`` and ``key``; optionally ``value``
+            and ``leaves`` (a list of child node dicts).
+        visited: dict filled in with ``key -> converted node`` for every
+            node in the tree. Children are registered before their parent.
+
+    Returns:
+        The namedtuple built for ``node``.
+
+    Raises:
+        ValueError: if ``node`` lacks one of the required fields.
+    """
+    min_req_fields = ["dir", "key"]
+
+    # Validate before touching node["dir"], so a missing field is reported
+    # as the intended ValueError rather than a KeyError.
+    if set(min_req_fields) - set(node):
+        raise ValueError("fake etcd nodes require at least {} fields.".format(min_req_fields))
+
+    # Work on a copy: the input dicts are shared parametrize fixtures and
+    # must stay plain dicts so the same tree can be converted again.
+    attrs = dict(node)
+    if attrs.get("leaves"):
+        children = attrs["leaves"] if attrs["dir"] else []
+        attrs["leaves"] = [fake_etcd_node(child, visited) for child in children]
+
+    nodeclass = namedtuple("node", list(attrs))
+    nodeinst = nodeclass(**attrs)
+    visited[nodeinst.key] = nodeinst
+
+    return nodeinst
+
+
+@pytest.mark.parametrize('ansible_mounts,extra_words', [
+    ([], ['none']),  # empty ansible_mounts
+    ([{'mount': '/mnt'}], ['/mnt']),  # missing relevant mount paths
+])
+def test_cannot_determine_available_mountpath(ansible_mounts, extra_words):
+    """The check must raise when none of the candidate etcd mount points is mounted."""
+    check = EtcdImageDataSize(execute_module=fake_execute_module)
+
+    with pytest.raises(OpenShiftCheckException) as excinfo:
+        check.run(tmp=None, task_vars=dict(ansible_mounts=ansible_mounts))
+
+    message = str(excinfo.value)
+    expected_words = 'determine valid etcd mountpath'.split() + extra_words
+    for word in expected_words:
+        assert word in message
+
+
+@pytest.mark.parametrize('ansible_mounts,tree,size_limit,should_fail,extra_words', [
+    (
+        # test that default image size limit evals to 1/2 * (total size in use)
+        # 40 GB in use -> 20 GB default limit; 4 bytes of data stay below it
+        [{
+            'mount': '/',
+            'size_available': 40 * 10**9,
+            'size_total': 80 * 10**9,
+        }],
+        {"dir": False, "key": "/", "value": "1234"},
+        None,
+        False,
+        [],
+    ),
+    (
+        # 8 GB in use -> 4 GB default limit; 4 bytes of data stay below it
+        [{
+            'mount': '/',
+            'size_available': 40 * 10**9,
+            'size_total': 48 * 10**9,
+        }],
+        {"dir": False, "key": "/", "value": "1234"},
+        None,
+        False,
+        [],
+    ),
+    (
+        # set max size limit for image data to be below total node value
+        # total node value is defined as the sum of the value field
+        # from every node
+        [{
+            'mount': '/',
+            'size_available': 40 * 10**9,
+            'size_total': 48 * 10**9,
+        }],
+        {"dir": False, "key": "/", "value": "12345678"},
+        7,
+        True,
+        ["exceeds the maximum recommended limit", "0.00 GB"],
+    ),
+    (
+        # only 1 byte in use -> default limit int(0.5) == 0, so any stored
+        # data exceeds the limit
+        [{
+            'mount': '/',
+            'size_available': 48 * 10**9 - 1,
+            'size_total': 48 * 10**9,
+        }],
+        {"dir": False, "key": "/", "value": "1234"},
+        None,
+        True,
+        ["exceeds the maximum recommended limit", "0.00 GB"],
+    )
+])
+def test_check_etcd_key_size_calculates_correct_limit(ansible_mounts, tree, size_limit, should_fail, extra_words):
+    """Check pass/fail against the explicit limit or the default derived from disk usage."""
+    def execute_module(module_name, args, tmp=None, task_vars=None):
+        # Stand-in for module execution: run the size check against a fake
+        # client built from ``tree`` instead of a real etcd cluster.
+        if module_name != "etcdkeysize":
+            return {
+                "changed": False,
+            }
+
+        client = fake_etcd_client(tree)
+        s, limit_exceeded = check_etcd_key_size(client, tree["key"], args["size_limit_bytes"])
+
+        return {"size_limit_exceeded": limit_exceeded}
+
+    task_vars = dict(
+        etcd_max_image_data_size_bytes=size_limit,
+        ansible_mounts=ansible_mounts,
+        openshift=dict(
+            master=dict(etcd_hosts=["localhost"]),
+            common=dict(config_base="/var/lib/origin")
+        )
+    )
+    # A size_limit of None means "not configured": drop the variable so the
+    # check falls back to its default limit.
+    if size_limit is None:
+        task_vars.pop("etcd_max_image_data_size_bytes")
+
+    check = EtcdImageDataSize(execute_module=execute_module).run(tmp=None, task_vars=task_vars)
+
+    if should_fail:
+        assert check["failed"]
+
+        for word in extra_words:
+            assert word in check["msg"]
+    else:
+        assert not check.get("failed", False)
+
+
+@pytest.mark.parametrize('ansible_mounts,tree,root_path,expected_size,extra_words', [
+    (
+        [{
+            'mount': '/',
+            'size_available': 40 * 10**9,
+            'size_total': 80 * 10**9,
+        }],
+        # test recursive size check on tree with height > 1
+        # expected size: 4 * 4 + 3 * 5 + 2 * 3 = 37
+        {
+            "dir": True,
+            "key": "/",
+            "leaves": [
+                {"dir": False, "key": "/foo1", "value": "1234"},
+                {"dir": False, "key": "/foo2", "value": "1234"},
+                {"dir": False, "key": "/foo3", "value": "1234"},
+                {"dir": False, "key": "/foo4", "value": "1234"},
+                {
+                    "dir": True,
+                    "key": "/foo5",
+                    "leaves": [
+                        {"dir": False, "key": "/foo/bar1", "value": "56789"},
+                        {"dir": False, "key": "/foo/bar2", "value": "56789"},
+                        {"dir": False, "key": "/foo/bar3", "value": "56789"},
+                        {
+                            "dir": True,
+                            "key": "/foo/bar4",
+                            "leaves": [
+                                {"dir": False, "key": "/foo/bar/baz1", "value": "123"},
+                                {"dir": False, "key": "/foo/bar/baz2", "value": "123"},
+                            ]
+                        },
+                    ]
+                },
+            ]
+        },
+        "/",
+        37,
+        [],
+    ),
+    (
+        [{
+            'mount': '/',
+            'size_available': 40 * 10**9,
+            'size_total': 80 * 10**9,
+        }],
+        # test correct sub-tree size calculation
+        # starting at /foo5 only counts its subtree: 3 * 5 + 2 * 3 = 21
+        {
+            "dir": True,
+            "key": "/",
+            "leaves": [
+                {"dir": False, "key": "/foo1", "value": "1234"},
+                {"dir": False, "key": "/foo2", "value": "1234"},
+                {"dir": False, "key": "/foo3", "value": "1234"},
+                {"dir": False, "key": "/foo4", "value": "1234"},
+                {
+                    "dir": True,
+                    "key": "/foo5",
+                    "leaves": [
+                        {"dir": False, "key": "/foo/bar1", "value": "56789"},
+                        {"dir": False, "key": "/foo/bar2", "value": "56789"},
+                        {"dir": False, "key": "/foo/bar3", "value": "56789"},
+                        {
+                            "dir": True,
+                            "key": "/foo/bar4",
+                            "leaves": [
+                                {"dir": False, "key": "/foo/bar/baz1", "value": "123"},
+                                {"dir": False, "key": "/foo/bar/baz2", "value": "123"},
+                            ]
+                        },
+                    ]
+                },
+            ]
+        },
+        "/foo5",
+        21,
+        [],
+    ),
+    (
+        [{
+            'mount': '/',
+            'size_available': 40 * 10**9,
+            'size_total': 80 * 10**9,
+        }],
+        # test that a non-existing key is handled correctly
+        {
+            "dir": False,
+            "key": "/",
+            "value": "1234",
+        },
+        "/missing",
+        0,
+        [],
+    ),
+    (
+        [{
+            'mount': '/',
+            'size_available': 40 * 10**9,
+            'size_total': 80 * 10**9,
+        }],
+        # test etcd cycle handling
+        # the nested directory reuses the key "/", which has already been
+        # visited, so only the four top-level values count: 4 * 4 = 16
+        {
+            "dir": True,
+            "key": "/",
+            "leaves": [
+                {"dir": False, "key": "/foo1", "value": "1234"},
+                {"dir": False, "key": "/foo2", "value": "1234"},
+                {"dir": False, "key": "/foo3", "value": "1234"},
+                {"dir": False, "key": "/foo4", "value": "1234"},
+                {
+                    "dir": True,
+                    "key": "/",
+                    "leaves": [
+                        {"dir": False, "key": "/foo1", "value": "1"},
+                    ],
+                },
+            ]
+        },
+        "/",
+        16,
+        [],
+    ),
+])
+def test_etcd_key_size_check_calculates_correct_size(ansible_mounts, tree, root_path, expected_size, extra_words):
+    """Check that the size computed from ``root_path`` equals the sum of the value lengths beneath it."""
+    def execute_module(module_name, args, tmp=None, task_vars=None):
+        # Stand-in for module execution: the size assertion lives here
+        # because the check itself only reports the size_limit_exceeded flag.
+        if module_name != "etcdkeysize":
+            return {
+                "changed": False,
+            }
+
+        client = fake_etcd_client(tree)
+        size, limit_exceeded = check_etcd_key_size(client, root_path, args["size_limit_bytes"])
+
+        assert size == expected_size
+        return {
+            "size_limit_exceeded": limit_exceeded,
+        }
+
+    task_vars = dict(
+        ansible_mounts=ansible_mounts,
+        openshift=dict(
+            master=dict(etcd_hosts=["localhost"]),
+            common=dict(config_base="/var/lib/origin")
+        )
+    )
+
+    check = EtcdImageDataSize(execute_module=execute_module).run(tmp=None, task_vars=task_vars)
+    assert not check.get("failed", False)
+
+
+def test_etcdkeysize_module_failure():
+    """A non-zero ``rc`` from the etcdkeysize module must fail the check with a descriptive message."""
+    # Signature mirrors the execute_module stubs used by the other tests in this file.
+    def execute_module(module_name, args, tmp=None, task_vars=None):
+        if module_name != "etcdkeysize":
+            return {
+                "changed": False,
+            }
+
+        return {
+            "rc": 1,
+            "module_stderr": "failure",
+        }
+
+    task_vars = dict(
+        ansible_mounts=[{
+            'mount': '/',
+            'size_available': 40 * 10**9,
+            'size_total': 80 * 10**9,
+        }],
+        openshift=dict(
+            master=dict(etcd_hosts=["localhost"]),
+            common=dict(config_base="/var/lib/origin")
+        )
+    )
+
+    check = EtcdImageDataSize(execute_module=execute_module).run(tmp=None, task_vars=task_vars)
+
+    assert check["failed"]
+    # Split into words: iterating the bare string would only test single characters.
+    for word in "Failed to retrieve stats".split():
+        assert word in check["msg"]
+
+
+def fake_execute_module(*args):
+    """Guard stub: raise if a check ever tries to execute a module."""
+    message = 'this function should not be called'
+    raise AssertionError(message)