Merge pull request #4161 from rhcarvalho/integrate-checks-with-install

Verify memory and disk requirements before install
Scott Dodson committed 7 years ago
commit e8c8ed5404

+ 13 - 0
playbooks/byo/openshift-cluster/config.yml

@@ -3,6 +3,19 @@
   tags:
   - always
 
+- name: Verify Requirements
+  hosts: OSEv3
+  roles:
+  - openshift_health_checker
+  vars:
+  - r_openshift_health_checker_playbook_context: "install"
+  post_tasks:
+  - action: openshift_health_check
+    args:
+      checks:
+      - disk_availability
+      - memory_availability
+
 - include: ../../common/openshift-cluster/std_include.yml
   tags:
   - always
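
Note (not part of the commit): the two check names listed above are handed to the
openshift_health_check action plugin changed below, which records one entry per check
in the task result. A minimal sketch of the per-host result shape, with hypothetical
outcomes for illustration:

    # Sketch of the result dict the action plugin returns for this play.
    result = {
        'playbook_context': 'install',
        'checks': {
            'disk_availability': {'skipped': True, 'skipped_reason': 'Disabled by user request'},
            'memory_availability': {'failed': True, 'msg': '... below recommended ...'},  # wording comes from the check
        },
    }

Failing check names then appear in the failure summary printed by the callback plugin,
and can be disabled with the openshift_disable_check variable introduced in this change.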

+ 6 - 5
playbooks/common/openshift-checks/health.yml

@@ -2,9 +2,10 @@
 - name: Run OpenShift health checks
   hosts: OSEv3
   roles:
-    - openshift_health_checker
+  - openshift_health_checker
+  vars:
+  - r_openshift_health_checker_playbook_context: "health"
   post_tasks:
-    - action: openshift_health_check  # https://github.com/ansible/ansible/issues/20513
-      args:
-        checks:
-          - '@health'
+  - action: openshift_health_check  # https://github.com/ansible/ansible/issues/20513
+    args:
+      checks: ['@health']

+ 6 - 5
playbooks/common/openshift-checks/pre-install.yml

@@ -2,9 +2,10 @@
 - hosts: OSEv3
   name: run OpenShift pre-install checks
   roles:
-    - openshift_health_checker
+  - openshift_health_checker
+  vars:
+  - r_openshift_health_checker_playbook_context: "pre-install"
   post_tasks:
-    - action: openshift_health_check  # https://github.com/ansible/ansible/issues/20513
-      args:
-        checks:
-          - '@preflight'
+  - action: openshift_health_check  # https://github.com/ansible/ansible/issues/20513
+    args:
+      checks: ['@preflight']

+ 18 - 8
roles/openshift_health_checker/action_plugins/openshift_health_check.py

@@ -25,9 +25,11 @@ class ActionModule(ActionBase):
 
     def run(self, tmp=None, task_vars=None):
         result = super(ActionModule, self).run(tmp, task_vars)
+        task_vars = task_vars or {}
 
-        if task_vars is None:
-            task_vars = {}
+        # task vars are not accessible in a supported way from the callback plugin,
+        # so record any values it will need in the result.
+        result['playbook_context'] = task_vars.get('r_openshift_health_checker_playbook_context')
 
         if "openshift" not in task_vars:
             result["failed"] = True
@@ -46,19 +48,27 @@ class ActionModule(ActionBase):
 
         result["checks"] = check_results = {}
 
+        user_disabled_checks = [
+            check.strip()
+            for check in task_vars.get("openshift_disable_check", "").split(",")
+        ]
+
         for check_name in resolved_checks:
             display.banner("CHECK [{} : {}]".format(check_name, task_vars["ansible_host"]))
             check = known_checks[check_name]
 
-            if check.is_active(task_vars):
+            if not check.is_active(task_vars):
+                r = dict(skipped=True, skipped_reason="Not active for this host")
+            elif check_name in user_disabled_checks:
+                r = dict(skipped=True, skipped_reason="Disabled by user request")
+            else:
                 try:
                     r = check.run(tmp, task_vars)
                 except OpenShiftCheckException as e:
-                    r = {}
-                    r["failed"] = True
-                    r["msg"] = str(e)
-            else:
-                r = {"skipped": True}
+                    r = dict(
+                        failed=True,
+                        msg=str(e),
+                    )
 
             check_results[check_name] = r
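
Note (not part of the commit): openshift_disable_check is parsed as a comma-separated
list with surrounding whitespace stripped. A standalone sketch of the parsing and the
skip decision added above, using hypothetical check names:

    task_vars = {'openshift_disable_check': 'disk_availability, memory_availability'}

    user_disabled_checks = [
        check.strip()
        for check in task_vars.get('openshift_disable_check', '').split(',')
    ]
    # -> ['disk_availability', 'memory_availability']

    for check_name in ['docker_storage', 'memory_availability']:  # hypothetical resolved checks
        if check_name in user_disabled_checks:
            print(check_name, '-> skipped: Disabled by user request')
        else:
            print(check_name, '-> runs')

When the variable is unset, ''.split(',') yields [''], so the disabled list is [''],
which matches no real check name and every resolved check still runs.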
 

+ 65 - 22
roles/openshift_health_checker/callback_plugins/zz_failure_summary.py

@@ -2,6 +2,12 @@
 Ansible callback plugin.
 '''
 
+# Reason: In several locations below we disable pylint protected-access
+#         for Ansible objects that do not give us any public way
+#         to access the full details we need to report check failures.
+# Status: disabled permanently or until Ansible object has a public API.
+# This does leave the code more likely to be broken by future Ansible changes.
+
 from pprint import pformat
 
 from ansible.plugins.callback import CallbackBase
@@ -20,38 +26,37 @@ class CallbackModule(CallbackBase):
     CALLBACK_TYPE = 'aggregate'
     CALLBACK_NAME = 'failure_summary'
     CALLBACK_NEEDS_WHITELIST = False
+    _playbook_file = None
 
     def __init__(self):
         super(CallbackModule, self).__init__()
         self.__failures = []
 
+    def v2_playbook_on_start(self, playbook):
+        super(CallbackModule, self).v2_playbook_on_start(playbook)
+        # re: playbook attrs see top comment  # pylint: disable=protected-access
+        self._playbook_file = playbook._file_name
+
     def v2_runner_on_failed(self, result, ignore_errors=False):
         super(CallbackModule, self).v2_runner_on_failed(result, ignore_errors)
         self.__failures.append(dict(result=result, ignore_errors=ignore_errors))
 
     def v2_playbook_on_stats(self, stats):
         super(CallbackModule, self).v2_playbook_on_stats(stats)
-        # TODO: update condition to consider a host var or env var to
-        # enable/disable the summary, so that we can control the output from a
-        # play.
         if self.__failures:
-            self._print_failure_summary()
+            self._print_failure_details(self.__failures)
 
-    def _print_failure_summary(self):
-        '''Print a summary of failed tasks (including ignored failures).'''
+    def _print_failure_details(self, failures):
+        '''Print a summary of failed tasks or checks.'''
         self._display.display(u'\nFailure summary:\n')
 
-        # TODO: group failures by host or by task. If grouped by host, it is
-        # easy to see all problems of a given host. If grouped by task, it is
-        # easy to see what hosts needs the same fix.
-
-        width = len(str(len(self.__failures)))
+        width = len(str(len(failures)))
         initial_indent_format = u'  {{:>{width}}}. '.format(width=width)
         initial_indent_len = len(initial_indent_format.format(0))
         subsequent_indent = u' ' * initial_indent_len
         subsequent_extra_indent = u' ' * (initial_indent_len + 10)
 
-        for i, failure in enumerate(self.__failures, 1):
+        for i, failure in enumerate(failures, 1):
             entries = _format_failure(failure)
             self._display.display(u'\n{}{}'.format(initial_indent_format.format(i), entries[0]))
             for entry in entries[1:]:
@@ -59,11 +64,52 @@ class CallbackModule(CallbackBase):
                 indented = u'{}{}'.format(subsequent_indent, entry)
                 self._display.display(indented)
 
-
-# Reason: disable pylint protected-access because we need to access _*
-#         attributes of a task result to implement this method.
-# Status: permanently disabled unless Ansible's API changes.
-# pylint: disable=protected-access
+        failed_checks = set()
+        playbook_context = None
+        # re: result attrs see top comment  # pylint: disable=protected-access
+        for failure in failures:
+            # get context from check task result since callback plugins cannot access task vars
+            playbook_context = playbook_context or failure['result']._result.get('playbook_context')
+            failed_checks.update(
+                name
+                for name, result in failure['result']._result.get('checks', {}).items()
+                if result.get('failed')
+            )
+        if failed_checks:
+            self._print_check_failure_summary(failed_checks, playbook_context)
+
+    def _print_check_failure_summary(self, failed_checks, context):
+        checks = ','.join(sorted(failed_checks))
+        # NOTE: context is not set if all failures occurred prior to checks task
+        summary = (
+            '\n'
+            'The execution of "{playbook}"\n'
+            'includes checks designed to fail early if the requirements\n'
+            'of the playbook are not met. One or more of these checks\n'
+            'failed. To disregard these results, you may choose to\n'
+            'disable failing checks by setting an Ansible variable:\n\n'
+            '   openshift_disable_check={checks}\n\n'
+            'Failing check names are shown in the failure details above.\n'
+            'Some checks may be configurable by variables if your requirements\n'
+            'are different from the defaults; consult check documentation.\n'
+            'Variables can be set in the inventory or passed on the\n'
+            'command line using the -e flag to ansible-playbook.\n'
+        ).format(playbook=self._playbook_file, checks=checks)
+        if context in ['pre-install', 'health']:
+            summary = (
+                '\n'
+                'You may choose to configure or disable failing checks by\n'
+                'setting Ansible variables. To disable those above:\n\n'
+                '    openshift_disable_check={checks}\n\n'
+                'Consult check documentation for configurable variables.\n'
+                'Variables can be set in the inventory or passed on the\n'
+                'command line using the -e flag to ansible-playbook.\n'
+            ).format(checks=checks)
+        # other expected contexts: install, upgrade
+        self._display.display(summary)
+
+
+# re: result attrs see top comment  # pylint: disable=protected-access
 def _format_failure(failure):
     '''Return a list of pretty-formatted text entries describing a failure, including
     relevant information about it. Expect that the list of text entries will be joined
@@ -100,11 +146,8 @@ def _format_failed_checks(checks):
         return stringc(pformat(checks), C.COLOR_ERROR)
 
 
-# Reason: disable pylint protected-access because we need to access _*
-#         attributes of obj to implement this function.
-#         This is inspired by ansible.playbook.base.Base.dump_me.
-# Status: permanently disabled unless Ansible's API changes.
-# pylint: disable=protected-access
+# This is inspired by ansible.playbook.base.Base.dump_me.
+# re: play/task/block attrs see top comment  # pylint: disable=protected-access
 def _get_play(obj):
     '''Given a task or block, recursively tries to find its parent play.'''
     if hasattr(obj, '_play'):
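
Note (not part of the commit): because callback plugins cannot read task vars, the
action plugin copies the playbook context into each result, and the summary above
recovers it together with the names of failed checks. A minimal sketch of that
aggregation, with plain dicts standing in for Ansible's TaskResult._result payloads:

    failures = [
        {'result': {'playbook_context': 'install',
                    'checks': {'memory_availability': {'failed': True, 'msg': '...'},
                               'disk_availability': {'skipped': True}}}},
    ]

    failed_checks = set()
    playbook_context = None
    for failure in failures:
        res = failure['result']  # in the plugin this is failure['result']._result
        playbook_context = playbook_context or res.get('playbook_context')
        failed_checks.update(
            name for name, result in res.get('checks', {}).items()
            if result.get('failed')
        )

    print(playbook_context, sorted(failed_checks))  # install ['memory_availability']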

+ 4 - 2
roles/openshift_health_checker/openshift_checks/disk_availability.py

@@ -27,10 +27,12 @@ class DiskAvailability(NotContainerizedMixin, OpenShiftCheck):
     def run(self, tmp, task_vars):
         group_names = get_var(task_vars, "group_names")
         ansible_mounts = get_var(task_vars, "ansible_mounts")
-
-        min_free_bytes = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names)
         free_bytes = self.openshift_available_disk(ansible_mounts)
 
+        recommended_min = max(self.recommended_disk_space_bytes.get(name, 0) for name in group_names)
+        configured_min = int(get_var(task_vars, "openshift_check_min_host_disk_gb", default=0)) * 10**9
+        min_free_bytes = configured_min or recommended_min
+
         if free_bytes < min_free_bytes:
             return {
                 'failed': True,
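
Note (not part of the commit): a non-zero openshift_check_min_host_disk_gb overrides
the recommended per-group minimum; a zero or unset value falls back to the largest
recommendation among the host's groups (40/15/20 GB for masters/nodes/etcd, as the
tests below imply). A short worked sketch of that resolution:

    recommended_disk_space_bytes = {'masters': 40 * 10**9, 'nodes': 15 * 10**9, 'etcd': 20 * 10**9}
    group_names = ['masters']

    recommended_min = max(recommended_disk_space_bytes.get(name, 0) for name in group_names)  # 40 GB
    configured_min = 100 * 10**9  # e.g. openshift_check_min_host_disk_gb=100
    min_free_bytes = configured_min or recommended_min  # 100 GB; with the var unset, 40 GB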

+ 4 - 2
roles/openshift_health_checker/openshift_checks/memory_availability.py

@@ -13,7 +13,7 @@ class MemoryAvailability(OpenShiftCheck):
     recommended_memory_bytes = {
         "masters": 16 * 10**9,
         "nodes": 8 * 10**9,
-        "etcd": 20 * 10**9,
+        "etcd": 8 * 10**9,
     }
 
     @classmethod
@@ -27,7 +27,9 @@ class MemoryAvailability(OpenShiftCheck):
         group_names = get_var(task_vars, "group_names")
         total_memory_bytes = get_var(task_vars, "ansible_memtotal_mb") * 10**6
 
-        min_memory_bytes = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
+        recommended_min = max(self.recommended_memory_bytes.get(name, 0) for name in group_names)
+        configured_min = int(get_var(task_vars, "openshift_check_min_host_memory_gb", default=0)) * 10**9
+        min_memory_bytes = configured_min or recommended_min
 
         if total_memory_bytes < min_memory_bytes:
             return {
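
Note (not part of the commit): the memory check resolves its threshold the same way,
with openshift_check_min_host_memory_gb taking precedence when non-zero, and compares
it against ansible_memtotal_mb converted to bytes. A worked example matching the new
etcd+masters test case below:

    recommended_memory_bytes = {'masters': 16 * 10**9, 'nodes': 8 * 10**9, 'etcd': 8 * 10**9}
    group_names = ['etcd', 'masters']

    recommended_min = max(recommended_memory_bytes.get(name, 0) for name in group_names)  # 16 GB
    configured_min = 0  # openshift_check_min_host_memory_gb unset
    min_memory_bytes = configured_min or recommended_min

    total_memory_bytes = 9000 * 10**6  # ansible_memtotal_mb=9000 -> 9.0 GB
    assert total_memory_bytes < min_memory_bytes  # enough for etcd alone, not for a master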

+ 15 - 1
roles/openshift_health_checker/test/action_plugin_test.py

@@ -67,6 +67,7 @@ def changed(result):
     return result.get('changed', False)
 
 
+# tests whether task is skipped, not individual checks
 def skipped(result):
     return result.get('skipped', False)
 
@@ -101,7 +102,20 @@ def test_action_plugin_skip_non_active_checks(plugin, task_vars, monkeypatch):
 
     result = plugin.run(tmp=None, task_vars=task_vars)
 
-    assert result['checks']['fake_check'] == {'skipped': True}
+    assert result['checks']['fake_check'] == dict(skipped=True, skipped_reason="Not active for this host")
+    assert not failed(result)
+    assert not changed(result)
+    assert not skipped(result)
+
+
+def test_action_plugin_skip_disabled_checks(plugin, task_vars, monkeypatch):
+    checks = [fake_check('fake_check', is_active=True)]
+    monkeypatch.setattr('openshift_checks.OpenShiftCheck.subclasses', classmethod(lambda cls: checks))
+
+    task_vars['openshift_disable_check'] = 'fake_check'
+    result = plugin.run(tmp=None, task_vars=task_vars)
+
+    assert result['checks']['fake_check'] == dict(skipped=True, skipped_reason="Disabled by user request")
     assert not failed(result)
     assert not changed(result)
     assert not skipped(result)

+ 32 - 4
roles/openshift_health_checker/test/disk_availability_test.py

@@ -42,9 +42,10 @@ def test_cannot_determine_available_disk(ansible_mounts, extra_words):
         assert word in str(excinfo.value)
 
 
-@pytest.mark.parametrize('group_names,ansible_mounts', [
+@pytest.mark.parametrize('group_names,configured_min,ansible_mounts', [
     (
         ['masters'],
+        0,
         [{
             'mount': '/',
             'size_available': 40 * 10**9 + 1,
@@ -52,6 +53,7 @@ def test_cannot_determine_available_disk(ansible_mounts, extra_words):
     ),
     (
         ['nodes'],
+        0,
         [{
             'mount': '/',
             'size_available': 15 * 10**9 + 1,
@@ -59,6 +61,7 @@ def test_cannot_determine_available_disk(ansible_mounts, extra_words):
     ),
     (
         ['etcd'],
+        0,
         [{
             'mount': '/',
             'size_available': 20 * 10**9 + 1,
@@ -66,6 +69,15 @@ def test_cannot_determine_available_disk(ansible_mounts, extra_words):
     ),
     (
         ['etcd'],
+        1,  # configure lower threshold
+        [{
+            'mount': '/',
+            'size_available': 1 * 10**9 + 1,  # way smaller than recommended
+        }],
+    ),
+    (
+        ['etcd'],
+        0,
         [{
             # not enough space on / ...
             'mount': '/',
@@ -77,9 +89,10 @@ def test_cannot_determine_available_disk(ansible_mounts, extra_words):
         }],
     ),
 ])
-def test_succeeds_with_recommended_disk_space(group_names, ansible_mounts):
+def test_succeeds_with_recommended_disk_space(group_names, configured_min, ansible_mounts):
     task_vars = dict(
         group_names=group_names,
+        openshift_check_min_host_disk_gb=configured_min,
         ansible_mounts=ansible_mounts,
     )
 
@@ -89,9 +102,10 @@ def test_succeeds_with_recommended_disk_space(group_names, ansible_mounts):
     assert not result.get('failed', False)
 
 
-@pytest.mark.parametrize('group_names,ansible_mounts,extra_words', [
+@pytest.mark.parametrize('group_names,configured_min,ansible_mounts,extra_words', [
     (
         ['masters'],
+        0,
         [{
             'mount': '/',
             'size_available': 1,
@@ -99,7 +113,17 @@ def test_succeeds_with_recommended_disk_space(group_names, ansible_mounts):
         ['0.0 GB'],
     ),
     (
+        ['masters'],
+        100,  # set a higher threshold
+        [{
+            'mount': '/',
+            'size_available': 50 * 10**9,  # would normally be enough...
+        }],
+        ['100.0 GB'],
+    ),
+    (
         ['nodes'],
+        0,
         [{
             'mount': '/',
             'size_available': 1 * 10**9,
@@ -108,6 +132,7 @@ def test_succeeds_with_recommended_disk_space(group_names, ansible_mounts):
     ),
     (
         ['etcd'],
+        0,
         [{
             'mount': '/',
             'size_available': 1,
@@ -116,6 +141,7 @@ def test_succeeds_with_recommended_disk_space(group_names, ansible_mounts):
     ),
     (
         ['nodes', 'masters'],
+        0,
         [{
             'mount': '/',
             # enough space for a node, not enough for a master
@@ -125,6 +151,7 @@ def test_succeeds_with_recommended_disk_space(group_names, ansible_mounts):
     ),
     (
         ['etcd'],
+        0,
         [{
             # enough space on / ...
             'mount': '/',
@@ -137,9 +164,10 @@ def test_succeeds_with_recommended_disk_space(group_names, ansible_mounts):
         ['0.0 GB'],
     ),
 ])
-def test_fails_with_insufficient_disk_space(group_names, ansible_mounts, extra_words):
+def test_fails_with_insufficient_disk_space(group_names, configured_min, ansible_mounts, extra_words):
     task_vars = dict(
         group_names=group_names,
+        openshift_check_min_host_disk_gb=configured_min,
         ansible_mounts=ansible_mounts,
     )
 

+ 35 - 8
roles/openshift_health_checker/test/memory_availability_test.py

@@ -20,27 +20,37 @@ def test_is_active(group_names, is_active):
     assert MemoryAvailability.is_active(task_vars=task_vars) == is_active
 
 
-@pytest.mark.parametrize('group_names,ansible_memtotal_mb', [
+@pytest.mark.parametrize('group_names,configured_min,ansible_memtotal_mb', [
     (
         ['masters'],
+        0,
         17200,
     ),
     (
         ['nodes'],
+        0,
         8200,
     ),
     (
+        ['nodes'],
+        1,  # configure lower threshold
+        2000,  # too low for recommended but not for configured
+    ),
+    (
         ['etcd'],
-        22200,
+        0,
+        8200,
     ),
     (
         ['masters', 'nodes'],
+        0,
         17000,
     ),
 ])
-def test_succeeds_with_recommended_memory(group_names, ansible_memtotal_mb):
+def test_succeeds_with_recommended_memory(group_names, configured_min, ansible_memtotal_mb):
     task_vars = dict(
         group_names=group_names,
+        openshift_check_min_host_memory_gb=configured_min,
         ansible_memtotal_mb=ansible_memtotal_mb,
     )
 
@@ -50,39 +60,56 @@ def test_succeeds_with_recommended_memory(group_names, ansible_memtotal_mb):
     assert not result.get('failed', False)
 
 
-@pytest.mark.parametrize('group_names,ansible_memtotal_mb,extra_words', [
+@pytest.mark.parametrize('group_names,configured_min,ansible_memtotal_mb,extra_words', [
     (
         ['masters'],
         0,
+        0,
         ['0.0 GB'],
     ),
     (
         ['nodes'],
+        0,
         100,
         ['0.1 GB'],
     ),
     (
+        ['nodes'],
+        24,  # configure higher threshold
+        20000,  # enough to meet recommended but not configured
+        ['20.0 GB'],
+    ),
+    (
         ['etcd'],
-        -1,
-        ['0.0 GB'],
+        0,
+        7000,
+        ['7.0 GB'],
+    ),
+    (
+        ['etcd', 'masters'],
+        0,
+        9000,  # enough memory for etcd, not enough for a master
+        ['9.0 GB'],
     ),
     (
         ['nodes', 'masters'],
+        0,
         # enough memory for a node, not enough for a master
         11000,
         ['11.0 GB'],
     ),
 ])
-def test_fails_with_insufficient_memory(group_names, ansible_memtotal_mb, extra_words):
+def test_fails_with_insufficient_memory(group_names, configured_min, ansible_memtotal_mb, extra_words):
     task_vars = dict(
         group_names=group_names,
+        openshift_check_min_host_memory_gb=configured_min,
         ansible_memtotal_mb=ansible_memtotal_mb,
     )
 
     check = MemoryAvailability(execute_module=fake_execute_module)
     result = check.run(tmp=None, task_vars=task_vars)
 
-    assert result['failed']
+    assert result.get('failed', False)
     for word in 'below recommended'.split() + extra_words:
         assert word in result['msg']