Browse Source

Merge pull request #8522 from ewolinetz/logging_handler_skip_for_scaleup

Updating logic when we are scaling up to skip health checks
OpenShift Merge Robot 6 years ago
parent
commit
924b90d16d

+ 10 - 10
roles/openshift_logging_elasticsearch/tasks/full_cluster_restart.yml

@@ -145,18 +145,18 @@
     -c elasticsearch
     -n {{ openshift_logging_elasticsearch_namespace }}
     -- es_cluster_health
-  register: _pod_status
+  register: _cluster_status
   until:
-  - "_pod_status.stdout != ''"
-  - (_pod_status.stdout | from_json)['status'] in ['yellow', 'green']
+  - "_cluster_status.stdout != ''"
+  - (_cluster_status.stdout | from_json)['status'] in ['yellow', 'green']
   retries: "{{ __elasticsearch_ready_retries }}"
   delay: 30
   changed_when: false
   failed_when: false
 
 - when:
-  - _pod_status.failed is defined
-  - _pod_status.failed
+  - _cluster_status.failed is defined
+  - _cluster_status.failed
   run_once: true
   set_stats:
     data:
@@ -185,18 +185,18 @@
     -c elasticsearch
     -n {{ openshift_logging_elasticsearch_namespace }}
     -- es_cluster_health
-  register: _pod_status
+  register: _cluster_status
   until:
-  - "_pod_status.stdout != ''"
-  - (_pod_status.stdout | from_json)['status'] in ['green']
+  - "_cluster_status.stdout != ''"
+  - (_cluster_status.stdout | from_json)['status'] in ['green']
   retries: "{{ __elasticsearch_ready_retries }}"
   delay: 30
   changed_when: false
   failed_when: false
 
 - when:
-  - _pod_status.failed is defined
-  - _pod_status.failed
+  - _cluster_status.failed is defined
+  - _cluster_status.failed
   run_once: true
   set_stats:
     data:

+ 13 - 11
roles/openshift_logging_elasticsearch/tasks/restart_es_node.yml

@@ -1,6 +1,7 @@
 ---
 # we want to ignore if this fails because it is possible to fail on the first node when installing -- should we not do this if we're skipping health checks? -- is this required for cluster sanity?
-- name: "Disable shard balancing for logging-{{ _cluster_component }} cluster"
+- when: not _skip_healthcheck | bool
+  name: "Disable shard balancing for logging-{{ _cluster_component }} cluster"
   command: >
     curl -s -k
     --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
@@ -20,8 +21,8 @@
     rollout latest {{ _es_node }}
     -n {{ openshift_logging_elasticsearch_namespace }}
 
-# always wait for this to scale up
-- name: "Waiting for {{ _es_node }} to finish scaling up"
+- when: not _skip_healthcheck | bool
+  name: "Waiting for {{ _es_node }} to finish scaling up"
   oc_obj:
     state: list
     name: "{{ _es_node }}"
@@ -47,7 +48,8 @@
       installer_phase_logging:
         message: "Node {{ _es_node}} in cluster logging-{{ _cluster_component }} was unable to rollout. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
 
-- name: "Enable shard balancing for logging-{{ _cluster_component }} cluster"
+- when: not _skip_healthcheck | bool
+  name: "Enable shard balancing for logging-{{ _cluster_component }} cluster"
   command: >
     curl -s -k
     --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
@@ -72,19 +74,19 @@
     --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
     --key {{ _logging_handler_tempdir.stdout }}/admin-key
     https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/health?pretty
-  register: _pod_status
+  register: _cluster_status
   until:
-    - "_pod_status.stdout != ''"
-    - (_pod_status.stdout | from_json)['status'] in ['green']
+    - "_cluster_status.stdout != ''"
+    - (_cluster_status.stdout | from_json)['status'] in ['green']
   retries: "{{ __elasticsearch_ready_retries }}"
   delay: 30
   changed_when: false
   failed_when: false
 
-# evaluate RC for _pod_status
+# evaluate RC for _cluster_status
 - when:
-    - _pod_status.failed is defined
-    - _pod_status.failed
+    - _cluster_status.failed is defined
+    - _cluster_status.failed
   run_once: true
   set_stats:
     data:
@@ -93,5 +95,5 @@
 
 - name: Evaluating cluster health
   assert:
-    that: _pod_status.failed is undefined or not _pod_status.failed
+    that: _cluster_status.failed is undefined or not _cluster_status.failed
     msg: "Cluster logging-{{ _cluster_component }} was unable to recover to a green state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."

+ 52 - 0
roles/openshift_logging_elasticsearch/tasks/rolling_cluster_restart.yml

@@ -19,8 +19,60 @@
   - (_flush_output.stdout | from_json)['_shards']['successful'] > 0
   failed_when: false
 
+# if we are skipping the health check, then we should only disable and enable shard allocation once for the cluster
+- when: _skip_healthcheck | bool
+  name: "Disable shard balancing for logging-{{ _cluster_component }} cluster"
+  command: >
+    curl -s -k
+    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
+    --key {{ _logging_handler_tempdir.stdout }}/admin-key
+    -XPUT 'https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/settings'
+    -d '{ "transient": { "cluster.routing.allocation.enable" : "none" } }'
+  register: _cluster_disable_output
+  changed_when:
+  - "_cluster_disable_output.stdout != ''"
+  - (_cluster_disable_output.stdout | from_json)['acknowledged'] | bool
+  failed_when: false
+
 # Loop over each DC for restart_es_node.yml
 - include_tasks: restart_es_node.yml
   with_items: "{{ logging_restart_cluster_dcs }}"
   loop_control:
     loop_var: _es_node
+
+# if we are skipping the health check, wait once for the cluster to respond and re-enable shard allocation only if we disabled it above
+- when:
+  - _skip_healthcheck | bool
+  - "_cluster_disable_output.stdout != ''"
+  - (_cluster_disable_output.stdout | from_json)['acknowledged'] | bool
+  name: "Waiting for ES cluster logging{{ _cluster_component }} to be up"
+  command: >
+    curl -s -k
+    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
+    --key {{ _logging_handler_tempdir.stdout }}/admin-key
+    --max-time 30
+    -o /dev/null
+    -w '%{response_code}'
+    https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/
+  register: _cluster_status
+  until: "_cluster_status.stdout == '200'"
+  retries: "{{ __elasticsearch_ready_retries }}"
+  delay: 30
+  changed_when: false
+  failed_when: false
+
+- when:
+  - _skip_healthcheck | bool
+  - "_cluster_disable_output.stdout != ''"
+  - (_cluster_disable_output.stdout | from_json)['acknowledged'] | bool
+  name: "Enable shard balancing for logging-{{ _cluster_component }} cluster"
+  command: >
+    curl -s -k
+    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
+    --key {{ _logging_handler_tempdir.stdout }}/admin-key
+    -XPUT 'https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/settings'
+    -d '{ "transient": { "cluster.routing.allocation.enable" : "all" } }'
+  register: _cluster_enable_output
+  changed_when:
+  - "_cluster_enable_output.stdout != ''"
+  - (_cluster_enable_output.stdout | from_json)['acknowledged'] | bool