
Updating process for doing rolling and full cluster upgrades

Eric Wolinetz · 6 years ago · commit 770bc1d3d9

+ 9 - 1
roles/openshift_logging/tasks/main.yaml

@@ -97,7 +97,15 @@
   - not openshift_logging_install_logging | default(false) | bool
 
 - name: Cleaning up local temp dir
-  local_action: file path="{{local_tmp.stdout}}" state=absent
+  local_action: file path="{{ local_tmp.stdout }}" state=absent
+  tags: logging_cleanup
+  changed_when: False
+  become: false
+
+- name: Cleaning up temp dir
+  file:
+    path: "{{ mktemp.stdout }}"
+    state: absent
   tags: logging_cleanup
   changed_when: False
   become: false

+ 219 - 0
roles/openshift_logging_elasticsearch/tasks/full_cluster_restart.yml

@@ -0,0 +1,219 @@
+---
+# Disable external communication for {{ _cluster_component }}
+- name: Disable external communication for logging-{{ _cluster_component }}
+  oc_service:
+    state: present
+    name: "logging-{{ _cluster_component }}"
+    namespace: "{{ openshift_logging_elasticsearch_namespace }}"
+    selector:
+      component: "{{ _cluster_component }}"
+      provider: openshift
+      connection: blocked
+    labels:
+      logging-infra: 'support'
+    ports:
+    - port: 9200
+      targetPort: "restapi"
+
+- command: >
+    {{ openshift_client_binary }}
+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+    get pod
+    -l component={{ _cluster_component }},provider=openshift
+    -n {{ openshift_logging_elasticsearch_namespace }}
+    -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name}
+  register: _cluster_pods
+
+- name: "Disable shard balancing for logging-{{ _cluster_component }} cluster"
+  command: >
+    {{ openshift_client_binary }}
+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+    exec {{ _cluster_pods.stdout.split(' ')[0] }}
+    -c elasticsearch
+    -n {{ openshift_logging_elasticsearch_namespace }}
+    -- es_util --query=_cluster/settings -XPUT -d '{ "transient": { "cluster.routing.allocation.enable" : "none" } }'
+  register: _disable_output
+  changed_when:
+  - "_disable_output.stdout != ''"
+  - (_disable_output.stdout | from_json)['acknowledged'] | bool
+
+# Flush ES
+# This is documented as a best effort; if it fails, we are okay with that
+- name: "Flushing for logging-{{ _cluster_component }} cluster"
+  command: >
+    {{ openshift_client_binary }}
+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+    exec {{ _cluster_pods.stdout.split(' ')[0] }}
+    -c elasticsearch
+    -n {{ openshift_logging_elasticsearch_namespace }}
+    -- es_util --query=_flush/synced -XPOST
+  register: _flush_output
+  changed_when:
+  - "_flush_output.stdout != ''"
+  - (_flush_output.stdout | from_json)['_shards']['successful'] > 0
+  failed_when: false
+
+# Stop all nodes, then rollout all nodes
+- name: Ready all nodes for scale down
+  shell: >
+    {{ openshift_client_binary }}
+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+    -n {{ openshift_logging_elasticsearch_namespace }}
+    annotate "dc/{{ _es_node }}"
+    prior-replica-count=$({{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n {{ openshift_logging_elasticsearch_namespace }} get "dc/{{ _es_node }}" -o jsonpath='{.spec.replicas}')
+    --overwrite
+  with_items: "{{ logging_restart_cluster_dcs }}"
+  loop_control:
+    loop_var: _es_node
+
+- name: Scale down all nodes
+  oc_scale:
+    namespace: "{{ openshift_logging_elasticsearch_namespace }}"
+    kind: dc
+    name: "{{ _es_node }}"
+    replicas: 0
+  with_items: "{{ logging_restart_cluster_dcs }}"
+  loop_control:
+    loop_var: _es_node
+
+- name: Rollout all updated DCs
+  command: >
+    {{ openshift_client_binary }}
+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+    rollout latest {{ _es_node }}
+    -n {{ openshift_logging_elasticsearch_namespace }}
+  with_items: "{{ logging_restart_cluster_dcs }}"
+  loop_control:
+    loop_var: _es_node
+
+- name: Scale up all nodes to previous replicas
+  shell: >
+    {{ openshift_client_binary }}
+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+    -n {{ openshift_logging_elasticsearch_namespace }}
+    scale "dc/{{ _es_node }}"
+    --replicas=$({{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n {{ openshift_logging_elasticsearch_namespace }} get "dc/{{ _es_node }}" -o jsonpath='{.metadata.annotations.prior-replica-count}')
+  with_items: "{{ logging_restart_cluster_dcs }}"
+  loop_control:
+    loop_var: _es_node
+
+# Wait for all nodes to be deployed/ready again
+- name: "Waiting for {{ _es_node }} to finish scaling up"
+  oc_obj:
+    state: list
+    name: "{{ _es_node }}"
+    namespace: "{{ openshift_logging_elasticsearch_namespace }}"
+    kind: dc
+  register: _dc_output
+  until:
+  - _dc_output.results.results[0].status is defined
+  - _dc_output.results.results[0].status.readyReplicas is defined
+  - _dc_output.results.results[0].status.readyReplicas > 0
+  - _dc_output.results.results[0].status.updatedReplicas is defined
+  - _dc_output.results.results[0].status.updatedReplicas > 0
+  retries: 60
+  delay: 30
+  with_items: "{{ logging_restart_cluster_dcs }}"
+  loop_control:
+    loop_var: _es_node
+  failed_when: false
+
+- when:
+  - _dc_output.failed is defined
+  - _dc_output.failed
+  name: Manual intervention required
+  run_once: true
+  set_stats:
+    data:
+      installer_phase_logging:
+        message: "Node in cluster logging-{{ _cluster_component }} was unable to rollout. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
+
+- command: >
+    {{ openshift_client_binary }}
+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+    get pod
+    -l component={{ _cluster_component }},provider=openshift
+    -n {{ openshift_logging_elasticsearch_namespace }}
+    -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name}
+  register: _cluster_pods
+
+- name: Wait for cluster to be in at least yellow state
+  command: >
+    {{ openshift_client_binary }}
+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+    exec {{ _cluster_pods.stdout.split(' ')[0] }}
+    -c elasticsearch
+    -n {{ openshift_logging_elasticsearch_namespace }}
+    -- es_cluster_health
+  register: _pod_status
+  until:
+  - "_pod_status.stdout != ''"
+  - (_pod_status.stdout | from_json)['status'] in ['yellow', 'green']
+  retries: "{{ __elasticsearch_ready_retries }}"
+  delay: 30
+  changed_when: false
+  failed_when: false
+
+- when:
+  - _pod_status.failed is defined
+  - _pod_status.failed
+  run_once: true
+  set_stats:
+    data:
+      installer_phase_logging:
+        message: "Cluster logging-{{ _cluster_component }} was unable to recover to at least a yellow state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
+
+- name: "Enable shard balancing for logging-{{ _cluster_component }} cluster"
+  command: >
+    {{ openshift_client_binary }}
+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+    exec {{ _cluster_pods.stdout.split(' ')[0] }}
+    -c elasticsearch
+    -n {{ openshift_logging_elasticsearch_namespace }}
+    -- es_util --query=_cluster/settings -XPUT -d '{ "transient": { "cluster.routing.allocation.enable" : "all" } }'
+  register: _enable_output
+  changed_when:
+  - "_enable_output.stdout != ''"
+  - (_enable_output.stdout | from_json)['acknowledged'] | bool
+
+# Always skip the healthcheck for a full cluster restart since it could take a long time to recover?
+- name: "Waiting for ES node {{ _es_node }} health to be in ['green']"
+  command: >
+    {{ openshift_client_binary }}
+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+    exec {{ _cluster_pods.stdout.split(' ')[0] }}
+    -c elasticsearch
+    -n {{ openshift_logging_elasticsearch_namespace }}
+    -- es_cluster_health
+  register: _pod_status
+  until:
+  - "_pod_status.stdout != ''"
+  - (_pod_status.stdout | from_json)['status'] in ['green']
+  retries: "{{ __elasticsearch_ready_retries }}"
+  delay: 30
+  changed_when: false
+  failed_when: false
+
+- when:
+  - _pod_status.failed is defined
+  - _pod_status.failed
+  run_once: true
+  set_stats:
+    data:
+      installer_phase_logging:
+        message: "Cluster logging-{{ _cluster_component }} was unable to recover to a green state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
+
+# Reenable external communication for {{ _cluster_component }}
+- name: Reenable external communication for logging-{{ _cluster_component }}
+  oc_service:
+    state: present
+    name: "logging-{{ _cluster_component }}"
+    namespace: "{{ openshift_logging_elasticsearch_namespace }}"
+    selector:
+      component: "{{ _cluster_component }}"
+      provider: openshift
+    labels:
+      logging-infra: 'support'
+    ports:
+    - port: 9200
+      targetPort: "restapi"

+ 53 - 90
roles/openshift_logging_elasticsearch/tasks/restart_cluster.yml

@@ -1,120 +1,83 @@
 ---
 ## get all pods for the cluster
 - command: >
-    {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig get pod -l component={{ _cluster_component }},provider=openshift -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name}
+    {{ openshift_client_binary }}
+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+    get pod
+    -l component={{ _cluster_component }},provider=openshift
+    -n {{ openshift_logging_elasticsearch_namespace }}
+    -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name}
   register: _cluster_pods
 
-### Check for cluster state before making changes -- if its red then we don't want to continue
+# make a temp dir for admin certs
+- command: mktemp -d /tmp/openshift-logging-ansible-XXXXXX
+  register: _logging_handler_tempdir
+  changed_when: False
+  check_mode: no
+
+- name: Exporting secrets to use when communicating with the ES cluster
+  command: >
+    {{ openshift_client_binary }}
+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+    extract secret/logging-elasticsearch
+    -n {{ openshift_logging_elasticsearch_namespace }}
+    --keys=admin-cert --keys=admin-key
+    --to={{ _logging_handler_tempdir.stdout }}
+
+### Check the cluster state before making changes -- if it's red, yellow, or missing nodes then we don't want to continue
 - name: "Checking current health for {{ _es_node }} cluster"
-  shell: >
-    {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig exec "{{ _cluster_pods.stdout.split(' ')[0] }}" -c elasticsearch -n "{{ openshift_logging_elasticsearch_namespace }}" -- es_cluster_health
+  command: >
+    curl -s -k
+    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
+    --key {{ _logging_handler_tempdir.stdout }}/admin-key
+    https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/health?pretty
   register: _pod_status
   when: _cluster_pods.stdout_lines | count > 0
 
 - when:
   - _pod_status.stdout is defined
-  - (_pod_status.stdout | from_json)['status'] in ['red']
+  - (_pod_status.stdout | from_json)['status'] in ['yellow', 'red'] or (_pod_status.stdout | from_json)['number_of_nodes'] != _cluster_pods.stdout_lines | count
   block:
   - name: Set Logging message to manually restart
     run_once: true
     set_stats:
       data:
         installer_phase_logging:
-          message: "Cluster logging-{{ _cluster_component }} was in a red state and will not be automatically restarted. Please see documentation regarding doing a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
+          message: "Cluster logging-{{ _cluster_component }} was not in an optimal state and will not be automatically restarted. Please see documentation regarding doing a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
 
-  - debug: msg="Cluster logging-{{ _cluster_component }} was in a red state and will not be automatically restarted. Please see documentation regarding doing a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
+  - debug: msg="Cluster logging-{{ _cluster_component }} was not in an optimal state and will not be automatically restarted. Please see documentation regarding doing a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
 
-- when: _pod_status.stdout is undefined or (_pod_status.stdout | from_json)['status'] in ['green', 'yellow']
+- when: _pod_status.stdout is undefined or ( (_pod_status.stdout | from_json)['status'] in ['green'] and (_pod_status.stdout | from_json)['number_of_nodes'] == _cluster_pods.stdout_lines | count )
   block:
-  # Disable external communication for {{ _cluster_component }}
-  - name: Disable external communication for logging-{{ _cluster_component }}
-    oc_service:
-      state: present
-      name: "logging-{{ _cluster_component }}"
-      namespace: "{{ openshift_logging_elasticsearch_namespace }}"
-      selector:
-        component: "{{ _cluster_component }}"
-        provider: openshift
-        connection: blocked
-      labels:
-        logging-infra: 'support'
-      ports:
-      - port: 9200
-        targetPort: "restapi"
-    when:
-    - full_restart_cluster | bool
-
-  - name: "Disable shard balancing for logging-{{ _cluster_component }} cluster"
-    command: >
-      {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig exec {{ _cluster_pods.stdout.split(' ')[0] }} -c elasticsearch -n {{ openshift_logging_elasticsearch_namespace }} -- {{ __es_local_curl }} -XPUT 'https://localhost:9200/_cluster/settings' -d '{ "transient": { "cluster.routing.allocation.enable" : "none" } }'
-    register: _disable_output
-    changed_when: "'\"acknowledged\":true' in _disable_output.stdout"
-    when: _cluster_pods.stdout_lines | count > 0
-
-  # Flush ES
-  - name: "Flushing for logging-{{ _cluster_component }} cluster"
-    command: >
-      {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig exec {{ _cluster_pods.stdout.split(' ')[0] }} -c elasticsearch -n {{ openshift_logging_elasticsearch_namespace }} -- {{ __es_local_curl }} -XPUT 'https://localhost:9200/_flush/synced'
-    register: _flush_output
-    changed_when: "'\"acknowledged\":true' in _flush_output.stdout"
-    when:
-    - _cluster_pods.stdout_lines | count > 0
-    - full_restart_cluster | bool
-
   - command: >
-      {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig get dc -l component={{ _cluster_component }},provider=openshift -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[*].metadata.name}
+      {{ openshift_client_binary }}
+      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+      get dc
+      -l component={{ _cluster_component }},provider=openshift
+      -n {{ openshift_logging_elasticsearch_namespace }}
+      -o jsonpath={.items[*].metadata.name}
     register: _cluster_dcs
 
-  # If we are currently restarting the "es" cluster we want to check if we are scaling up the number of es nodes
-  # If we are currently restarting the "es-ops" cluster we want to check if we are scaling up the number of ops nodes
-  # If we've created a new node for that cluster then the appropriate variable will be true, otherwise we default to false
-  - set_fact:
-      _skip_healthcheck: "{{ ( __logging_scale_up | default(false) ) if _cluster_component == 'es' else ( __logging_ops_scale_up | default(false) ) }}"
-
   ## restart all dcs for full restart
-  - name: "Restart ES node {{ _es_node }}"
-    include_tasks: restart_es_node.yml
-    with_items: "{{ _cluster_dcs.stdout_lines }}"
-    loop_control:
-      loop_var: _es_node
+  - name: "Performing full cluster restart for {{ _cluster_component }} cluster"
+    include_tasks: full_cluster_restart.yml
+    vars:
+      logging_restart_cluster_dcs: "{{ _cluster_dcs.stdout_lines }}"
     when:
     - full_restart_cluster | bool
 
-  ## restart the node if it's dc is in the list of nodes to restart?
-  - name: "Restart ES node {{ _es_node }}"
-    include_tasks: restart_es_node.yml
-    with_items: "{{ _restart_logging_nodes }}"
-    loop_control:
-      loop_var: _es_node
+  ## restart the node if its dc is in the list of nodes to restart
+  - name: "Performing rolling cluster restart for {{ _cluster_component }} cluster"
+    include_tasks: rolling_cluster_restart.yml
+    vars:
+      logging_restart_cluster_dcs: "{{ _restart_logging_nodes | intersect(_cluster_dcs.stdout) }}"
     when:
     - not full_restart_cluster | bool
-    - _es_node in _cluster_dcs.stdout
-
-  ## we may need a new first pod to run against -- fetch them all again
-  - command: >
-      {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig get pod -l component={{ _cluster_component }},provider=openshift -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name}
-    register: _cluster_pods
 
-  - name: "Enable shard balancing for logging-{{ _cluster_component }} cluster"
-    command: >
-      {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig exec {{ _cluster_pods.stdout.split(' ')[0] }} -c elasticsearch -n {{ openshift_logging_elasticsearch_namespace }} -- {{ __es_local_curl }} -XPUT 'https://localhost:9200/_cluster/settings' -d '{ "transient": { "cluster.routing.allocation.enable" : "all" } }'
-    register: _enable_output
-    changed_when: "'\"acknowledged\":true' in _enable_output.stdout"
-    when: _cluster_pods.stdout != ""
-
-  # Reenable external communication for {{ _cluster_component }}
-  - name: Reenable external communication for logging-{{ _cluster_component }}
-    oc_service:
-      state: present
-      name: "logging-{{ _cluster_component }}"
-      namespace: "{{ openshift_logging_elasticsearch_namespace }}"
-      selector:
-        component: "{{ _cluster_component }}"
-        provider: openshift
-      labels:
-        logging-infra: 'support'
-      ports:
-      - port: 9200
-        targetPort: "restapi"
-    when:
-    - full_restart_cluster | bool
+# remove temp dir
+- name: Cleaning up local temp dir
+  file:
+    path: "{{ _logging_handler_tempdir.stdout }}"
+    state: absent
+  changed_when: False
+  become: false
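
As a usage sketch of the new pre-check, the same probe can be run by hand from the control host; the namespace, config path, and temp dir below are illustrative, and the restart only proceeds when status is "green" and number_of_nodes equals the count of Running pods:

  # pull the admin client cert/key the playbook extracts into its temp dir
  oc --config=/etc/origin/master/admin.kubeconfig \
    extract secret/logging-elasticsearch -n logging \
    --keys=admin-cert --keys=admin-key --to=/tmp/es-certs

  # query cluster health through the logging-es service
  curl -s -k --cert /tmp/es-certs/admin-cert --key /tmp/es-certs/admin-key \
    https://logging-es.logging.svc:9200/_cluster/health?pretty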

+ 71 - 15
roles/openshift_logging_elasticsearch/tasks/restart_es_node.yml

@@ -1,10 +1,27 @@
 ---
+# we want to ignore a failure here because it is possible to fail on the first node when installing -- should we skip this if we're skipping health checks? -- is this required for cluster sanity?
+- name: "Disable shard balancing for logging-{{ _cluster_component }} cluster"
+  command: >
+    curl -s -k
+    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
+    --key {{ _logging_handler_tempdir.stdout }}/admin-key
+    -XPUT 'https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/settings'
+    -d '{ "transient": { "cluster.routing.allocation.enable" : "none" } }'
+  register: _disable_output
+  changed_when:
+    - "_disable_output.stdout != ''"
+    - (_disable_output.stdout | from_json)['acknowledged'] | bool
+  failed_when: false
+
 - name: "Rolling out new pod(s) for {{ _es_node }}"
   command: >
-    {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig rollout latest {{ _es_node }} -n {{ openshift_logging_elasticsearch_namespace }}
+    {{ openshift_client_binary }}
+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+    rollout latest {{ _es_node }}
+    -n {{ openshift_logging_elasticsearch_namespace }}
 
-- when: not _skip_healthcheck | bool
-  name: "Waiting for {{ _es_node }} to finish scaling up"
+# always wait for this to scale up
+- name: "Waiting for {{ _es_node }} to finish scaling up"
   oc_obj:
     state: list
     name: "{{ _es_node }}"
@@ -19,23 +36,62 @@
     - _dc_output.results.results[0].status.updatedReplicas > 0
   retries: 60
   delay: 30
+  failed_when: false
 
-- when: not _skip_healthcheck | bool
-  name: Gettings name(s) of replica pod(s)
+- when:
+    - _dc_output.failed is defined
+    - _dc_output.failed
+  run_once: true
+  set_stats:
+    data:
+      installer_phase_logging:
+        message: "Node {{ _es_node}} in cluster logging-{{ _cluster_component }} was unable to rollout. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
+
+- name: "Enable shard balancing for logging-{{ _cluster_component }} cluster"
   command: >
-    {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig get pods -l deploymentconfig={{ _es_node }} -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[*].metadata.name}
-  register: _pods
-  changed_when: false
+    curl -s -k
+    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
+    --key {{ _logging_handler_tempdir.stdout }}/admin-key
+    -XPUT 'https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/settings'
+    -d '{ "transient": { "cluster.routing.allocation.enable" : "all" } }'
+  register: _enable_output
+  changed_when:
+    - "_enable_output.stdout != ''"
+    - (_enable_output.stdout | from_json)['acknowledged'] | bool
+
+# evaluate the RC for _dc_output
+- name: Evaluating status of rolled out pod
+  assert:
+    that: not _dc_output.failed
+    msg: "Node {{ _es_node}} in cluster logging-{{ _cluster_component }} was unable to rollout. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
 
 - when: not _skip_healthcheck | bool
-  name: "Waiting for ES node {{ _es_node }} health to be in ['green', 'yellow']"
-  shell: >
-    {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig exec "{{ _pod }}" -c elasticsearch -n "{{ openshift_logging_elasticsearch_namespace }}" -- es_cluster_health
-  with_items: "{{ _pods.stdout.split(' ') }}"
-  loop_control:
-    loop_var: _pod
+  name: "Waiting for ES node {{ _es_node }} health to be in ['green']"
+  command: >
+    curl -s -k
+    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
+    --key {{ _logging_handler_tempdir.stdout }}/admin-key
+    https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/health?pretty
   register: _pod_status
-  until: (_pod_status.stdout | from_json)['status'] in ['green', 'yellow']
+  until:
+    - "_pod_status.stdout != ''"
+    - (_pod_status.stdout | from_json)['status'] in ['green']
   retries: "{{ __elasticsearch_ready_retries }}"
   delay: 30
   changed_when: false
+  failed_when: false
+
+# evaluate RC for _pod_status
+- when:
+    - _pod_status.failed is defined
+    - _pod_status.failed
+  run_once: true
+  set_stats:
+    data:
+      installer_phase_logging:
+        message: "Cluster logging-{{ _cluster_component }} was unable to recover to a green state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
+
+- name: Evaluating cluster health
+  assert:
+    that: not _pod_status.failed
+    msg: "Cluster logging-{{ _cluster_component }} was unable to recover to a green state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."

+ 26 - 0
roles/openshift_logging_elasticsearch/tasks/rolling_cluster_restart.yml

@@ -0,0 +1,26 @@
+---
+# If we are currently restarting the "es" cluster we want to check if we are scaling up the number of es nodes
+# If we are currently restarting the "es-ops" cluster we want to check if we are scaling up the number of ops nodes
+# If we've created a new node for that cluster then the appropriate variable will be true, otherwise we default to false
+- set_fact:
+    _skip_healthcheck: "{{ ( __logging_scale_up | default(false) ) if _cluster_component == 'es' else ( __logging_ops_scale_up | default(false) ) }}"
+
+# Flush ES
+# It is possible for this to fail on a brand new cluster, so do not fail the play in that case
+- name: "Flushing for logging-{{ _cluster_component }} cluster"
+  command: >
+    curl -s -k
+    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
+    --key {{ _logging_handler_tempdir.stdout }}/admin-key
+    -XPOST 'https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_flush/synced'
+  register: _flush_output
+  changed_when:
+  - "_flush_output.stdout != ''"
+  - (_flush_output.stdout | from_json)['_shards']['successful'] > 0
+  failed_when: false
+
+# Loop over each DC for restart_es_node.yml
+- include_tasks: restart_es_node.yml
+  with_items: "{{ logging_restart_cluster_dcs }}"
+  loop_control:
+    loop_var: _es_node

+ 1 - 1
roles/openshift_logging_elasticsearch/vars/main.yml

@@ -5,7 +5,7 @@ __kibana_index_modes: ["unique", "shared_ops"]
 
 __es_local_curl: "curl -s --cacert /etc/elasticsearch/secret/admin-ca --cert /etc/elasticsearch/secret/admin-cert --key /etc/elasticsearch/secret/admin-key"
 
-__elasticsearch_ready_retries: "{{ openshift_logging_elasticsearch_poll_timeout_minutes | default(20) | int * 2 }}"
+__elasticsearch_ready_retries: "{{ openshift_logging_elasticsearch_poll_timeout_minutes | default(60) | int * 2 }}"
 
 # TODO: integrate these
 es_node_quorum: "{{ openshift_logging_elasticsearch_replica_count | int/2 + 1 }}"
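
As a worked example of the retry arithmetic: the health-check tasks poll with delay: 30, so two retries equal one minute and the variable reads directly in minutes.

  retries  = poll_timeout_minutes * 2 = 60 * 2 = 120          (new default)
  max wait = retries * delay = 120 * 30 s = 3600 s = 60 minutes

Under the old default of 20 minutes, the same tasks would give up after 40 retries.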