7 years ago · b033df64d0
--- a/roles/openshift_logging/tasks/main.yaml
+++ b/roles/openshift_logging/tasks/main.yaml
@@ -97,7 +97,15 @@
 
																   - not openshift_logging_install_logging | default(false) | bool
															
 
																 - name: Cleaning up local temp dir
															
 
																-  local_action: file path="{{local_tmp.stdout}}" state=absent
															
 
																+  local_action: file path="{{ local_tmp.stdout }}" state=absent
															
 
																+  tags: logging_cleanup
															
 
																+  changed_when: False
															
 
																+  become: false
															
 
																+
															
 
																+- name: Cleaning up temp dir
															
 
																+  file:
															
 
																+    path: "{{ mktemp.stdout }}"
															
 
																+    state: absent
															
 
																   tags: logging_cleanup
															
 
																   changed_when: False
															
 
																   become: false
															
--- a/roles/openshift_logging_elasticsearch/tasks/full_cluster_restart.yml
+++ b/roles/openshift_logging_elasticsearch/tasks/full_cluster_restart.yml
@@ -0,0 +1,219 @@
 
																+---
															
 
																+# Disable external communication for {{ _cluster_component }}
															
 
																+- name: Disable external communication for logging-{{ _cluster_component }}
															
 
																+  oc_service:
															
 
																+    state: present
															
 
																+    name: "logging-{{ _cluster_component }}"
															
 
																+    namespace: "{{ openshift_logging_elasticsearch_namespace }}"
															
 
																+    selector:
															
 
																+      component: "{{ _cluster_component }}"
															
 
																+      provider: openshift
															
 
																+      connection: blocked
															
 
																+    labels:
															
 
																+      logging-infra: 'support'
															
 
																+    ports:
															
 
																+    - port: 9200
															
 
																+      targetPort: "restapi"
															
 
																+
															
 
																+- command: >
															
 
																+    {{ openshift_client_binary }}
															
 
																+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
															
 
																+    get pod
															
 
																+    -l component={{ _cluster_component }},provider=openshift
															
 
																+    -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																+    -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name}
															
 
																+  register: _cluster_pods
															
 
																+
															
 
																+- name: "Disable shard balancing for logging-{{ _cluster_component }} cluster"
															
 
																+  command: >
															
 
																+    {{ openshift_client_binary }}
															
 
																+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
															
 
																+    exec {{ _cluster_pods.stdout.split(' ')[0] }}
															
 
																+    -c elasticsearch
															
 
																+    -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																+    -- es_util --query=_cluster/settings -XPUT -d '{ "transient": { "cluster.routing.allocation.enable" : "none" } }'
															
 
																+  register: _disable_output
															
 
																+  changed_when:
															
 
																+  - "_disable_output.stdout != ''"
															
 
																+  - (_disable_output.stdout | from_json)['acknowledged'] | bool
															
 
																+
															
 
																+# Flush ES
															
 
																+# This is documented as a best effort, if it fails, we are okay with that
															
 
																+- name: "Flushing for logging-{{ _cluster_component }} cluster"
															
 
																+  command: >
															
 
																+    {{ openshift_client_binary }}
															
 
																+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
															
 
																+    exec {{ _cluster_pods.stdout.split(' ')[0] }}
															
 
																+    -c elasticsearch
															
 
																+    -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																+    -- es_util --query=_flush/synced -XPOST
															
 
																+  register: _flush_output
															
 
																+  changed_when:
															
 
																+  - "_flush_output.stdout != ''"
															
 
																+  - (_flush_output.stdout | from_json)['_shards']['successful'] > 0
															
 
																+  failed_when: false
															
 
																+
															
 
																+# Stop all nodes, then rollout all nodes
															
 
																+- name: Ready all nodes for scale down
															
 
																+  shell: >
															
 
																+    {{ openshift_client_binary }}
															
 
																+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
															
 
																+    -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																+    annotate "dc/{{ _es_node }}"
															
 
																+    prior-replica-count=$({{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n {{ openshift_logging_elasticsearch_namespace }} get "dc/{{ _es_node }}" -o jsonpath='{.spec.replicas}')
															
 
																+    --overwrite
															
 
																+  with_items: "{{ logging_restart_cluster_dcs }}"
															
 
																+  loop_control:
															
 
																+    loop_var: _es_node
															
 
																+
															
 
																+- name: Scale down all nodes
															
 
																+  oc_scale:
															
 
																+    namespace: "{{ openshift_logging_elasticsearch_namespace }}"
															
 
																+    kind: dc
															
 
																+    name: "{{ _es_node }}"
															
 
																+    replicas: 0
															
 
																+  with_items: "{{ logging_restart_cluster_dcs }}"
															
 
																+  loop_control:
															
 
																+    loop_var: _es_node
															
 
																+
															
 
																+- name: Rollout all updated DCs
															
 
																+  command: >
															
 
																+    {{ openshift_client_binary }}
															
 
																+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
															
 
																+    rollout latest {{ _es_node }}
															
 
																+    -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																+  with_items: "{{ logging_restart_cluster_dcs }}"
															
 
																+  loop_control:
															
 
																+    loop_var: _es_node
															
 
																+
															
 
																+- name: Scale up all nodes to previous replicas
															
 
																+  shell: >
															
 
																+    {{ openshift_client_binary }}
															
 
																+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
															
 
																+    -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																+    scale "dc/{{ _es_node }}"
															
 
																+    --replicas=$({{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n {{ openshift_logging_elasticsearch_namespace }} get "dc/{{ _es_node }}" -o jsonpath='{.metadata.annotations.prior-replica-count}')
															
 
																+  with_items: "{{ logging_restart_cluster_dcs }}"
															
 
																+  loop_control:
															
 
																+    loop_var: _es_node
															
 
																+
															
 
																+# Wait for all nodes to be deployed/ready again
															
 
																+- name: "Waiting for {{ _es_node }} to finish scaling up"
															
 
																+  oc_obj:
															
 
																+    state: list
															
 
																+    name: "{{ _es_node }}"
															
 
																+    namespace: "{{ openshift_logging_elasticsearch_namespace }}"
															
 
																+    kind: dc
															
 
																+  register: _dc_output
															
 
																+  until:
															
 
																+  - _dc_output.results.results[0].status is defined
															
 
																+  - _dc_output.results.results[0].status.readyReplicas is defined
															
 
																+  - _dc_output.results.results[0].status.readyReplicas > 0
															
 
																+  - _dc_output.results.results[0].status.updatedReplicas is defined
															
 
																+  - _dc_output.results.results[0].status.updatedReplicas > 0
															
 
																+  retries: 60
															
 
																+  delay: 30
															
 
																+  with_items: "{{ logging_restart_cluster_dcs }}"
															
 
																+  loop_control:
															
 
																+    loop_var: _es_node
															
 
																+  failed_when: false
															
 
																+
															
 
																+- when:
															
 
																+  - _dc_output.failed is defined
															
 
																+  - _dc_output.failed
															
 
																+  name: Manual intervention required
															
 
																+  run_once: true
															
 
																+  set_stats:
															
 
																+    data:
															
 
																+      installer_phase_logging:
															
 
																+        message: "Node in cluster logging-{{ _cluster_component }} was unable to rollout. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
															
 
																+
															
 
																+- command: >
															
 
																+    {{ openshift_client_binary }}
															
 
																+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
															
 
																+    get pod
															
 
																+    -l component={{ _cluster_component }},provider=openshift
															
 
																+    -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																+    -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name}
															
 
																+  register: _cluster_pods
															
 
																+
															
 
																+- name: Wait for cluster to be in at least yellow state
															
 
																+  command: >
															
 
																+    {{ openshift_client_binary }}
															
 
																+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
															
 
																+    exec {{ _cluster_pods.stdout.split(' ')[0] }}
															
 
																+    -c elasticsearch
															
 
																+    -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																+    -- es_cluster_health
															
 
																+  register: _pod_status
															
 
																+  until:
															
 
																+  - "_pod_status.stdout != ''"
															
 
																+  - (_pod_status.stdout | from_json)['status'] in ['yellow', 'green']
															
 
																+  retries: "{{ __elasticsearch_ready_retries }}"
															
 
																+  delay: 30
															
 
																+  changed_when: false
															
 
																+  failed_when: false
															
 
																+
															
 
																+- when:
															
 
																+  - _pod_status.failed is defined
															
 
																+  - _pod_status.failed
															
 
																+  run_once: true
															
 
																+  set_stats:
															
 
																+    data:
															
 
																+      installer_phase_logging:
															
 
																+        message: "Cluster logging-{{ _cluster_component }} was unable to recover to at least a yellow state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
															
 
																+
															
 
																+- name: "Enable shard balancing for logging-{{ _cluster_component }} cluster"
															
 
																+  command: >
															
 
																+    {{ openshift_client_binary }}
															
 
																+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
															
 
																+    exec {{ _cluster_pods.stdout.split(' ')[0] }}
															
 
																+    -c elasticsearch
															
 
																+    -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																+    -- es_util --query=_cluster/settings -XPUT -d '{ "transient": { "cluster.routing.allocation.enable" : "all" } }'
															
 
																+  register: _enable_output
															
 
																+  changed_when:
															
 
																+  - "_enable_output.stdout != ''"
															
 
																+  - (_enable_output.stdout | from_json)['acknowledged'] | bool
															
 
																+
															
 
																+# Skip healthcheck for a full cluster restart always since it could take a long time to recover?
															
 
																+- name: "Waiting for ES node {{ _es_node }} health to be in ['green']"
															
 
																+  command: >
															
 
																+    {{ openshift_client_binary }}
															
 
																+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
															
 
																+    exec {{ _cluster_pods.stdout.split(' ')[0] }}
															
 
																+    -c elasticsearch
															
 
																+    -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																+    -- es_cluster_health
															
 
																+  register: _pod_status
															
 
																+  until:
															
 
																+  - "_pod_status.stdout != ''"
															
 
																+  - (_pod_status.stdout | from_json)['status'] in ['green']
															
 
																+  retries: "{{ __elasticsearch_ready_retries }}"
															
 
																+  delay: 30
															
 
																+  changed_when: false
															
 
																+  failed_when: false
															
 
																+
															
 
																+- when:
															
 
																+  - _pod_status.failed is defined
															
 
																+  - _pod_status.failed
															
 
																+  run_once: true
															
 
																+  set_stats:
															
 
																+    data:
															
 
																+      installer_phase_logging:
															
 
																+        message: "Cluster logging-{{ _cluster_component }} was unable to recover to a green state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
															
 
																+
															
 
																+# Reenable external communication for {{ _cluster_component }}
															
 
																+- name: Reenable external communication for logging-{{ _cluster_component }}
															
 
																+  oc_service:
															
 
																+    state: present
															
 
																+    name: "logging-{{ _cluster_component }}"
															
 
																+    namespace: "{{ openshift_logging_elasticsearch_namespace }}"
															
 
																+    selector:
															
 
																+      component: "{{ _cluster_component }}"
															
 
																+      provider: openshift
															
 
																+    labels:
															
 
																+      logging-infra: 'support'
															
 
																+    ports:
															
 
																+    - port: 9200
															
 
																+      targetPort: "restapi"
															
--- a/roles/openshift_logging_elasticsearch/tasks/restart_cluster.yml
+++ b/roles/openshift_logging_elasticsearch/tasks/restart_cluster.yml
@@ -1,120 +1,83 @@
 
																 ---
															
 
																 ## get all pods for the cluster
															
 
																 - command: >
															
 
																-    {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig get pod -l component={{ _cluster_component }},provider=openshift -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name}
															
 
																+    {{ openshift_client_binary }}
															
 
																+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
															
 
																+    get pod
															
 
																+    -l component={{ _cluster_component }},provider=openshift
															
 
																+    -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																+    -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name}
															
 
																   register: _cluster_pods
															
 
																-### Check for cluster state before making changes -- if its red then we don't want to continue
															
 
																+# make a temp dir for admin certs
															
 
																+- command: mktemp -d /tmp/openshift-logging-ansible-XXXXXX
															
 
																+  register: _logging_handler_tempdir
															
 
																+  changed_when: False
															
 
																+  check_mode: no
															
 
																+
															
 
																+- name: Exporting secrets to use communicating with the ES cluster
															
 
																+  command: >
															
 
																+    {{ openshift_client_binary }}
															
 
																+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
															
 
																+    extract secret/logging-elasticsearch
															
 
																+    -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																+    --keys=admin-cert --keys=admin-key
															
 
																+    --to={{ _logging_handler_tempdir.stdout }}
															
 
																+
															
 
																+### Check for cluster state before making changes -- if its red, yellow or missing nodes then we don't want to continue
															
 
																 - name: "Checking current health for {{ _es_node }} cluster"
															
 
																-  shell: >
															
 
																-    {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig exec "{{ _cluster_pods.stdout.split(' ')[0] }}" -c elasticsearch -n "{{ openshift_logging_elasticsearch_namespace }}" -- es_cluster_health
															
 
																+  command: >
															
 
																+    curl -s -k
															
 
																+    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
															
 
																+    --key {{ _logging_handler_tempdir.stdout }}/admin-key
															
 
																+    https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/health?pretty
															
 
																   register: _pod_status
															
 
																   when: _cluster_pods.stdout_lines | count > 0
															
 
																 - when:
															
 
																   - _pod_status.stdout is defined
															
 
																-  - (_pod_status.stdout | from_json)['status'] in ['red']
															
 
																+  - (_pod_status.stdout | from_json)['status'] in ['yellow', 'red'] or (_pod_status.stdout | from_json)['number_of_nodes'] != _cluster_pods.stdout_lines | count
															
 
																   block:
															
 
																   - name: Set Logging message to manually restart
															
 
																     run_once: true
															
 
																     set_stats:
															
 
																       data:
															
 
																         installer_phase_logging:
															
 
																-          message: "Cluster logging-{{ _cluster_component }} was in a red state and will not be automatically restarted. Please see documentation regarding doing a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
															
 
																+          message: "Cluster logging-{{ _cluster_component }} was not in an optimal state and will not be automatically restarted. Please see documentation regarding doing a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
															
 
																-  - debug: msg="Cluster logging-{{ _cluster_component }} was in a red state and will not be automatically restarted. Please see documentation regarding doing a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
															
 
																+  - debug: msg="Cluster logging-{{ _cluster_component }} was not in an optimal state and will not be automatically restarted. Please see documentation regarding doing a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
															
 
																-- when: _pod_status.stdout is undefined or (_pod_status.stdout | from_json)['status'] in ['green', 'yellow']
															
 
																+- when: _pod_status.stdout is undefined or ( (_pod_status.stdout | from_json)['status'] in ['green'] and (_pod_status.stdout | from_json)['number_of_nodes'] == _cluster_pods.stdout_lines | count )
															
 
																   block:
															
 
																-  # Disable external communication for {{ _cluster_component }}
															
 
																-  - name: Disable external communication for logging-{{ _cluster_component }}
															
 
																-    oc_service:
															
 
																-      state: present
															
 
																-      name: "logging-{{ _cluster_component }}"
															
 
																-      namespace: "{{ openshift_logging_elasticsearch_namespace }}"
															
 
																-      selector:
															
 
																-        component: "{{ _cluster_component }}"
															
 
																-        provider: openshift
															
 
																-        connection: blocked
															
 
																-      labels:
															
 
																-        logging-infra: 'support'
															
 
																-      ports:
															
 
																-      - port: 9200
															
 
																-        targetPort: "restapi"
															
 
																-    when:
															
 
																-    - full_restart_cluster | bool
															
 
																-
															
 
																-  - name: "Disable shard balancing for logging-{{ _cluster_component }} cluster"
															
 
																-    command: >
															
 
																-      {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig exec {{ _cluster_pods.stdout.split(' ')[0] }} -c elasticsearch -n {{ openshift_logging_elasticsearch_namespace }} -- {{ __es_local_curl }} -XPUT 'https://localhost:9200/_cluster/settings' -d '{ "transient": { "cluster.routing.allocation.enable" : "none" } }'
															
 
																-    register: _disable_output
															
 
																-    changed_when: "'\"acknowledged\":true' in _disable_output.stdout"
															
 
																-    when: _cluster_pods.stdout_lines | count > 0
															
 
																-
															
 
																-  # Flush ES
															
 
																-  - name: "Flushing for logging-{{ _cluster_component }} cluster"
															
 
																-    command: >
															
 
																-      {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig exec {{ _cluster_pods.stdout.split(' ')[0] }} -c elasticsearch -n {{ openshift_logging_elasticsearch_namespace }} -- {{ __es_local_curl }} -XPUT 'https://localhost:9200/_flush/synced'
															
 
																-    register: _flush_output
															
 
																-    changed_when: "'\"acknowledged\":true' in _flush_output.stdout"
															
 
																-    when:
															
 
																-    - _cluster_pods.stdout_lines | count > 0
															
 
																-    - full_restart_cluster | bool
															
 
																-
															
 
																   - command: >
															
 
																-      {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig get dc -l component={{ _cluster_component }},provider=openshift -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[*].metadata.name}
															
 
																+      {{ openshift_client_binary }}
															
 
																+      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
															
 
																+      get dc
															
 
																+      -l component={{ _cluster_component }},provider=openshift
															
 
																+      -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																+      -o jsonpath={.items[*].metadata.name}
															
 
																     register: _cluster_dcs
															
 
																-  # If we are currently restarting the "es" cluster we want to check if we are scaling up the number of es nodes
															
 
																-  # If we are currently restarting the "es-ops" cluster we want to check if we are scaling up the number of ops nodes
															
 
																-  # If we've created a new node for that cluster then the appropriate variable will be true, otherwise we default to false
															
 
																-  - set_fact:
															
 
																-      _skip_healthcheck: "{{ ( __logging_scale_up | default(false) ) if _cluster_component == 'es' else ( __logging_ops_scale_up | default(false) ) }}"
															
 
																-
															
 
																   ## restart all dcs for full restart
															
 
																-  - name: "Restart ES node {{ _es_node }}"
															
 
																-    include_tasks: restart_es_node.yml
															
 
																-    with_items: "{{ _cluster_dcs.stdout_lines }}"
															
 
																-    loop_control:
															
 
																-      loop_var: _es_node
															
 
																+  - name: "Performing full cluster restart for {{ _cluster_component }} cluster"
															
 
																+    include_tasks: full_cluster_restart.yml
															
 
																+    vars:
															
 
																+      logging_restart_cluster_dcs: "{{ _cluster_dcs.stdout_lines }}"
															
 
																     when:
															
 
																     - full_restart_cluster | bool
															
 
																-  ## restart the node if it's dc is in the list of nodes to restart?
															
 
																-  - name: "Restart ES node {{ _es_node }}"
															
 
																-    include_tasks: restart_es_node.yml
															
 
																-    with_items: "{{ _restart_logging_nodes }}"
															
 
																-    loop_control:
															
 
																-      loop_var: _es_node
															
 
																+  ## restart the node if it's dc is in the list of nodes to restart
															
 
																+  - name: "Performing rolling cluster restart for {{ _cluster_component }} cluster"
															
 
																+    include_tasks: rolling_cluster_restart.yml
															
 
																+    vars:
															
 
																+      logging_restart_cluster_dcs: "{{ _restart_logging_nodes | intersect(_cluster_dcs.stdout) }}"
															
 
																     when:
															
 
																     - not full_restart_cluster | bool
															
 
																-    - _es_node in _cluster_dcs.stdout
															
 
																-
															
 
																-  ## we may need a new first pod to run against -- fetch them all again
															
 
																-  - command: >
															
 
																-      {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig get pod -l component={{ _cluster_component }},provider=openshift -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name}
															
 
																-    register: _cluster_pods
															
 
																-  - name: "Enable shard balancing for logging-{{ _cluster_component }} cluster"
															
 
																-    command: >
															
 
																-      {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig exec {{ _cluster_pods.stdout.split(' ')[0] }} -c elasticsearch -n {{ openshift_logging_elasticsearch_namespace }} -- {{ __es_local_curl }} -XPUT 'https://localhost:9200/_cluster/settings' -d '{ "transient": { "cluster.routing.allocation.enable" : "all" } }'
															
 
																-    register: _enable_output
															
 
																-    changed_when: "'\"acknowledged\":true' in _enable_output.stdout"
															
 
																-    when: _cluster_pods.stdout != ""
															
 
																-
															
 
																-  # Reenable external communication for {{ _cluster_component }}
															
 
																-  - name: Reenable external communication for logging-{{ _cluster_component }}
															
 
																-    oc_service:
															
 
																-      state: present
															
 
																-      name: "logging-{{ _cluster_component }}"
															
 
																-      namespace: "{{ openshift_logging_elasticsearch_namespace }}"
															
 
																-      selector:
															
 
																-        component: "{{ _cluster_component }}"
															
 
																-        provider: openshift
															
 
																-      labels:
															
 
																-        logging-infra: 'support'
															
 
																-      ports:
															
 
																-      - port: 9200
															
 
																-        targetPort: "restapi"
															
 
																-    when:
															
 
																-    - full_restart_cluster | bool
															
 
																+# remove temp dir
															
 
																+- name: Cleaning up local temp dir
															
 
																+  file:
															
 
																+    path: "{{ _logging_handler_tempdir.stdout }}"
															
 
																+    state: absent
															
 
																+  changed_when: False
															
 
																+  become: false
															
--- a/roles/openshift_logging_elasticsearch/tasks/restart_es_node.yml
+++ b/roles/openshift_logging_elasticsearch/tasks/restart_es_node.yml
@@ -1,10 +1,27 @@
 
																 ---
															
 
																+# Failures of this task are ignored because it can fail on the first node during an install. TODO: should this be skipped when health checks are skipped, and is it required for cluster sanity?
															
 
																+- name: "Disable shard balancing for logging-{{ _cluster_component }} cluster"
															
 
																+  command: >
															
 
																+    curl -s -k
															
 
																+    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
															
 
																+    --key {{ _logging_handler_tempdir.stdout }}/admin-key
															
 
																+    -XPUT 'https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/settings'
															
 
																+    -d '{ "transient": { "cluster.routing.allocation.enable" : "none" } }'
															
 
																+  register: _disable_output
															
 
																+  changed_when:
															
 
																+    - "_disable_output.stdout != ''"
															
 
																+    - (_disable_output.stdout | from_json)['acknowledged'] | bool
															
 
																+  failed_when: false
															
 
																+
															
 
																 - name: "Rolling out new pod(s) for {{ _es_node }}"
															
 
																   command: >
															
 
																-    {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig rollout latest {{ _es_node }} -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																+    {{ openshift_client_binary }}
															
 
																+    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
															
 
																+    rollout latest {{ _es_node }}
															
 
																+    -n {{ openshift_logging_elasticsearch_namespace }}
															
 
																-- when: not _skip_healthcheck | bool
															
 
																-  name: "Waiting for {{ _es_node }} to finish scaling up"
															
 
																+# Always wait for the node to finish scaling up, regardless of _skip_healthcheck
															
 
																+- name: "Waiting for {{ _es_node }} to finish scaling up"
															
 
																   oc_obj:
															
 
																     state: list
															
 
																     name: "{{ _es_node }}"
															
@@ -19,23 +36,62 @@
 
																     - _dc_output.results.results[0].status.updatedReplicas > 0
															
 
																   retries: 60
															
 
																   delay: 30
															
 
																+  failed_when: false
															
 
																-- when: not _skip_healthcheck | bool
															
 
																-  name: Gettings name(s) of replica pod(s)
															
 
																+- when:
															
 
																+    - _dc_output.failed is defined
															
 
																+    - _dc_output.failed
															
 
																+  run_once: true
															
 
																+  set_stats:
															
 
																+    data:
															
 
																+      installer_phase_logging:
															
 
																+        message: "Node {{ _es_node}} in cluster logging-{{ _cluster_component }} was unable to rollout. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
															
 
																+
															
 
																+- name: "Enable shard balancing for logging-{{ _cluster_component }} cluster"
															
 
																   command: >
															
 
																-    {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig get pods -l deploymentconfig={{ _es_node }} -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[*].metadata.name}
															
 
																-  register: _pods
															
 
																-  changed_when: false
															
 
																+    curl -s -k
															
 
																+    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
															
 
																+    --key {{ _logging_handler_tempdir.stdout }}/admin-key
															
 
																+    -XPUT 'https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/settings'
															
 
																+    -d '{ "transient": { "cluster.routing.allocation.enable" : "all" } }'
															
 
																+  register: _enable_output
															
 
																+  changed_when:
															
 
																+    - "_enable_output.stdout != ''"
															
 
																+    - (_enable_output.stdout | from_json)['acknowledged'] | bool
															
 
																+
															
 
																+# Evaluate the result registered in _dc_output and fail if the rollout did not succeed
															
 
																+- name: Evaluating status of rolled out pod
															
 
																+  assert:
															
 
																+    that: not _dc_output.failed
															
 
																+    msg: "Node {{ _es_node}} in cluster logging-{{ _cluster_component }} was unable to rollout. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
															
 
																 - when: not _skip_healthcheck | bool
															
 
																-  name: "Waiting for ES node {{ _es_node }} health to be in ['green', 'yellow']"
															
 
																-  shell: >
															
 
																-    {{ openshift_client_binary }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig exec "{{ _pod }}" -c elasticsearch -n "{{ openshift_logging_elasticsearch_namespace }}" -- es_cluster_health
															
 
																-  with_items: "{{ _pods.stdout.split(' ') }}"
															
 
																-  loop_control:
															
 
																-    loop_var: _pod
															
 
																+  name: "Waiting for ES node {{ _es_node }} health to be in ['green']"
															
 
																+  command: >
															
 
																+    curl -s -k
															
 
																+    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
															
 
																+    --key {{ _logging_handler_tempdir.stdout }}/admin-key
															
 
																+    https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/health?pretty
															
 
																   register: _pod_status
															
 
																-  until: (_pod_status.stdout | from_json)['status'] in ['green', 'yellow']
															
 
																+  until:
															
 
																+    - "_pod_status.stdout != ''"
															
 
																+    - (_pod_status.stdout | from_json)['status'] in ['green']
															
 
																   retries: "{{ __elasticsearch_ready_retries }}"
															
 
																   delay: 30
															
 
																   changed_when: false
															
 
																+  failed_when: false
															
 
																+
															
 
																+# Evaluate the result registered in _pod_status and fail if the cluster did not reach a green state
															
 
																+- when:
															
 
																+    - _pod_status.failed is defined
															
 
																+    - _pod_status.failed
															
 
																+  run_once: true
															
 
																+  set_stats:
															
 
																+    data:
															
 
																+      installer_phase_logging:
															
 
																+        message: "Cluster logging-{{ _cluster_component }} was unable to recover to a green state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
															
 
																+
															
 
																+- name: Evaluating cluster health
															
 
																+  assert:
															
 
																+    that: not _pod_status.failed
															
 
																+    msg: "Cluster logging-{{ _cluster_component }} was unable to recover to a green state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
															
--- a/roles/openshift_logging_elasticsearch/tasks/rolling_cluster_restart.yml
+++ b/roles/openshift_logging_elasticsearch/tasks/rolling_cluster_restart.yml
@@ -0,0 +1,26 @@
 
																+---
															
 
																+# If we are currently restarting the "es" cluster we want to check if we are scaling up the number of es nodes
															
 
																+# If we are currently restarting the "es-ops" cluster we want to check if we are scaling up the number of ops nodes
															
 
																+# If we've created a new node for that cluster then the appropriate variable will be true, otherwise we default to false
															
 
																+- set_fact:
															
 
																+    _skip_healthcheck: "{{ ( __logging_scale_up | default(false) ) if _cluster_component == 'es' else ( __logging_ops_scale_up | default(false) ) }}"
															
 
																+
															
 
																+# Flush ES
															
 
																+# It is possible for this to fail on a brand new cluster, so don't fail then
															
 
																+- name: "Flushing for logging-{{ _cluster_component }} cluster"
															
 
																+  command: >
															
 
																+    curl -s -k
															
 
																+    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
															
 
																+    --key {{ _logging_handler_tempdir.stdout }}/admin-key
															
 
																+    -XPOST 'https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_flush/synced'
															
 
																+  register: _flush_output
															
 
																+  changed_when:
															
 
																+  - "_flush_output.stdout != ''"
															
 
																+  - (_flush_output.stdout | from_json)['_shards']['successful'] > 0
															
 
																+  failed_when: false
															
 
																+
															
 
																+# Loop over each DC for restart_es_node.yml
															
 
																+- include_tasks: restart_es_node.yml
															
 
																+  with_items: "{{ logging_restart_cluster_dcs }}"
															
 
																+  loop_control:
															
 
																+    loop_var: _es_node
															
--- a/roles/openshift_logging_elasticsearch/vars/main.yml
+++ b/roles/openshift_logging_elasticsearch/vars/main.yml
@@ -5,7 +5,7 @@ __kibana_index_modes: ["unique", "shared_ops"]
 
																 __es_local_curl: "curl -s --cacert /etc/elasticsearch/secret/admin-ca --cert /etc/elasticsearch/secret/admin-cert --key /etc/elasticsearch/secret/admin-key"
															
 
																-__elasticsearch_ready_retries: "{{ openshift_logging_elasticsearch_poll_timeout_minutes | default(20) | int * 2 }}"
															
 
																+__elasticsearch_ready_retries: "{{ openshift_logging_elasticsearch_poll_timeout_minutes | default(60) | int * 2 }}"
															
 
																 # TODO: integrate these
															
 
																 es_node_quorum: "{{ openshift_logging_elasticsearch_replica_count | int/2 + 1 }}"