full_cluster_restart.yml 8.1 KB


  1. ---
  # Take the logging-<component> service out of rotation for the restart. The
  # extra `connection: blocked` selector entry is the only difference from the
  # re-enable task at the end of this file.
  # NOTE(review): presumably no Elasticsearch pod carries that label, so the
  # service matches no endpoints while it is set - confirm.
  - name: "Disable external communication for logging-{{ _cluster_component }}"
    oc_service:
      name: "logging-{{ _cluster_component }}"
      namespace: "{{ openshift_logging_elasticsearch_namespace }}"
      state: present
      labels:
        logging-infra: support
      selector:
        component: "{{ _cluster_component }}"
        provider: openshift
        connection: blocked
      ports:
        - port: 9200
          targetPort: restapi
  # Look up the names of the Running pods of this component. Later tasks exec
  # into the first name of the space-separated result.
  - name: "Get running pods for logging-{{ _cluster_component }} cluster"
    command: >
      {{ openshift_client_binary }}
      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
      get pod
      -l component={{ _cluster_component }},provider=openshift
      -n {{ openshift_logging_elasticsearch_namespace }}
      -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name}
    register: _cluster_pods
    # Read-only query: never report this task as a change.
    changed_when: false
  # Turn shard allocation off through a transient cluster setting
  # (cluster.routing.allocation.enable = none), executed inside the first pod
  # returned by the preceding pod lookup.
  - name: "Disable shard balancing for logging-{{ _cluster_component }} cluster"
    register: _disable_output
    command: >
      {{ openshift_client_binary }}
      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
      exec {{ _cluster_pods.stdout.split(' ')[0] }}
      -c elasticsearch
      -n {{ openshift_logging_elasticsearch_namespace }}
      -- es_util --query=_cluster/settings -XPUT
      -d '{ "transient": { "cluster.routing.allocation.enable" : "none" } }'
    # Changed only when a non-empty response reports `acknowledged` as true.
    changed_when:
      - "_disable_output.stdout != ''"
      - (_disable_output.stdout | from_json)['acknowledged'] | bool
  # Synced flush of Elasticsearch before the nodes are stopped.
  # Documented as best effort: a failure here is tolerated, hence
  # `failed_when: false`.
  - name: "Flushing for logging-{{ _cluster_component }} cluster"
    register: _flush_output
    command: >
      {{ openshift_client_binary }}
      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
      exec {{ _cluster_pods.stdout.split(' ')[0] }}
      -c elasticsearch
      -n {{ openshift_logging_elasticsearch_namespace }}
      -- es_util --query=_flush/synced -XPOST
    failed_when: false
    # Changed only when the response is non-empty and at least one shard
    # reports a successful flush.
    changed_when:
      - "_flush_output.stdout != ''"
      - (_flush_output.stdout | from_json)['_shards']['successful'] > 0
  # Stop all nodes, then roll out all nodes.
  # First step: store each DC's current .spec.replicas in a
  # `prior-replica-count` annotation so the scale-up task can restore it.
  - name: Ready all nodes for scale down
    with_items: "{{ logging_restart_cluster_dcs }}"
    loop_control:
      loop_var: _es_node
    shell: >
      {{ openshift_client_binary }}
      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
      -n {{ openshift_logging_elasticsearch_namespace }}
      annotate "dc/{{ _es_node }}"
      prior-replica-count=$({{ openshift_client_binary }}
      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
      -n {{ openshift_logging_elasticsearch_namespace }}
      get "dc/{{ _es_node }}"
      -o jsonpath='{.spec.replicas}')
      --overwrite
  # Scale every DC in logging_restart_cluster_dcs down to zero replicas.
  - name: Scale down all nodes
    with_items: "{{ logging_restart_cluster_dcs }}"
    loop_control:
      loop_var: _es_node
    oc_scale:
      kind: dc
      name: "{{ _es_node }}"
      namespace: "{{ openshift_logging_elasticsearch_namespace }}"
      replicas: 0
  # Trigger `rollout latest` for every DC in logging_restart_cluster_dcs.
  - name: Rollout all updated DCs
    with_items: "{{ logging_restart_cluster_dcs }}"
    loop_control:
      loop_var: _es_node
    command: >
      {{ openshift_client_binary }}
      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
      rollout latest {{ _es_node }}
      -n {{ openshift_logging_elasticsearch_namespace }}
  # Restore each DC to the replica count read back from its
  # `prior-replica-count` annotation.
  - name: Scale up all nodes to previous replicas
    with_items: "{{ logging_restart_cluster_dcs }}"
    loop_control:
      loop_var: _es_node
    shell: >
      {{ openshift_client_binary }}
      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
      -n {{ openshift_logging_elasticsearch_namespace }}
      scale "dc/{{ _es_node }}"
      --replicas=$({{ openshift_client_binary }}
      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
      -n {{ openshift_logging_elasticsearch_namespace }}
      get "dc/{{ _es_node }}"
      -o jsonpath='{.metadata.annotations.prior-replica-count}')
  # Wait for all nodes to be deployed/ready again.
  # Each DC is polled up to 60 times, 30 seconds apart, until it reports at
  # least one ready replica and one updated replica.
  # The task name deliberately avoids the loop variable: `_es_node` is only
  # bound per loop item and is not available when the task name is rendered.
  - name: "Waiting for logging-{{ _cluster_component }} nodes to finish scaling up"
    oc_obj:
      state: list
      name: "{{ _es_node }}"
      namespace: "{{ openshift_logging_elasticsearch_namespace }}"
      kind: dc
    register: _dc_output
    until:
      - _dc_output.results.results[0].status is defined
      - _dc_output.results.results[0].status.readyReplicas is defined
      - _dc_output.results.results[0].status.readyReplicas > 0
      - _dc_output.results.results[0].status.updatedReplicas is defined
      - _dc_output.results.results[0].status.updatedReplicas > 0
    retries: 60
    delay: 30
    with_items: "{{ logging_restart_cluster_dcs }}"
    loop_control:
      loop_var: _es_node
    # Failure is not raised here; the following task inspects
    # _dc_output.failed and records a message instead.
    failed_when: false
  # When the scale-up wait registered a failure, record a message under
  # installer_phase_logging via set_stats.
  - name: Manual intervention required
    set_stats:
      data:
        installer_phase_logging:
          message: "Node in cluster logging-{{ _cluster_component }} was unable to rollout. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
    run_once: true
    when:
      - _dc_output.failed is defined
      - _dc_output.failed
  # Refresh the list of Running pods after the rollout. The health and
  # allocation tasks below exec into the first name of the space-separated
  # result.
  - name: "Get running pods for logging-{{ _cluster_component }} cluster after rollout"
    command: >
      {{ openshift_client_binary }}
      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
      get pod
      -l component={{ _cluster_component }},provider=openshift
      -n {{ openshift_logging_elasticsearch_namespace }}
      -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name}
    register: _cluster_pods
    # Read-only query: never report this task as a change.
    changed_when: false
  # Poll es_cluster_health inside the first running pod, every 30 seconds,
  # until the reported status is yellow or green. Neither a change nor a
  # failure is reported by this task itself; the following task inspects
  # _pod_status.failed.
  - name: Wait for cluster to be in at least yellow state
    register: _pod_status
    command: >
      {{ openshift_client_binary }}
      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
      exec {{ _cluster_pods.stdout.split(' ')[0] }}
      -c elasticsearch
      -n {{ openshift_logging_elasticsearch_namespace }}
      -- es_cluster_health
    until:
      - "_pod_status.stdout != ''"
      - (_pod_status.stdout | from_json)['status'] in ['yellow', 'green']
    delay: 30
    retries: "{{ __elasticsearch_ready_retries }}"
    changed_when: false
    failed_when: false
  # When the yellow-state wait registered a failure, record a message under
  # installer_phase_logging via set_stats. Named like the sibling
  # "Manual intervention required" task so it is identifiable in play output.
  - name: "Manual intervention required: logging-{{ _cluster_component }} did not reach yellow state"
    set_stats:
      data:
        installer_phase_logging:
          message: "Cluster logging-{{ _cluster_component }} was unable to recover to at least a yellow state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
    run_once: true
    when:
      - _pod_status.failed is defined
      - _pod_status.failed
  # Turn shard allocation back on through a transient cluster setting
  # (cluster.routing.allocation.enable = all), executed inside the first
  # running pod.
  - name: "Enable shard balancing for logging-{{ _cluster_component }} cluster"
    register: _enable_output
    command: >
      {{ openshift_client_binary }}
      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
      exec {{ _cluster_pods.stdout.split(' ')[0] }}
      -c elasticsearch
      -n {{ openshift_logging_elasticsearch_namespace }}
      -- es_util --query=_cluster/settings -XPUT
      -d '{ "transient": { "cluster.routing.allocation.enable" : "all" } }'
    # Changed only when a non-empty response reports `acknowledged` as true.
    changed_when:
      - "_enable_output.stdout != ''"
      - (_enable_output.stdout | from_json)['acknowledged'] | bool
  # Open question from the original author: skip this healthcheck for a full
  # cluster restart always, since it could take a long time to recover?
  #
  # Poll es_cluster_health inside the first running pod, every 30 seconds,
  # until the reported status is green. The check is cluster-wide and this
  # task has no loop, so the name refers to the cluster rather than to the
  # per-node loop variable `_es_node`, which is not bound here.
  - name: "Waiting for logging-{{ _cluster_component }} cluster health to be in ['green']"
    command: >
      {{ openshift_client_binary }}
      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
      exec {{ _cluster_pods.stdout.split(' ')[0] }}
      -c elasticsearch
      -n {{ openshift_logging_elasticsearch_namespace }}
      -- es_cluster_health
    register: _pod_status
    until:
      - "_pod_status.stdout != ''"
      - (_pod_status.stdout | from_json)['status'] in ['green']
    retries: "{{ __elasticsearch_ready_retries }}"
    delay: 30
    changed_when: false
    # Failure is not raised here; the following task inspects
    # _pod_status.failed and records a message instead.
    failed_when: false
  # When the green-state wait registered a failure, record a message under
  # installer_phase_logging via set_stats. Named like the sibling
  # "Manual intervention required" task so it is identifiable in play output.
  - name: "Manual intervention required: logging-{{ _cluster_component }} did not reach green state"
    set_stats:
      data:
        installer_phase_logging:
          message: "Cluster logging-{{ _cluster_component }} was unable to recover to a green state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart."
    run_once: true
    when:
      - _pod_status.failed is defined
      - _pod_status.failed
  # Put the logging-<component> service back into rotation: same definition
  # as the disable task at the top of this file, minus the
  # `connection: blocked` selector entry.
  - name: "Reenable external communication for logging-{{ _cluster_component }}"
    oc_service:
      name: "logging-{{ _cluster_component }}"
      namespace: "{{ openshift_logging_elasticsearch_namespace }}"
      state: present
      labels:
        logging-infra: support
      selector:
        component: "{{ _cluster_component }}"
        provider: openshift
      ports:
        - port: 9200
          targetPort: restapi