restart_es_node.yml

---
# Ignore failures from this call: it can fail on the first node during an install.
# Open questions: should we skip this when we are skipping health checks? Is it required for cluster sanity?
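# Setting cluster.routing.allocation.enable to "none" keeps Elasticsearch from relocating
# shards while the node is down, so data is not shuffled around during the restart.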
  3. - name: "Disable shard balancing for logging-{{ _cluster_component }} cluster"
  4. command: >
  5. curl -s -k
  6. --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
  7. --key {{ _logging_handler_tempdir.stdout }}/admin-key
  8. -XPUT 'https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/settings'
  9. -d '{ "transient": { "cluster.routing.allocation.enable" : "none" } }'
  10. register: _disable_output
  11. changed_when:
  12. - "_disable_output.stdout != ''"
  13. - (_disable_output.stdout | from_json)['acknowledged'] | bool
  14. failed_when: false
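
# "oc rollout latest" starts a new deployment of this node's DeploymentConfig. There is no
# failed_when override here, so an error from the client fails the play for this node immediately.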
  15. - name: "Rolling out new pod(s) for {{ _es_node }}"
  16. command: >
  17. {{ openshift_client_binary }}
  18. --config={{ openshift.common.config_base }}/master/admin.kubeconfig
  19. rollout latest {{ _es_node }}
  20. -n {{ openshift_logging_elasticsearch_namespace }}

# always wait for this to scale up
- name: "Waiting for {{ _es_node }} to finish scaling up"
  oc_obj:
    state: list
    name: "{{ _es_node }}"
    namespace: "{{ openshift_logging_elasticsearch_namespace }}"
    kind: dc
  register: _dc_output
  until:
  - _dc_output.results.results[0].status is defined
  - _dc_output.results.results[0].status.readyReplicas is defined
  - _dc_output.results.results[0].status.readyReplicas > 0
  - _dc_output.results.results[0].status.updatedReplicas is defined
  - _dc_output.results.results[0].status.updatedReplicas > 0
  retries: 60
  delay: 30
  failed_when: false
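
# If the rollout never produced a ready, updated replica, record a message via set_stats so it
# can surface in the installer's status output, rather than failing right away. This lets shard
# balancing be re-enabled below before the assert aborts the run.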
- when:
  - _dc_output.failed is defined
  - _dc_output.failed
  run_once: true
  set_stats:
    data:
      installer_phase_logging:
        message: "Node {{ _es_node }} in cluster logging-{{ _cluster_component }} was unable to rollout. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling' }} cluster restart."
  46. - name: "Enable shard balancing for logging-{{ _cluster_component }} cluster"
  47. command: >
  48. curl -s -k
  49. --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
  50. --key {{ _logging_handler_tempdir.stdout }}/admin-key
  51. -XPUT 'https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/settings'
  52. -d '{ "transient": { "cluster.routing.allocation.enable" : "all" } }'
  53. register: _enable_output
  54. changed_when:
  55. - "_enable_output.stdout != ''"
  56. - (_enable_output.stdout | from_json)['acknowledged'] | bool

# evaluate the RC for _dc_output
- name: Evaluating status of rolled out pod
  assert:
    that: not _dc_output.failed
    msg: "Node {{ _es_node }} in cluster logging-{{ _cluster_component }} was unable to rollout. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling' }} cluster restart."
- when: not _skip_healthcheck | bool
  name: "Waiting for ES node {{ _es_node }} health to be in ['green']"
  command: >
    curl -s -k
    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
    --key {{ _logging_handler_tempdir.stdout }}/admin-key
    https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/health?pretty
  register: _pod_status
  until:
  - "_pod_status.stdout != ''"
  - (_pod_status.stdout | from_json)['status'] in ['green']
  retries: "{{ __elasticsearch_ready_retries }}"
  delay: 30
  changed_when: false
  failed_when: false

# evaluate RC for _pod_status
- when:
  - _pod_status.failed is defined
  - _pod_status.failed
  run_once: true
  set_stats:
    data:
      installer_phase_logging:
        message: "Cluster logging-{{ _cluster_component }} was unable to recover to a green state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling' }} cluster restart."
- name: Evaluating cluster health
  assert:
    that: not _pod_status.failed
    msg: "Cluster logging-{{ _cluster_component }} was unable to recover to a green state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling' }} cluster restart."