---
# Ignore failures here: it is possible for this call to fail on the first node
# during installation. Open questions: should we skip this when health checks
# are skipped, and is it required for cluster sanity?
- when: not _skip_healthcheck | bool
  name: "Disable shard balancing for logging-{{ _cluster_component }} cluster"
  command: >
    curl -s -k
    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
    --key {{ _logging_handler_tempdir.stdout }}/admin-key
    -XPUT 'https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/settings'
    -d '{ "transient": { "cluster.routing.allocation.enable" : "none" } }'
  register: _disable_output
  changed_when:
  - "_disable_output.stdout != ''"
  - (_disable_output.stdout | from_json)['acknowledged'] | bool
  failed_when: false
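
# A successful settings update returns JSON like '{"acknowledged": true, ...}',
# which is what the changed_when conditions above check for.

# Trigger a new deployment for this node's DeploymentConfig; note this runs
# even when health checks are skipped.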
- name: "Rolling out new pod(s) for {{ _es_node }}"
  command: >
    {{ openshift_client_binary }}
    --config={{ openshift.common.config_base }}/master/admin.kubeconfig
    rollout latest {{ _es_node }}
    -n {{ openshift_logging_elasticsearch_namespace }}
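
# Poll the DeploymentConfig until it reports at least one ready and one updated
# replica, waiting up to 30 minutes (60 retries x 30s delay).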
- when: not _skip_healthcheck | bool
  name: "Waiting for {{ _es_node }} to finish scaling up"
  oc_obj:
    state: list
    name: "{{ _es_node }}"
    namespace: "{{ openshift_logging_elasticsearch_namespace }}"
    kind: dc
  register: _dc_output
  until:
  - _dc_output.results.results[0].status is defined
  - _dc_output.results.results[0].status.readyReplicas is defined
  - _dc_output.results.results[0].status.readyReplicas > 0
  - _dc_output.results.results[0].status.updatedReplicas is defined
  - _dc_output.results.results[0].status.updatedReplicas > 0
  retries: 60
  delay: 30
  failed_when: false
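
# failed_when: false above keeps a rollout timeout from aborting the play here;
# the failure is captured in _dc_output, recorded via set_stats, and only
# raised by the assert further down, after shard balancing is re-enabled.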
- when:
  - _dc_output.failed is defined
  - _dc_output.failed
  run_once: true
  set_stats:
    data:
      installer_phase_logging:
        message: "Node {{ _es_node }} in cluster logging-{{ _cluster_component }} was unable to roll out. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling' }} cluster restart."
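
# Re-enable shard balancing even if the rollout failed, so a stuck node does
# not leave the cluster with allocation disabled.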
- when: not _skip_healthcheck | bool
  name: "Enable shard balancing for logging-{{ _cluster_component }} cluster"
  command: >
    curl -s -k
    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
    --key {{ _logging_handler_tempdir.stdout }}/admin-key
    -XPUT 'https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/settings'
    -d '{ "transient": { "cluster.routing.allocation.enable" : "all" } }'
  register: _enable_output
  changed_when:
  - "_enable_output.stdout != ''"
  - (_enable_output.stdout | from_json)['acknowledged'] | bool

# Evaluate the RC for _dc_output: now that shard balancing is re-enabled, a
# failed rollout can safely fail the play.
- name: Evaluating status of rolled out pod
  assert:
    that: _dc_output.failed is undefined or not _dc_output.failed
    msg: "Node {{ _es_node }} in cluster logging-{{ _cluster_component }} was unable to roll out. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling' }} cluster restart."
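
# Block until cluster health returns to green; total wait is up to
# __elasticsearch_ready_retries x 30s.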
- when: not _skip_healthcheck | bool
  name: "Waiting for ES node {{ _es_node }} health to be in ['green']"
  command: >
    curl -s -k
    --cert {{ _logging_handler_tempdir.stdout }}/admin-cert
    --key {{ _logging_handler_tempdir.stdout }}/admin-key
    https://logging-{{ _cluster_component }}.{{ openshift_logging_elasticsearch_namespace }}.svc:9200/_cluster/health?pretty
  register: _cluster_status
  until:
  - "_cluster_status.stdout != ''"
  - (_cluster_status.stdout | from_json)['status'] in ['green']
  retries: "{{ __elasticsearch_ready_retries }}"
  delay: 30
  changed_when: false
  failed_when: false

# Evaluate the RC for _cluster_status: record the failure message for the
# installer phase summary before the assert below fails the play.
- when:
  - _cluster_status.failed is defined
  - _cluster_status.failed
  run_once: true
  set_stats:
    data:
      installer_phase_logging:
        message: "Cluster logging-{{ _cluster_component }} was unable to recover to a green state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling' }} cluster restart."

- name: Evaluating cluster health
  assert:
    that: _cluster_status.failed is undefined or not _cluster_status.failed
    msg: "Cluster logging-{{ _cluster_component }} was unable to recover to a green state. Please see documentation regarding recovering during a {{ 'full' if full_restart_cluster | bool else 'rolling' }} cluster restart."
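
# Note: this task file appears to be included once per Elasticsearch
# DeploymentConfig during a cluster restart; _es_node, _cluster_component,
# _skip_healthcheck, full_restart_cluster and _logging_handler_tempdir are
# expected to be set by the caller.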