
Merge pull request #7097 from ewolinetz/logging_fresh_lg_cluster_fix

Automatic merge from submit-queue.

Whenever we create a new es node, ignore health checks; change prometheus pw gen for increased secret idempotency

Addresses https://bugzilla.redhat.com/show_bug.cgi?id=1540099

Whenever we run with a cluster size > 1, the number of nodes required for recovery is also > 1. On a fresh install, then, the cluster will never report as started, because the required number of nodes is not met.
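
For reference, the recovery quorum here comes from standard Elasticsearch gateway settings; a minimal illustrative sketch (the exact values templated by the role are an assumption):

    # elasticsearch.yml (illustrative): with a cluster size of 3, recovery does not
    # begin until 3 nodes have joined, so a single freshly created node never reports ready.
    gateway:
      recover_after_nodes: 3
      expected_nodes: 3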

Whenever we are creating a new node, we skip the health-check wait so that the logging playbook can complete and we can roll out all of the updated nodes.

Also addresses prometheus password generation so that rerunning the playbook no longer changes the secret, which would otherwise trigger a full rollout of the cluster (the rollout logic assumes the keys/certs have changed).
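
In outline, the new flow persists the generated password alongside the certs and regenerates it only when the file does not yet exist; a condensed, lightly commented sketch of the changed tasks (the full diff is below):

    # Reuse the password file left behind by a previous run, if any
    - stat: path="{{ generated_certs_dir }}/passwd.yml"
      register: passwd_file
    # Generate a new random password only on the first run
    - template:
        src: passwd.j2
        dest: "{{ generated_certs_dir }}/passwd.yml"
      when: not passwd_file.stat.exists
    # Read the (now stable) value back for use when rolling out the deploymentconfigs
    - slurp:
        src: "{{ generated_certs_dir }}/passwd.yml"
      register: _logging_metrics_proxy_passwd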
OpenShift Merge Robot committed 7 years ago · commit b62c397f06

+ 2 - 0
roles/openshift_logging/tasks/install_logging.yaml

@@ -131,6 +131,7 @@
     openshift_logging_elasticsearch_storage_type: "{{ elasticsearch_storage_type | default(default_elasticsearch_storage_type) }}"
     openshift_logging_elasticsearch_pvc_pv_selector: "{{ openshift_logging_es_pv_selector }}"
     openshift_logging_elasticsearch_pvc_storage_class_name: "{{ openshift_logging_es_pvc_storage_class_name | default() }}"
+    __logging_scale_up: True
 
   with_sequence: count={{ openshift_logging_es_cluster_size | int - openshift_logging_facts.elasticsearch.deploymentconfigs.keys() | count }}
   loop_control:
@@ -221,6 +222,7 @@
     openshift_logging_es_hostname: "{{ openshift_logging_es_ops_hostname }}"
     openshift_logging_es_edge_term_policy: "{{ openshift_logging_es_ops_edge_term_policy | default('') }}"
     openshift_logging_es_allow_external: "{{ openshift_logging_es_ops_allow_external }}"
+    __logging_ops_scale_up: True
 
   with_sequence: count={{ openshift_logging_es_ops_cluster_size | int - openshift_logging_facts.elasticsearch_ops.deploymentconfigs.keys() | count }}
   loop_control:

+ 14 - 7
roles/openshift_logging_elasticsearch/tasks/main.yaml

@@ -138,15 +138,22 @@
   - "prometheus_out.stderr | length > 0"
   - "'already exists' not in prometheus_out.stderr"
 
-- set_fact:
-    _logging_metrics_proxy_passwd: "{{ 16 | lib_utils_oo_random_word | b64encode }}"
+- name: Checking for passwd.yml
+  stat: path="{{ generated_certs_dir }}/passwd.yml"
+  register: passwd_file
+  check_mode: no
 
-- template:
+- when: not passwd_file.stat.exists
+  template:
     src: passwd.j2
-    dest: "{{mktemp.stdout}}/passwd.yml"
+    dest: "{{ generated_certs_dir }}/passwd.yml"
   vars:
     logging_user_name: "{{ openshift_logging_elasticsearch_prometheus_sa }}"
-    logging_user_passwd: "{{ _logging_metrics_proxy_passwd }}"
+    logging_user_passwd: "{{ 16 | lib_utils_oo_random_word | b64encode }}"
+
+- slurp:
+    src: "{{ generated_certs_dir }}/passwd.yml"
+  register: _logging_metrics_proxy_passwd
 
 # View role and binding
 - name: Generate logging-elasticsearch-view-role
@@ -296,7 +303,7 @@
     - name: admin.jks
       path: "{{ generated_certs_dir }}/system.admin.jks"
     - name: passwd.yml
-      path: "{{mktemp.stdout}}/passwd.yml"
+      path: "{{ generated_certs_dir }}/passwd.yml"
 
 # services
 - name: Set logging-{{ es_component }}-cluster service
@@ -433,7 +440,7 @@
     es_container_security_context: "{{ _es_containers.elasticsearch.securityContext if _es_containers is defined and 'elasticsearch' in _es_containers and 'securityContext' in _es_containers.elasticsearch else None }}"
     deploy_type: "{{ openshift_logging_elasticsearch_deployment_type }}"
     es_replicas: 1
-    basic_auth_passwd: "{{ _logging_metrics_proxy_passwd | b64decode }}"
+    basic_auth_passwd: "{{ ( _logging_metrics_proxy_passwd['content'] | b64decode | from_yaml )[openshift_logging_elasticsearch_prometheus_sa]['passwd'] }}"
     es_number_of_shards: "{{ openshift_logging_es_number_of_shards | default(1) }}"
     es_number_of_replicas: "{{ openshift_logging_es_number_of_replicas| default(0) }}"
 
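
The basic_auth_passwd lookup above assumes that passwd.j2 renders a YAML mapping keyed by the prometheus service-account name; a minimal sketch of that assumed template layout:

    # passwd.j2 (assumed layout, inferred from the vars passed to the template
    # and from the from_yaml lookup used for basic_auth_passwd):
    {{ logging_user_name }}:
      passwd: {{ logging_user_passwd }}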

+ 7 - 0
roles/openshift_logging_elasticsearch/tasks/restart_cluster.yml

@@ -65,6 +65,12 @@
       {{ openshift_client_binary }} get dc -l component={{ _cluster_component }},provider=openshift -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[*].metadata.name}
     register: _cluster_dcs
 
+  # If we are currently restarting the "es" cluster we want to check if we are scaling up the number of es nodes
+  # If we are currently restarting the "es-ops" cluster we want to check if we are scaling up the number of ops nodes
+  # If we've created a new node for that cluster then the appropriate variable will be true, otherwise we default to false
+  - set_fact:
+      _skip_healthcheck: "{{ __logging_scale_up | default(false) if _cluster_component == 'es' else __logging_ops_scale_up | default(false) }}"
+
   ## restart all dcs for full restart
   - name: "Restart ES node {{ _es_node }}"
     include_tasks: restart_es_node.yml
@@ -94,6 +100,7 @@
       {{ openshift_client_binary }} exec {{ _cluster_pods.stdout.split(' ')[0] }} -c elasticsearch -n {{ openshift_logging_elasticsearch_namespace }} -- {{ __es_local_curl }} -XPUT 'https://localhost:9200/_cluster/settings' -d '{ "transient": { "cluster.routing.allocation.enable" : "all" } }'
     register: _enable_output
     changed_when: "'\"acknowledged\":true' in _enable_output.stdout"
+    when: _cluster_pods.stdout != ""
 
   # Reenable external communication for {{ _cluster_component }}
   - name: Reenable external communication for logging-{{ _cluster_component }}

+ 6 - 3
roles/openshift_logging_elasticsearch/tasks/restart_es_node.yml

@@ -3,7 +3,8 @@
   command: >
     {{ openshift_client_binary }} rollout latest {{ _es_node }} -n {{ openshift_logging_elasticsearch_namespace }}
 
-- name: "Waiting for {{ _es_node }} to finish scaling up"
+- when: not _skip_healthcheck | bool
+  name: "Waiting for {{ _es_node }} to finish scaling up"
   oc_obj:
     state: list
     name: "{{ _es_node }}"
@@ -19,12 +20,14 @@
   retries: 60
   delay: 30
 
-- name: Gettings name(s) of replica pod(s)
+- when: not _skip_healthcheck | bool
+  name: Getting name(s) of replica pod(s)
   command: >
     {{ openshift_client_binary }} get pods -l deploymentconfig={{ _es_node }} -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[*].metadata.name}
   register: _pods
 
-- name: "Waiting for ES to be ready for {{ _es_node }}"
+- when: not _skip_healthcheck | bool
+  name: "Waiting for ES to be ready for {{ _es_node }}"
   shell: >
     {{ openshift_client_binary }} exec "{{ _pod }}" -c elasticsearch -n "{{ openshift_logging_elasticsearch_namespace }}" -- es_cluster_health
   with_items: "{{ _pods.stdout.split(' ') }}"