Browse Source

Merge pull request #8563 from vrutkovs/control-plane-healthcheck-pods

control plane: ensure all control plane pods have passed their health checks
Vadim Rutkovsky 6 years ago
parent
commit
ed775ec280

+ 2 - 2
roles/etcd/files/etcd.yaml

@@ -32,8 +32,8 @@ spec:
        name: master-data
     livenessProbe:
       exec:
-      initialDelaySeconds: 15
-      timeoutSeconds: 10
+      initialDelaySeconds: 45
+    terminationGracePeriodSeconds: 0
   volumes:
   - name: master-config
     hostPath:

+ 14 - 0
roles/openshift_control_plane/defaults/main.yml

@@ -152,3 +152,17 @@ l_new_config_clusterNetworks:
 # added so that it can lay down the static pod definitions in a configurable place
 openshift_control_plane_static_pod_location: /etc/origin/node/pods/
 openshift_control_plane_apply_cluster_signing_config: True
+
+l_core_api_list:  # OpenShift API groups; order and values are significant to consumers
+- 'apps.openshift.io'
+- 'authorization.openshift.io'
+- 'build.openshift.io'
+- 'image.openshift.io'
+- 'network.openshift.io'
+- 'oauth.openshift.io'
+- 'project.openshift.io'
+- 'quota.openshift.io'
+- 'route.openshift.io'
+- 'security.openshift.io'
+- 'template.openshift.io'
+- 'user.openshift.io'

+ 97 - 24
roles/openshift_control_plane/tasks/main.yml

@@ -183,23 +183,26 @@
   - fail:
       msg: Node start failed.
 
-- name: Verify that the control plane is running
-  command: >
-    curl -k {{ openshift.master.api_url }}/healthz/ready
-  args:
-    # Disables the following warning:
-    # Consider using get_url or uri module rather than running curl
-    warn: no
-  register: control_plane_health
-  until: control_plane_health.stdout == 'ok'
+- name: Wait for control plane pods to appear
+  oc_obj:
+    state: list
+    kind: pod
+    name: "master-{{ item }}-{{ openshift.node.nodename | lower }}"
+    namespace: kube-system
+  register: control_plane_pods
+  until:
+  - "'results' in control_plane_pods"
+  - "'results' in control_plane_pods.results"
+  - control_plane_pods.results.results | length > 0
   retries: 60
   delay: 5
-  changed_when: false
-  # Ignore errors so we can log troubleshooting info on failures.
-  ignore_errors: yes
+  with_items:
+  - "{{ 'etcd' if inventory_hostname in groups['oo_etcd_to_config'] else omit }}"
+  - api
+  - controllers
+  ignore_errors: true
 
-# Capture debug output here to simplify triage
-- when: control_plane_health.stdout != 'ok'
+- when: control_plane_pods is failed
   block:
   - name: Check status in the kube-system namespace
     command: >
@@ -211,10 +214,10 @@
   - name: Get pods in the kube-system namespace
     command: >
       {{ openshift_client_binary }} get pods --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n kube-system -o wide
-    register: control_plane_pods
+    register: control_plane_pods_list
     ignore_errors: true
   - debug:
-      msg: "{{ control_plane_pods.stdout_lines }}"
+      msg: "{{ control_plane_pods_list.stdout_lines }}"
   - name: Get events in the kube-system namespace
     command: >
       {{ openshift_client_binary }} get events --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n kube-system
@@ -222,6 +225,54 @@
     ignore_errors: true
   - debug:
       msg: "{{ control_plane_events.stdout_lines }}"
+  - name: Get node logs
+    command: journalctl --no-pager -n 300 -u {{ openshift_service_type }}-node
+    register: logs_node
+    ignore_errors: true
+  - debug:
+      msg: "{{ logs_node.stdout_lines }}"
+  - name: Report control plane errors
+    fail:
+      msg: Control plane pods didn't come up
+
+# Poll each control plane pod (looked up by name in kube-system) until its
+# "Ready" condition evaluates to true; up to 60 attempts, 5 seconds apart.
+- name: Wait for all control plane pods to become ready
+  oc_obj:
+    state: list
+    kind: pod
+    name: "master-{{ item }}-{{ openshift.node.nodename | lower }}"
+    namespace: kube-system
+  register: control_plane_health
+  until:
+  - "'results' in control_plane_health"
+  - "'results' in control_plane_health.results"
+  - control_plane_health.results.results | length > 0
+  - "'status' in control_plane_health.results.results[0]"
+  - "'conditions' in control_plane_health.results.results[0].status"
+  - control_plane_health.results.results[0].status.conditions | selectattr('type', 'match', '^Ready$') | map(attribute='status') | join | bool == True
+  retries: 60
+  delay: 5
+  with_items:
+  - "{{ 'etcd' if inventory_hostname in groups['oo_etcd_to_config'] else omit }}"
+  - api
+  - controllers
+  # Do not abort here: the "control_plane_health is failed" block that follows
+  # gathers troubleshooting output and then fails the play explicitly.
+  ignore_errors: true
+
+- when: control_plane_health is failed
+  block:
+  # Query the pod list again on this path: control_plane_pods_list is otherwise
+  # only registered inside the earlier "pods didn't come up" failure block.
+  - name: Get pods in the kube-system namespace
+    command: >
+      {{ openshift_client_binary }} get pods --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n kube-system -o wide
+    register: control_plane_pods_list
+    ignore_errors: true
+  - debug:
+      msg: "{{ control_plane_pods_list.stdout_lines }}"
+  - name: Get events in the kube-system namespace
+    command: >
+      {{ openshift_client_binary }} get events --config={{ openshift.common.config_base }}/master/admin.kubeconfig -n kube-system
+    register: control_plane_events
+    ignore_errors: true
+  - debug:
+      msg: "{{ control_plane_events.stdout_lines }}"
+  - name: Get node logs
+    command: journalctl --no-pager -n 300 -u {{ openshift_service_type }}-node
+    register: logs_node
+    ignore_errors: true
+  - debug:
+      msg: "{{ logs_node.stdout_lines }}"
   - name: Get API logs
     command: >
       /usr/local/bin/master-logs api api
@@ -229,14 +280,36 @@
     ignore_errors: true
   - debug:
       msg: "{{ control_plane_logs_api.stdout_lines }}"
-  - name: Get node logs
-    command: journalctl --no-pager -n 300 -u {{ openshift_service_type }}-node
-    register: control_plane_logs_node
+  - name: Get controllers logs
+    command: >
+      /usr/local/bin/master-logs controllers controllers
+    register: control_plane_logs_controllers
+    ignore_errors: true
+  - debug:
+      msg: "{{ control_plane_logs_controllers.stdout_lines }}"
+  - name: Get etcd logs
+    command: >
+      /usr/local/bin/master-logs etcd etcd
+    register: control_plane_logs_etcd
+    when: inventory_hostname in groups['oo_etcd_to_config']
     ignore_errors: true
   - debug:
-      msg: "{{ control_plane_logs_node.stdout_lines }}"
+      msg: "{{ control_plane_logs_etcd.stdout_lines }}"
+    when: inventory_hostname in groups['oo_etcd_to_config']  # same guard as the etcd log task, whose output is printed here
+  - name: Report control plane errors
+    fail:
+      msg: Control plane pods didn't pass health check
 
-- name: Report control plane errors
-  fail:
-    msg: Control plane install failed.
-  when: control_plane_health.stdout != 'ok'
+# Request /apis/<group>/v1 for every entry of l_core_api_list and retry until
+# the client exits with rc 0. The admin kubeconfig is passed explicitly, as in
+# the other client calls in this file, rather than relying on ambient config.
+- name: Wait for Openshift APIs to register themselves
+  command: >
+    {{ openshift_client_binary }} get --raw /apis/{{ item }}/v1 --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+  register: openshift_apis
+  until: openshift_apis.rc == 0
+  with_items: "{{ l_core_api_list }}"
+  retries: 60
+  delay: 5
+  # Read-only query; never report it as a change.
+  changed_when: false
+
+- name: Remove oc cache to refresh a list of APIs
+  file:  # delete the client cache directory if it exists
+    state: absent
+    path: '~/.kube/cache'