Pārlūkot izejas kodu

Gather debug data on task failure

Adds tasks to gather service status and node details when node Ready
checks fail or when CSR approval fails.

Changed --config to --kubeconfig due to deprecation in `oc`. This was
causing an issue with CSR approval shell code.
Russell Teague 5 gadi atpakaļ
vecāks
revīzija
c829225c9f

+ 1 - 1
images/installer/root/usr/local/bin/generate

@@ -82,7 +82,7 @@ class OpenShiftClient:
         """Execute a remote call using `oc`"""
         cmd = [
             self.oc,
-            '--config',
+            '--kubeconfig',
             self.kubeconfig
         ] + shlex.split(cmd_str)
         try:

+ 24 - 15
roles/openshift_node/tasks/apply_machine_config.yml

@@ -7,7 +7,7 @@
 - name: Get worker machine current config name
   command: >
     oc get node {{ ansible_nodename | lower }}
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=jsonpath='{.metadata.annotations.machineconfiguration\.openshift\.io/desiredConfig}'
   delegate_to: localhost
   register: oc_get
@@ -24,7 +24,7 @@
 - name: Get worker ignition config
   command: >
     oc get machineconfig {{ l_worker_machine_config_name }}
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=json
   delegate_to: localhost
   register: oc_get
@@ -42,7 +42,7 @@
 - name: Get machine-config-operator image
   command: >
     oc get daemonset machine-config-daemon
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --namespace=openshift-machine-config-operator
     --output=jsonpath='{.spec.template.spec.containers[?(@.name=="machine-config-daemon")].image}'
   delegate_to: localhost
@@ -83,15 +83,24 @@
   reboot:
   #  reboot_timeout: 600  # default, 10 minutes
 
-- name: Wait for nodes to report ready
-  command: >
-    oc get node {{ ansible_nodename | lower }}
-    --config={{ openshift_node_kubeconfig_path }}
-    --output=jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
-  delegate_to: localhost
-  run_once: true
-  register: oc_get
-  until:
-  - oc_get.stdout == "True"
-  retries: 36
-  delay: 5
+- block:
+  - name: Wait for nodes to report ready
+    command: >
+      oc get node {{ ansible_nodename | lower }}
+      --kubeconfig={{ openshift_node_kubeconfig_path }}
+      --output=jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
+    delegate_to: localhost
+    register: oc_get
+    until:
+    - oc_get.stdout == "True"
+    retries: 36
+    delay: 5
+    changed_when: false
+
+  rescue:
+  - import_tasks: gather_debug.yml
+
+  - name: DEBUG - Node failed to report ready
+    fail:
+      msg: "Node failed to report ready"
+    delegate_to: localhost

+ 88 - 60
roles/openshift_node/tasks/config.yml

@@ -89,7 +89,7 @@
 - name: Get cluster pull-secret
   command: >
     oc get secret pull-secret
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --namespace=openshift-config
     --output=jsonpath='{.data.\.dockerconfigjson}'
   delegate_to: localhost
@@ -107,7 +107,7 @@
 - name: Get cluster release image
   command: >
     oc get clusterversion
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=jsonpath='{.items[0].status.desired.image}'
   delegate_to: localhost
   register: oc_get
@@ -162,67 +162,95 @@
   - fail:
       msg: "Ignition apply failed"
 
-- name: Approve node-bootstrapper CSR
-  shell: >
-    count=0;
-    for csr in `oc --config={{ openshift_node_kubeconfig_path }} get csr --no-headers \
-      | grep " system:serviceaccount:openshift-machine-config-operator:node-bootstrapper " \
-      | cut -d " " -f1`;
-    do
-      oc --config={{ openshift_node_kubeconfig_path }} describe csr/$csr \
-        | grep " system:node:{{ hostvars[item].ansible_nodename | lower }}$";
-      if [ $? -eq 0 ];
-      then
-        oc --config={{ openshift_node_kubeconfig_path }} adm certificate approve ${csr};
+- block:
+  - name: Approve node-bootstrapper CSR
+    shell: >
+      count=0;
+      for csr in `oc --kubeconfig={{ openshift_node_kubeconfig_path }} get csr --no-headers \
+        | grep " system:serviceaccount:openshift-machine-config-operator:node-bootstrapper " \
+        | cut -d " " -f1`;
+      do
+        oc --kubeconfig={{ openshift_node_kubeconfig_path }} describe csr/$csr \
+          | grep " system:node:{{ hostvars[item].ansible_nodename | lower }}$";
+        if [ $? -eq 0 ];
+        then
+          oc --kubeconfig={{ openshift_node_kubeconfig_path }} adm certificate approve ${csr};
+          if [ $? -eq 0 ];
+          then
+            count=$((count+1));
+          fi;
+        fi;
+      done;
+      exit $((!count));
+    loop: "{{ ansible_play_batch }}"
+    delegate_to: localhost
+    run_once: true
+    register: oc_get
+    until:
+    - oc_get is success
+    retries: 6
+    delay: 5
+
+  rescue:
+  - import_tasks: gather_debug.yml
+
+  - name: DEBUG - Failed to approve node-bootstrapper CSR
+    fail:
+      msg: "Failed to approve node-bootstrapper CSR"
+    delegate_to: localhost
+
+- block:
+  - name: Approve node CSR
+    shell: >
+      count=0;
+      for csr in `oc --kubeconfig={{ openshift_node_kubeconfig_path }} get csr --no-headers \
+        | grep " system:node:{{ hostvars[item].ansible_nodename | lower }} " \
+        | cut -d " " -f1`;
+      do
+        oc --kubeconfig={{ openshift_node_kubeconfig_path }} adm certificate approve ${csr};
         if [ $? -eq 0 ];
         then
           count=$((count+1));
         fi;
-      fi;
-    done;
-    exit $((!count));
-  loop: "{{ ansible_play_batch }}"
-  delegate_to: localhost
-  run_once: true
-  register: oc_get
-  until:
-  - oc_get is success
-  retries: 6
-  delay: 5
+      done;
+      exit $((!count));
+    loop: "{{ ansible_play_batch }}"
+    delegate_to: localhost
+    run_once: true
+    register: oc_get
+    until:
+    - oc_get is success
+    retries: 6
+    delay: 5
 
-- name: Approve node CSR
-  shell: >
-    count=0;
-    for csr in `oc --config={{ openshift_node_kubeconfig_path }} get csr --no-headers \
-      | grep " system:node:{{ hostvars[item].ansible_nodename | lower }} " \
-      | cut -d " " -f1`;
-    do
-      oc --config={{ openshift_node_kubeconfig_path }} adm certificate approve ${csr};
-      if [ $? -eq 0 ];
-      then
-        count=$((count+1));
-      fi;
-    done;
-    exit $((!count));
-  loop: "{{ ansible_play_batch }}"
-  delegate_to: localhost
-  run_once: true
-  register: oc_get
-  until:
-  - oc_get is success
-  retries: 6
-  delay: 5
+  rescue:
+  - import_tasks: gather_debug.yml
 
-- name: Wait for nodes to report ready
-  command: >
-    oc get node {{ hostvars[item].ansible_nodename | lower }}
-    --config={{ openshift_node_kubeconfig_path }}
-    --output=jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
-  loop: "{{ ansible_play_batch }}"
-  delegate_to: localhost
-  run_once: true
-  register: oc_get
-  until:
-  - oc_get.stdout == "True"
-  retries: 36
-  delay: 5
+  - name: DEBUG - Failed to approve node CSR
+    fail:
+      msg: "Failed to approve node CSR"
+    delegate_to: localhost
+
+- block:
+  - name: Wait for nodes to report ready
+    command: >
+      oc get node {{ hostvars[item].ansible_nodename | lower }}
+      --kubeconfig={{ openshift_node_kubeconfig_path }}
+      --output=jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
+    loop: "{{ ansible_play_batch }}"
+    delegate_to: localhost
+    run_once: true
+    register: oc_get
+    until:
+    - oc_get.stdout == "True"
+    retries: 36
+    delay: 5
+    changed_when: false
+
+  rescue:
+  - import_tasks: gather_debug.yml
+
+  - name: DEBUG - Node failed to report ready
+    fail:
+      msg: "Node failed to report ready"
+    delegate_to: localhost

+ 23 - 0
roles/openshift_node/tasks/gather_debug.yml

@@ -0,0 +1,23 @@
+---
+
+- name: Gather Debug - Get service status
+  command: >
+    systemctl status {{ item }}
+  changed_when: false
+  ignore_errors: true
+  register: systemctl_status
+  loop:
+  - cri-o
+  - kubelet
+
+- name: Gather Debug - Get complete node objects
+  command: >
+    oc get node {{ hostvars[item].ansible_nodename | lower }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
+    --output=json
+  loop: "{{ ansible_play_batch }}"
+  delegate_to: localhost
+  run_once: true
+  changed_when: false
+  ignore_errors: true
+  register: oc_get

+ 2 - 2
roles/openshift_node/tasks/install.yml

@@ -12,7 +12,7 @@
 - name: Get cluster version
   command: >
     oc get clusterversion
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=jsonpath='{.items[0].status.desired.version}'
   delegate_to: localhost
   register: oc_get
@@ -32,7 +32,7 @@
 - name: Get kubernetes server version
   command: >
     oc version
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=json
   delegate_to: localhost
   register: oc_get

+ 4 - 4
roles/openshift_node/tasks/proxy.yml

@@ -2,7 +2,7 @@
 - name: Check for cluster http proxy
   command: >
     oc get proxies.config.openshift.io cluster
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=jsonpath='{.status.httpProxy}'
   register: oc_get_http_proxy
   delegate_to: localhost
@@ -15,7 +15,7 @@
 - name: Check for cluster https proxy
   command: >
     oc get proxies.config.openshift.io cluster
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=jsonpath='{.status.httpsProxy}'
   register: oc_get_https_proxy
   delegate_to: localhost
@@ -28,7 +28,7 @@
 - name: Check for cluster no proxy
   command: >
     oc get proxies.config.openshift.io cluster
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=jsonpath='{.status.noProxy}'
   register: oc_get_no_proxy
   delegate_to: localhost
@@ -41,7 +41,7 @@
 - name: Check for additional trust bundle
   command: >
     oc get configmap user-ca-bundle -n openshift-config
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=jsonpath='{.data.ca-bundle\.crt}'
   register: oc_get_additional_trust_bundle
   delegate_to: localhost

+ 1 - 1
roles/openshift_node/tasks/scaleup_checks.yml

@@ -11,7 +11,7 @@
 - name: Get cluster nodes
   command: >
     oc get nodes
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=name
   register: oc_get
   until:

+ 3 - 3
roles/openshift_node/tasks/upgrade.yml

@@ -8,13 +8,13 @@
 - name: Cordon node prior to upgrade
   command: >
     oc adm cordon {{ ansible_nodename | lower }}
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
   delegate_to: localhost
 
 - name: Drain node prior to upgrade
   command: >
     oc adm drain {{ ansible_nodename | lower }}
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --force --delete-local-data --ignore-daemonsets
   delegate_to: localhost
 
@@ -41,7 +41,7 @@
 - name: Uncordon node after upgrade
   command: >
     oc adm uncordon {{ ansible_nodename | lower }}
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
   delegate_to: localhost
 
 # Run the openshift_node_post_upgrade_hook if defined

+ 2 - 2
test/aws/scaleup.yml

@@ -87,13 +87,13 @@
   - name: Mark CoreOS nodes as unschedulable
     command: >
       oc adm cordon {{ item | lower }}
-      --config={{ kubeconfig_path }}
+      --kubeconfig={{ kubeconfig_path }}
     with_items: "{{ pre_scaleup_workers_name }}"
 
   - name: Drain CoreOS nodes
     command: >
       oc adm drain {{ item | lower }}
-      --config={{ kubeconfig_path }}
+      --kubeconfig={{ kubeconfig_path }}
       --force --delete-local-data --ignore-daemonsets
       --timeout=0s
     with_items: "{{ pre_scaleup_workers_name }}"