Pārlūkot izejas kodu

Merge pull request #12099 from mtnbikenc/node-ready-detail

Bug 1779811: Gather debug data on task failure
OpenShift Merge Robot 5 gadi atpakaļ
vecāks
revīzija
9a6380d9b0

+ 1 - 1
images/installer/root/usr/local/bin/generate

@@ -82,7 +82,7 @@ class OpenShiftClient:
         """Execute a remote call using `oc`"""
         cmd = [
             self.oc,
-            '--config',
+            '--kubeconfig',
             self.kubeconfig
         ] + shlex.split(cmd_str)
         try:

+ 1 - 1
roles/openshift_node/defaults/main.yml

@@ -6,7 +6,7 @@ openshift_node_tls_verify: false
 openshift_node_kubeconfig_path: "{{ openshift_kubeconfig_path | default('~/.kube/config') | expanduser | realpath }}"
 openshift_node_kubeconfig: "{{ lookup('file', openshift_node_kubeconfig_path) | from_yaml }}"
 openshift_node_bootstrap_port: 22623
-openshift_node_bootstrap_server: "{{ openshift_node_kubeconfig.clusters.0.cluster.server.split(':')[0:-1] | join(':') }}:{{ openshift_node_bootstrap_port }}"
+openshift_node_bootstrap_server: "{{ openshift_node_kubeconfig.clusters.0.cluster.server.split(':')[0:-1] | join(':') | regex_replace('://api(-int)?', '://api-int') }}:{{ openshift_node_bootstrap_port }}"  # port stripped from the kubeconfig server URL; api and api-int both map to api-int (never api-int-int)
 openshift_node_bootstrap_endpoint: "{{ openshift_node_bootstrap_server }}/config/{{ openshift_node_machineconfigpool }}"
 
 openshift_node_packages:

+ 24 - 15
roles/openshift_node/tasks/apply_machine_config.yml

@@ -7,7 +7,7 @@
 - name: Get worker machine current config name
   command: >
     oc get node {{ ansible_nodename | lower }}
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=jsonpath='{.metadata.annotations.machineconfiguration\.openshift\.io/desiredConfig}'
   delegate_to: localhost
   register: oc_get
@@ -24,7 +24,7 @@
 - name: Get worker ignition config
   command: >
     oc get machineconfig {{ l_worker_machine_config_name }}
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=json
   delegate_to: localhost
   register: oc_get
@@ -42,7 +42,7 @@
 - name: Get machine-config-operator image
   command: >
     oc get daemonset machine-config-daemon
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --namespace=openshift-machine-config-operator
     --output=jsonpath='{.spec.template.spec.containers[?(@.name=="machine-config-daemon")].image}'
   delegate_to: localhost
@@ -83,15 +83,24 @@
   reboot:
   #  reboot_timeout: 600  # default, 10 minutes
 
-- name: Wait for nodes to report ready
-  command: >
-    oc get node {{ ansible_nodename | lower }}
-    --config={{ openshift_node_kubeconfig_path }}
-    --output=jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
-  delegate_to: localhost
-  run_once: true
-  register: oc_get
-  until:
-  - oc_get.stdout == "True"
-  retries: 36
-  delay: 5
+- block:  # poll this node's Ready condition; on failure the rescue gathers diagnostics
+  - name: Wait for nodes to report ready
+    command: >
+      oc get node {{ ansible_nodename | lower }}
+      --kubeconfig={{ openshift_node_kubeconfig_path }}
+      --output=jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
+    delegate_to: localhost  # run oc on the Ansible control host rather than on the node
+    register: oc_get
+    until:  # retry until the Ready condition status prints "True"
+    - oc_get.stdout == "True"
+    retries: 36  # together with delay: up to 36 attempts, 5 seconds apart
+    delay: 5
+    changed_when: false  # read-only query; never report a change
+
+  rescue:  # runs only when a task in the block above fails
+  - import_tasks: gather_debug.yml
+
+  - name: DEBUG - Node failed to report ready
+    fail:  # fail explicitly; a rescue that completes would otherwise clear the failure
+      msg: "Node failed to report ready"
+    delegate_to: localhost

+ 88 - 60
roles/openshift_node/tasks/config.yml

@@ -89,7 +89,7 @@
 - name: Get cluster pull-secret
   command: >
     oc get secret pull-secret
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --namespace=openshift-config
     --output=jsonpath='{.data.\.dockerconfigjson}'
   delegate_to: localhost
@@ -107,7 +107,7 @@
 - name: Get cluster release image
   command: >
     oc get clusterversion
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=jsonpath='{.items[0].status.desired.image}'
   delegate_to: localhost
   register: oc_get
@@ -162,67 +162,95 @@
   - fail:
       msg: "Ignition apply failed"
 
-- name: Approve node-bootstrapper CSR
-  shell: >
-    count=0;
-    for csr in `oc --config={{ openshift_node_kubeconfig_path }} get csr --no-headers \
-      | grep " system:serviceaccount:openshift-machine-config-operator:node-bootstrapper " \
-      | cut -d " " -f1`;
-    do
-      oc --config={{ openshift_node_kubeconfig_path }} describe csr/$csr \
-        | grep " system:node:{{ hostvars[item].ansible_nodename | lower }}$";
-      if [ $? -eq 0 ];
-      then
-        oc --config={{ openshift_node_kubeconfig_path }} adm certificate approve ${csr};
+- block:  # approve bootstrap CSRs for the batch; on failure the rescue gathers diagnostics
+  - name: Approve node-bootstrapper CSR  # shell exits 0 only if at least one matching CSR was approved
+    shell: >
+      count=0;
+      for csr in `oc --kubeconfig={{ openshift_node_kubeconfig_path }} get csr --no-headers \
+        | grep " system:serviceaccount:openshift-machine-config-operator:node-bootstrapper " \
+        | cut -d " " -f1`;
+      do
+        oc --kubeconfig={{ openshift_node_kubeconfig_path }} describe csr/$csr \
+          | grep " system:node:{{ hostvars[item].ansible_nodename | lower }}$";
+        if [ $? -eq 0 ];
+        then
+          oc --kubeconfig={{ openshift_node_kubeconfig_path }} adm certificate approve ${csr};
+          if [ $? -eq 0 ];
+          then
+            count=$((count+1));
+          fi;
+        fi;
+      done;
+      exit $((!count));
+    loop: "{{ ansible_play_batch }}"  # one iteration per host in the current play batch
+    delegate_to: localhost  # run oc on the Ansible control host
+    run_once: true  # execute the loop once for the batch, not once per host
+    register: oc_get
+    until:  # retry each item until the shell snippet exits successfully
+    - oc_get is success
+    retries: 6  # together with delay: up to 6 attempts, 5 seconds apart
+    delay: 5
+
+  rescue:  # runs only when a task in the block above fails
+  - import_tasks: gather_debug.yml
+
+  - name: DEBUG - Failed to approve node-bootstrapper CSR
+    fail:  # fail explicitly; a rescue that completes would otherwise clear the failure
+      msg: "Failed to approve node-bootstrapper CSR"
+    delegate_to: localhost
+
+- block:
+  - name: Approve node CSR
+    shell: >
+      count=0;
+      for csr in `oc --kubeconfig={{ openshift_node_kubeconfig_path }} get csr --no-headers \
+        | grep " system:node:{{ hostvars[item].ansible_nodename | lower }} " \
+        | cut -d " " -f1`;
+      do
+        oc --kubeconfig={{ openshift_node_kubeconfig_path }} adm certificate approve ${csr};
         if [ $? -eq 0 ];
         then
           count=$((count+1));
         fi;
-      fi;
-    done;
-    exit $((!count));
-  loop: "{{ ansible_play_batch }}"
-  delegate_to: localhost
-  run_once: true
-  register: oc_get
-  until:
-  - oc_get is success
-  retries: 6
-  delay: 5
+      done;
+      exit $((!count));
+    loop: "{{ ansible_play_batch }}"
+    delegate_to: localhost
+    run_once: true
+    register: oc_get
+    until:
+    - oc_get is success
+    retries: 6
+    delay: 5
 
-- name: Approve node CSR
-  shell: >
-    count=0;
-    for csr in `oc --config={{ openshift_node_kubeconfig_path }} get csr --no-headers \
-      | grep " system:node:{{ hostvars[item].ansible_nodename | lower }} " \
-      | cut -d " " -f1`;
-    do
-      oc --config={{ openshift_node_kubeconfig_path }} adm certificate approve ${csr};
-      if [ $? -eq 0 ];
-      then
-        count=$((count+1));
-      fi;
-    done;
-    exit $((!count));
-  loop: "{{ ansible_play_batch }}"
-  delegate_to: localhost
-  run_once: true
-  register: oc_get
-  until:
-  - oc_get is success
-  retries: 6
-  delay: 5
+  rescue:
+  - import_tasks: gather_debug.yml
 
-- name: Wait for nodes to report ready
-  command: >
-    oc get node {{ hostvars[item].ansible_nodename | lower }}
-    --config={{ openshift_node_kubeconfig_path }}
-    --output=jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
-  loop: "{{ ansible_play_batch }}"
-  delegate_to: localhost
-  run_once: true
-  register: oc_get
-  until:
-  - oc_get.stdout == "True"
-  retries: 36
-  delay: 5
+  - name: DEBUG - Failed to approve node CSR
+    fail:
+      msg: "Failed to approve node CSR"
+    delegate_to: localhost
+
+- block:  # poll the Ready condition of every node in the batch; rescue gathers diagnostics
+  - name: Wait for nodes to report ready
+    command: >
+      oc get node {{ hostvars[item].ansible_nodename | lower }}
+      --kubeconfig={{ openshift_node_kubeconfig_path }}
+      --output=jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
+    loop: "{{ ansible_play_batch }}"  # one iteration per host in the current play batch
+    delegate_to: localhost  # run oc on the Ansible control host
+    run_once: true  # execute the loop once for the batch, not once per host
+    register: oc_get
+    until:  # retry each item until the Ready condition status prints "True"
+    - oc_get.stdout == "True"
+    retries: 36  # together with delay: up to 36 attempts, 5 seconds apart
+    delay: 5
+    changed_when: false  # read-only query; never report a change
+
+  rescue:  # runs only when a task in the block above fails
+  - import_tasks: gather_debug.yml
+
+  - name: DEBUG - Node failed to report ready
+    fail:  # fail explicitly; a rescue that completes would otherwise clear the failure
+      msg: "Node failed to report ready"
+    delegate_to: localhost

+ 23 - 0
roles/openshift_node/tasks/gather_debug.yml

@@ -0,0 +1,23 @@
+---
+
+- name: Gather Debug - Get service status  # no delegate_to: runs on the inventory host itself
+  command: >
+    systemctl status {{ item }}
+  changed_when: false  # read-only query; never report a change
+  ignore_errors: true  # best effort: a failing status command must not stop debug collection
+  register: systemctl_status
+  loop:  # services whose status is captured
+  - cri-o
+  - kubelet
+
+- name: Gather Debug - Get complete node objects  # dumps the full node JSON for each host in the batch
+  command: >
+    oc get node {{ hostvars[item].ansible_nodename | lower }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
+    --output=json
+  loop: "{{ ansible_play_batch }}"  # one iteration per host in the current play batch
+  delegate_to: localhost  # run oc on the Ansible control host
+  run_once: true  # execute the loop once for the batch, not once per host
+  changed_when: false  # read-only query; never report a change
+  ignore_errors: true  # best effort: a missing node object must not stop debug collection
+  register: oc_get

+ 2 - 2
roles/openshift_node/tasks/install.yml

@@ -12,7 +12,7 @@
 - name: Get cluster version
   command: >
     oc get clusterversion
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=jsonpath='{.items[0].status.desired.version}'
   delegate_to: localhost
   register: oc_get
@@ -32,7 +32,7 @@
 - name: Get kubernetes server version
   command: >
     oc version
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=json
   delegate_to: localhost
   register: oc_get

+ 4 - 4
roles/openshift_node/tasks/proxy.yml

@@ -2,7 +2,7 @@
 - name: Check for cluster http proxy
   command: >
     oc get proxies.config.openshift.io cluster
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=jsonpath='{.status.httpProxy}'
   register: oc_get_http_proxy
   delegate_to: localhost
@@ -15,7 +15,7 @@
 - name: Check for cluster https proxy
   command: >
     oc get proxies.config.openshift.io cluster
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=jsonpath='{.status.httpsProxy}'
   register: oc_get_https_proxy
   delegate_to: localhost
@@ -28,7 +28,7 @@
 - name: Check for cluster no proxy
   command: >
     oc get proxies.config.openshift.io cluster
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=jsonpath='{.status.noProxy}'
   register: oc_get_no_proxy
   delegate_to: localhost
@@ -41,7 +41,7 @@
 - name: Check for additional trust bundle
   command: >
     oc get configmap user-ca-bundle -n openshift-config
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=jsonpath='{.data.ca-bundle\.crt}'
   register: oc_get_additional_trust_bundle
   delegate_to: localhost

+ 1 - 1
roles/openshift_node/tasks/scaleup_checks.yml

@@ -11,7 +11,7 @@
 - name: Get cluster nodes
   command: >
     oc get nodes
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --output=name
   register: oc_get
   until:

+ 3 - 3
roles/openshift_node/tasks/upgrade.yml

@@ -8,13 +8,13 @@
 - name: Cordon node prior to upgrade
   command: >
     oc adm cordon {{ ansible_nodename | lower }}
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
   delegate_to: localhost
 
 - name: Drain node prior to upgrade
   command: >
     oc adm drain {{ ansible_nodename | lower }}
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
     --force --delete-local-data --ignore-daemonsets
   delegate_to: localhost
 
@@ -41,7 +41,7 @@
 - name: Uncordon node after upgrade
   command: >
     oc adm uncordon {{ ansible_nodename | lower }}
-    --config={{ openshift_node_kubeconfig_path }}
+    --kubeconfig={{ openshift_node_kubeconfig_path }}
   delegate_to: localhost
 
 # Run the openshift_node_post_upgrade_hook if defined

+ 45 - 23
test/aws/create_machineset.yml

@@ -12,7 +12,6 @@
         name: "{{ machineset_name }}"
         resourceVersion: ""
       spec:
-        replicas: 1
         selector:
           matchLabels:
             machine.openshift.io/cluster-api-machineset: "{{ machineset_name }}"
@@ -33,29 +32,51 @@
     definition: "{{ machineset | to_yaml }}"
 
 - block:
-  - name: Wait for machine to be created
-    k8s_facts:
-      api_version: machine.openshift.io/v1beta1
-      kubeconfig: "{{ kubeconfig_path }}"
-      namespace: openshift-machine-api
-      kind: Machine
-      label_selectors:
-      - "machine.openshift.io/cluster-api-machineset={{ machineset_name }}"
+  - name: Get machines in the machineset
+    command: >
+      oc get machine
+      --kubeconfig={{ kubeconfig_path }}
+      --namespace=openshift-machine-api
+      --selector='machine.openshift.io/cluster-api-machineset={{ machineset_name }}'
+      --output=json
+    register: oc_get_machine
+    changed_when: false
+
+  - name: Create list of machines
+    set_fact:
+      worker_machines: "{{ (oc_get_machine.stdout | from_json)['items'] | map(attribute='metadata.name') | list }}"
+
+  - name: Wait for machines to be provisioned
+    command: >
+      oc get machine {{ item }}
+      --kubeconfig={{ kubeconfig_path }}
+      --namespace=openshift-machine-api
+      --output=json
+    loop: "{{ worker_machines }}"
     register: new_machine
+    until:
+    - new_machine.stdout != ''
+    - (new_machine.stdout | from_json).status is defined
+    - (new_machine.stdout | from_json).status.phase == 'Provisioned'
     retries: 36
     delay: 5
-    until:
-    - new_machine.resources is defined
-    - new_machine.resources | length > 0
-    - new_machine.resources[0].status is defined
-    - new_machine.resources[0].status.providerStatus is defined
-    - new_machine.resources[0].status.providerStatus.instanceState is defined
-    - new_machine.resources[0].status.providerStatus.instanceState == 'running'
-    failed_when:
-    - new_machine.resources is defined
-    - new_machine.resources | length > 0
-    - new_machine.resources[0].status is defined
-    - new_machine.resources[0].status.phase == 'Failed'
+    changed_when: false
+
+  - name: Get machines in the machineset after provisioning
+    command: >
+      oc get machine
+      --kubeconfig={{ kubeconfig_path }}
+      --namespace=openshift-machine-api
+      --selector='machine.openshift.io/cluster-api-machineset={{ machineset_name }}'
+      --output=json
+    register: oc_get_machine
+    changed_when: false
+
+  - name: Add hostname to new_workers_list
+    set_fact:
+      new_workers_list: "{{ new_workers_list + [ item.status.addresses | selectattr('type', 'match', '^InternalDNS$') | map(attribute='address') | first ] }}"
+    loop: "{{ (oc_get_machine.stdout | from_json)['items'] }}"
+
   rescue:
   - name: Machine creation failed
     fail:
@@ -63,7 +84,8 @@
 
 - name: Add machine to the inventory
   add_host:
-    name: "{{ new_machine.resources[0].status.addresses | selectattr('type', 'match', '^InternalIP$') | map(attribute='address') | first }}"
-    node_name: "{{ new_machine.resources[0].status.addresses | selectattr('type', 'match', '^InternalDNS$') | map(attribute='address') | first }}"
+    name: "{{ item }}"
+    node_name: "{{ item }}"
     groups: new_workers
     ansible_ssh_common_args: "-o ProxyCommand=\"ssh -o IdentityFile='{{ openshift_aws_scaleup_key_path | default('/opt/app-root/src/.ssh/id_rsa') }}' -o ConnectTimeout=30 -o ConnectionAttempts=100 -o StrictHostKeyChecking=no -W %h:%p -q core@{{ ssh_bastion }}\""
+  loop: "{{ new_workers_list }}"

+ 23 - 4
test/aws/scaleup.yml

@@ -2,6 +2,9 @@
 - name: create new nodes
   hosts: localhost
   connection: local
+  vars:
+    new_workers_list: []
+
   tasks:
   - import_tasks: ssh_bastion.yml
 
@@ -87,13 +90,13 @@
   - name: Mark CoreOS nodes as unschedulable
     command: >
       oc adm cordon {{ item | lower }}
-      --config={{ kubeconfig_path }}
+      --kubeconfig={{ kubeconfig_path }}
     with_items: "{{ pre_scaleup_workers_name }}"
 
   - name: Drain CoreOS nodes
     command: >
       oc adm drain {{ item | lower }}
-      --config={{ kubeconfig_path }}
+      --kubeconfig={{ kubeconfig_path }}
       --force --delete-local-data --ignore-daemonsets
       --timeout=0s
     with_items: "{{ pre_scaleup_workers_name }}"
@@ -108,5 +111,21 @@
       state: absent
     with_items: "{{ pre_scaleup_machineset_names }}"
 
-  - name: Wait for worker configs to roll out
-    command: oc wait machineconfigpool/worker --for=condition=Updated --timeout=5m
+  - block:  # wait for the worker pool rollout; on failure the rescue dumps the pool state
+    - name: Wait for worker configs to roll out  # oc wait gives up after the 5m timeout below
+      command: >
+        oc wait machineconfigpool/worker
+        --kubeconfig={{ kubeconfig_path }}
+        --for=condition=Updated
+        --timeout=5m
+
+    rescue:  # runs only when the wait above fails
+    - name: DEBUG - Get worker machine config pool  # prints the pool as JSON for troubleshooting
+      command: >
+        oc get machineconfigpool/worker
+        --kubeconfig={{ kubeconfig_path }}
+        --output=json
+
+    - name: DEBUG - Worker config rollout failed
+      fail:  # fail explicitly; a rescue that completes would otherwise clear the failure
+        msg: "Worker config rollout failed"