
Add the ability to specify a timeout for node drain operations

Scott Dodson, 7 years ago
commit 0841917f05

+ 8 - 0
inventory/hosts.example

@@ -991,6 +991,14 @@ openshift_master_identity_providers=[{'name': 'htpasswd_auth', 'login': 'true',
 # whereas this would not
 # openshift_upgrade_nodes_serial=4 openshift_upgrade_nodes_max_fail_percentage=50
 #
+# A timeout to wait for nodes to drain pods can be specified to ensure that the
+# upgrade continues even if nodes fail to drain pods in the allowed time. The
+# default value of 0 will wait indefinitely, allowing the admin to investigate
+# the root cause and ensuring that disruption budgets are respected. If a
+# timeout of 0 is used, there will also be one attempt to retry draining the
+# node. If a non-zero timeout is specified, there will be no retry attempt.
+#openshift_upgrade_nodes_drain_timeout=0
+#
 # Multiple data migrations take place and if they fail they will fail the upgrade
 # You may wish to disable these or make them non fatal
 #
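
For example, to cap each node drain at ten minutes instead of waiting indefinitely, the inventory would set (the value is illustrative, not a recommendation):

openshift_upgrade_nodes_drain_timeout=600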

+ 9 - 3
playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml

@@ -51,13 +51,19 @@
 
   - name: Drain Node for Kubelet upgrade
     command: >
-      {{ openshift_client_binary }} adm drain {{ openshift.node.nodename }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig --force --delete-local-data --ignore-daemonsets
+      {{ hostvars[groups.oo_first_master.0]['first_master_client_binary'] }} adm drain {{ openshift.node.nodename | lower }}
+      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+      --force --delete-local-data --ignore-daemonsets
+      --timeout={{ openshift_upgrade_nodes_drain_timeout | default(0) }}s
     delegate_to: "{{ groups.oo_first_master.0 }}"
     when: l_docker_upgrade is defined and l_docker_upgrade | bool and inventory_hostname in groups.oo_nodes_to_upgrade
     register: l_docker_upgrade_drain_result
     until: not (l_docker_upgrade_drain_result is failed)
-    retries: 60
-    delay: 60
+    retries: "{{ 1 if ( openshift_upgrade_nodes_drain_timeout | default(0) | int ) == 0 else 0 }}"
+    delay: 5
+    failed_when:
+    - l_docker_upgrade_drain_result is failed
+    - openshift_upgrade_nodes_drain_timeout | default(0) | int == 0
 
   - include_tasks: tasks/upgrade.yml
     when: l_docker_upgrade is defined and l_docker_upgrade | bool
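
With the timeout wired into the command, a non-zero setting renders roughly like the following on the first master (the node name is hypothetical, and `oc` plus `/etc/origin` are assumed for the client binary and config_base, which vary by deployment):

oc adm drain node1.example.com --config=/etc/origin/master/admin.kubeconfig --force --delete-local-data --ignore-daemonsets --timeout=600s

If the drain does not finish within 600 seconds the command exits non-zero, but the failed_when guard above keeps the task from failing, so the upgrade proceeds; with the default of 0 the command waits indefinitely and a failure remains fatal.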

+ 9 - 3
playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml

@@ -291,12 +291,18 @@
 
   - name: Drain Node for Kubelet upgrade
     command: >
-      {{ hostvars[groups.oo_first_master.0]['first_master_client_binary'] }} adm drain {{ openshift.node.nodename | lower }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig --force --delete-local-data --ignore-daemonsets
+      {{ hostvars[groups.oo_first_master.0]['first_master_client_binary'] }} adm drain {{ openshift.node.nodename | lower }}
+      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+      --force --delete-local-data --ignore-daemonsets
+      --timeout={{ openshift_upgrade_nodes_drain_timeout | default(0) }}s
     delegate_to: "{{ groups.oo_first_master.0 }}"
     register: l_upgrade_control_plane_drain_result
     until: not (l_upgrade_control_plane_drain_result is failed)
-    retries: 60
-    delay: 60
+    retries: "{{ 1 if ( openshift_upgrade_nodes_drain_timeout | default(0) | int ) == 0 else 0 }}"
+    delay: 5
+    failed_when:
+    - l_upgrade_control_plane_drain_result is failed
+    - openshift_upgrade_nodes_drain_timeout | default(0) | int == 0
 
   roles:
   - openshift_facts

+ 9 - 3
playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml

@@ -35,12 +35,18 @@
 
   - name: Drain Node for Kubelet upgrade
     command: >
-      {{ hostvars[groups.oo_first_master.0]['first_master_client_binary'] }} adm drain {{ openshift.node.nodename | lower }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig --force --delete-local-data --ignore-daemonsets
+      {{ hostvars[groups.oo_first_master.0]['first_master_client_binary'] }} adm drain {{ openshift.node.nodename | lower }}
+      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+      --force --delete-local-data --ignore-daemonsets
+      --timeout={{ openshift_upgrade_nodes_drain_timeout | default(0) }}s
     delegate_to: "{{ groups.oo_first_master.0 }}"
     register: l_upgrade_nodes_drain_result
     until: not (l_upgrade_nodes_drain_result is failed)
-    retries: 60
-    delay: 60
+    retries: "{{ 1 if ( openshift_upgrade_nodes_drain_timeout | default(0) | int ) == 0 else 0 }}"
+    delay: 5
+    failed_when:
+    - l_upgrade_nodes_drain_result is failed
+    - openshift_upgrade_nodes_drain_timeout | default(0) | int == 0
 
   post_tasks:
   - import_role:

+ 2 - 2
playbooks/common/openshift-cluster/upgrades/upgrade_scale_group.yml

@@ -50,11 +50,11 @@
     delegate_to: "{{ groups.oo_first_master.0 }}"
     register: l_upgrade_nodes_drain_result
     until: not (l_upgrade_nodes_drain_result is failed)
-    retries: "{{ 1 if openshift_upgrade_nodes_drain_timeout | default(0) == '0' else 0  | int }}"
+    retries: "{{ 1 if ( openshift_upgrade_nodes_drain_timeout | default(0) | int ) == 0 else 0 }}"
     delay: 5
     failed_when:
     - l_upgrade_nodes_drain_result is failed
-    - openshift_upgrade_nodes_drain_timeout | default(0) == '0'
+    - openshift_upgrade_nodes_drain_timeout | default(0) | int == 0
 
 # Alright, let's clean up!
 - name: clean up the old scale group
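
This last hunk also corrects the original expression, which compared the result of default(0) (an integer when the variable is unset) against the string '0', so the default case never matched. Casting with | int before the comparison makes both branches behave as documented; a rough sketch of how the corrected logic evaluates (values are illustrative):

# openshift_upgrade_nodes_drain_timeout unset or 0 -> (0 | int) == 0 is true   -> retries: 1, a failed drain is fatal
# openshift_upgrade_nodes_drain_timeout=600        -> (600 | int) == 0 is false -> retries: 0, a failed drain is ignored and the upgrade continues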