Jelajahi Sumber

Merge pull request #4698 from sdodson/service-retries

Service retries
Scott Dodson 7 tahun lalu
induk
komit
8e2be54c6f

+ 4 - 0
playbooks/adhoc/uninstall.yml

@@ -325,6 +325,10 @@
     service: name=docker state=restarted
     failed_when: false
     when: not (container_engine | changed)
+    register: l_docker_restart_docker_in_pb_result
+    until: not l_docker_restart_docker_in_pb_result | failed
+    retries: 3
+    delay: 30
 
 - hosts: masters
   become: yes

+ 4 - 0
playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml

@@ -55,6 +55,10 @@
       {{ openshift.common.admin_binary }} drain {{ openshift.node.nodename }} --force --delete-local-data --ignore-daemonsets
     delegate_to: "{{ groups.oo_first_master.0 }}"
     when: l_docker_upgrade is defined and l_docker_upgrade | bool and inventory_hostname in groups.oo_nodes_to_upgrade
+    register: l_docker_upgrade_drain_result
+    until: not l_docker_upgrade_drain_result | failed
+    retries: 60
+    delay: 60
 
   - include: upgrade.yml
     when: l_docker_upgrade is defined and l_docker_upgrade | bool

+ 4 - 0
playbooks/common/openshift-cluster/upgrades/docker/restart.yml

@@ -1,6 +1,10 @@
 ---
 - name: Restart docker
   service: name=docker state=restarted
+  register: l_docker_restart_docker_in_upgrade_result
+  until: not l_docker_restart_docker_in_upgrade_result | failed
+  retries: 3
+  delay: 30
 
 - name: Update docker facts
   openshift_facts:

+ 7 - 1
playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml

@@ -32,7 +32,13 @@
 - debug: var=docker_image_count.stdout
   when: docker_upgrade_nuke_images is defined and docker_upgrade_nuke_images | bool
 
-- service: name=docker state=stopped
+- service:
+    name: docker
+    state: stopped
+  register: l_pb_docker_upgrade_stop_result
+  until: not l_pb_docker_upgrade_stop_result | failed
+  retries: 3
+  delay: 30
 
 - name: Upgrade Docker
   package: name=docker{{ '-' + docker_version }} state=present

+ 4 - 0
playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml

@@ -296,6 +296,10 @@
     command: >
       {{ hostvars[groups.oo_first_master.0].openshift.common.admin_binary }} drain {{ openshift.node.nodename | lower }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig --force --delete-local-data --ignore-daemonsets
     delegate_to: "{{ groups.oo_first_master.0 }}"
+    register: l_upgrade_control_plane_drain_result
+    until: not l_upgrade_control_plane_drain_result | failed
+    retries: 60
+    delay: 60
 
   roles:
   - lib_openshift

+ 4 - 0
playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml

@@ -28,6 +28,10 @@
     command: >
       {{ hostvars[groups.oo_first_master.0].openshift.common.admin_binary }} drain {{ openshift.node.nodename | lower }} --force --delete-local-data --ignore-daemonsets
     delegate_to: "{{ groups.oo_first_master.0 }}"
+    register: l_upgrade_nodes_drain_result
+    until: not l_upgrade_nodes_drain_result | failed
+    retries: 60
+    delay: 60
 
   roles:
   - lib_openshift

+ 4 - 0
playbooks/common/openshift-node/restart.yml

@@ -11,6 +11,10 @@
     service:
       name: docker
       state: restarted
+    register: l_docker_restart_docker_in_node_result
+    until: not l_docker_restart_docker_in_node_result | failed
+    retries: 3
+    delay: 30
 
   - name: Update docker facts
     openshift_facts:

+ 4 - 0
roles/calico/handlers/main.yml

@@ -8,3 +8,7 @@
   systemd:
     name: "{{ openshift.docker.service_name }}"
     state: restarted
+  register: l_docker_restart_docker_in_calico_result
+  until: not l_docker_restart_docker_in_calico_result | failed
+  retries: 3
+  delay: 30

+ 4 - 0
roles/contiv/tasks/netplugin.yml

@@ -108,6 +108,10 @@
     name: "{{ openshift.docker.service_name }}"
     state: restarted
   when: docker_updated|changed
+  register: l_docker_restart_docker_in_contiv_result
+  until: not l_docker_restart_docker_in_contiv_result | failed
+  retries: 3
+  delay: 30
 
 - name: Netplugin | Enable Netplugin
   service:

+ 1 - 2
roles/docker/handlers/main.yml

@@ -6,9 +6,8 @@
     state: restarted
   register: r_docker_restart_docker_result
   until: not r_docker_restart_docker_result | failed
-  retries: 1
+  retries: 3
   delay: 30
-
   when: not docker_service_status_changed | default(false) | bool
 
 - name: restart udev

+ 5 - 2
roles/docker/tasks/package_docker.yml

@@ -123,9 +123,12 @@
     enabled: yes
     state: started
     daemon_reload: yes
-  register: start_result
+  register: r_docker_package_docker_start_result
+  until: not r_docker_package_docker_start_result | failed
+  retries: 3
+  delay: 30
 
 - set_fact:
-    docker_service_status_changed: start_result | changed
+    docker_service_status_changed: "{{ r_docker_package_docker_start_result | changed }}"
 
 - meta: flush_handlers

+ 10 - 2
roles/docker/tasks/systemcontainer_docker.yml

@@ -46,6 +46,11 @@
     state: stopped
     daemon_reload: yes
   ignore_errors: True
+  register: r_docker_systemcontainer_docker_stop_result
+  until: not r_docker_systemcontainer_docker_stop_result | failed
+  retries: 3
+  delay: 30
+
 
 # Set http_proxy, https_proxy, and no_proxy in /etc/atomic.conf
 # regexp: the line starts with or without #, followed by the string
@@ -160,9 +165,12 @@
     enabled: yes
     state: started
     daemon_reload: yes
-  register: start_result
+  register: r_docker_systemcontainer_docker_start_result
+  until: not r_docker_systemcontainer_docker_start_result | failed
+  retries: 3
+  delay: 30
 
 - set_fact:
-    docker_service_status_changed: start_result | changed
+    docker_service_status_changed: "{{ r_docker_systemcontainer_docker_start_result | changed }}"
 
 - meta: flush_handlers

+ 4 - 0
roles/flannel/handlers/main.yml

@@ -8,3 +8,7 @@
   systemd:
     name: "{{ openshift.docker.service_name }}"
     state: restarted
+  register: l_docker_restart_docker_in_flannel_result
+  until: not l_docker_restart_docker_in_flannel_result | failed
+  retries: 3
+  delay: 30

+ 12 - 1
roles/openshift_node/handlers/main.yml

@@ -4,9 +4,14 @@
     name: openvswitch
     state: restarted
   when: (not skip_node_svc_handlers | default(False) | bool) and not (ovs_service_status_changed | default(false) | bool) and openshift.common.use_openshift_sdn | bool
+  register: l_openshift_node_stop_openvswitch_result
+  until: not l_openshift_node_stop_openvswitch_result | failed
+  retries: 3
+  delay: 30
   notify:
   - restart openvswitch pause
 
+
 - name: restart openvswitch pause
   pause: seconds=15
   when: (not skip_node_svc_handlers | default(False) | bool) and openshift.common.is_containerized | bool
@@ -15,7 +20,13 @@
   systemd:
     name: "{{ openshift.common.service_type }}-node"
     state: restarted
-  when: (not skip_node_svc_handlers | default(False) | bool) and not (node_service_status_changed | default(false) | bool)
+  register: l_openshift_node_restart_node_result
+  until: not l_openshift_node_restart_node_result | failed
+  retries: 3
+  delay: 30
+  when:
+  - (not skip_node_svc_handlers | default(False) | bool)
+  - not (node_service_status_changed | default(false) | bool)
 
 - name: reload sysctl.conf
   command: /sbin/sysctl -p

+ 16 - 0
roles/openshift_node/tasks/main.yml

@@ -118,8 +118,12 @@
     name: openvswitch.service
     enabled: yes
     state: started
+    daemon_reload: yes
   when: openshift.common.is_containerized | bool and openshift.common.use_openshift_sdn | bool
   register: ovs_start_result
+  until: not ovs_start_result | failed
+  retries: 3
+  delay: 30
 
 - set_fact:
     ovs_service_status_changed: "{{ ovs_start_result | changed }}"
@@ -212,15 +216,27 @@
     state: started
   when: openshift.common.is_containerized | bool
 
+
 - name: Start and enable node
   systemd:
     name: "{{ openshift.common.service_type }}-node"
     enabled: yes
     state: started
+    daemon_reload: yes
   register: node_start_result
   until: not node_start_result | failed
   retries: 1
   delay: 30
+  ignore_errors: true
+
+- name: Dump logs from node service if it failed
+  command: journalctl --no-pager -n 100 {{ openshift.common.service_type }}-node
+  when: node_start_result | failed
+
+- name: Abort if node failed to start
+  fail:
+    msg: Node failed to start please inspect the logs and try again
+  when: node_start_result | failed
 
 - set_fact:
     node_service_status_changed: "{{ node_start_result | changed }}"

+ 4 - 0
roles/openshift_node_certificates/handlers/main.yml

@@ -9,3 +9,7 @@
     name: "{{ openshift.docker.service_name }}"
     state: restarted
   when: not openshift_certificates_redeploy | default(false) | bool
+  register: l_docker_restart_docker_in_cert_result
+  until: not l_docker_restart_docker_in_cert_result | failed
+  retries: 3
+  delay: 30

+ 5 - 0
roles/openshift_node_upgrade/README.md

@@ -84,6 +84,11 @@ Including an example of how to use your role (for instance, with variables passe
     command: >
       {{ hostvars[groups.oo_first_master.0].openshift.common.admin_binary }} drain {{ openshift.node.nodename | lower }} --force --delete-local-data --ignore-daemonsets
     delegate_to: "{{ groups.oo_first_master.0 }}"
+    register: l_docker_upgrade_drain_result
+    until: not l_docker_upgrade_drain_result | failed
+    retries: 60
+    delay: 60
+
 
   roles:
   - openshift_facts

+ 17 - 3
roles/openshift_node_upgrade/handlers/main.yml

@@ -1,7 +1,13 @@
 ---
 - name: restart openvswitch
-  systemd: name=openvswitch state=restarted
+  systemd:
+    name: openvswitch
+    state: restarted
   when: (not skip_node_svc_handlers | default(False) | bool) and not (ovs_service_status_changed | default(false) | bool) and openshift.common.use_openshift_sdn | bool
+  register: l_openshift_node_upgrade_stop_openvswitch_result
+  until: not l_openshift_node_upgrade_stop_openvswitch_result | failed
+  retries: 3
+  delay: 30
   notify:
   - restart openvswitch pause
 
@@ -10,5 +16,13 @@
   when: (not skip_node_svc_handlers | default(False) | bool) and openshift.common.is_containerized | bool
 
 - name: restart node
-  systemd: name={{ openshift.common.service_type }}-node state=restarted
-  when: (not skip_node_svc_handlers | default(False) | bool) and not (node_service_status_changed | default(false) | bool)
+  systemd:
+    name: "{{ openshift.common.service_type }}-node"
+    state: restarted
+  register: l_openshift_node_upgrade_restart_node_result
+  until: not l_openshift_node_upgrade_restart_node_result | failed
+  retries: 3
+  delay: 30
+  when:
+  - (not skip_node_svc_handlers | default(False) | bool)
+  - not (node_service_status_changed | default(false) | bool)

+ 7 - 1
roles/openshift_node_upgrade/tasks/docker/upgrade.yml

@@ -26,7 +26,13 @@
 - debug: var=docker_image_count.stdout
   when: docker_upgrade_nuke_images is defined and docker_upgrade_nuke_images | bool
 
-- service: name=docker state=stopped
+- service:
+    name: docker
+    state: stopped
+  register: l_openshift_node_upgrade_docker_stop_result
+  until: not l_openshift_node_upgrade_docker_stop_result | failed
+  retries: 3
+  delay: 30
 
 - name: Upgrade Docker
   package: name=docker{{ '-' + docker_version }} state=present

+ 1 - 1
roles/openshift_node_upgrade/tasks/restart.yml

@@ -19,7 +19,7 @@
     state: started
   register: docker_start_result
   until: not docker_start_result | failed
-  retries: 1
+  retries: 3
   delay: 30
 
 - name: Update docker facts