Przeglądaj źródła

Clean up OVS file and restart Docker on every upgrade.

In 3.3 one of our services lays down a systemd drop-in for configuring
Docker networking to use lbr0. In 3.4, this has been changed but the
file must be cleaned up manually by us.

However, after removing the file, Docker requires a restart. This has big
implications, particularly in containerized environments, where an upgrade is
a very fragile series of package upgrades and service restarts.

To avoid double docker restarts, and thus double service restarts in
containerized environments, this change does the following:

- Skip the restart during the Docker upgrade, if an upgrade is required. We
  will restart Docker on our own later.
- Skip containerized service restarts when we upgrade the services
  themselves.
- Clean shutdown of all containerized services.
- Restart Docker. (always, previously this only happened if it needed an
  upgrade)
- Ensure all containerized services are restarted.
- Restart rpm node services. (always)
- Mark node schedulable again.

At the end of this process, docker0 should be back on the system.
Devan Goodwin 8 lat temu
rodzic
commit
5c24cf417b

+ 11 - 6
playbooks/common/openshift-cluster/upgrades/containerized_node_upgrade.yml

@@ -1,9 +1,14 @@
+---
+# This is a hack to allow us to use systemd_units.yml, but skip the handlers which
+# restart services. We will unconditionally restart all containerized services
+# because we have to unconditionally restart Docker:
+- set_fact:
+    skip_node_svc_handlers: True
+
 - name: Update systemd units
   include: ../../../../roles/openshift_node/tasks/systemd_units.yml openshift_version={{ openshift_image_tag }}
 
-- name: Verifying the correct version was configured
-  shell: grep {{ verify_upgrade_version }} {{ item }}
-  with_items:
-    - /etc/sysconfig/openvswitch
-    - /etc/sysconfig/{{ openshift.common.service_type }}*
-  when: verify_upgrade_version is defined
+# This is a no-op because of skip_node_svc_handlers, but lets us trigger it before end of
+# play when the node has already been marked schedulable again. (this would look strange
+# in logs otherwise)
+- meta: flush_handlers

+ 29 - 0
playbooks/common/openshift-cluster/upgrades/docker/restart.yml

@@ -0,0 +1,29 @@
+---
+- name: Restart docker
+  service: name=docker state=restarted
+
+- name: Update docker facts
+  openshift_facts:
+    role: docker
+
+- name: Restart containerized services
+  service: name={{ item }} state=started
+  with_items:
+    - etcd_container
+    - openvswitch
+    - "{{ openshift.common.service_type }}-master"
+    - "{{ openshift.common.service_type }}-master-api"
+    - "{{ openshift.common.service_type }}-master-controllers"
+    - "{{ openshift.common.service_type }}-node"
+  failed_when: false
+  when: openshift.common.is_containerized | bool
+
+- name: Wait for master API to come back online
+  become: no
+  local_action:
+    module: wait_for
+      host="{{ inventory_hostname }}"
+      state=started
+      delay=10
+      port="{{ openshift.master.api_port }}"
+  when: inventory_hostname in groups.oo_masters_to_config

+ 2 - 27
playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml

@@ -37,30 +37,5 @@
 - name: Upgrade Docker
   package: name=docker{{ '-' + docker_version }} state=present
 
-- service: name=docker state=started
-
-- name: Update docker facts
-  openshift_facts:
-    role: docker
-
-- name: Restart containerized services
-  service: name={{ item }} state=started
-  with_items:
-    - etcd_container
-    - openvswitch
-    - "{{ openshift.common.service_type }}-master"
-    - "{{ openshift.common.service_type }}-master-api"
-    - "{{ openshift.common.service_type }}-master-controllers"
-    - "{{ openshift.common.service_type }}-node"
-  failed_when: false
-  when: openshift.common.is_containerized | bool
-
-- name: Wait for master API to come back online
-  become: no
-  local_action:
-    module: wait_for
-      host="{{ inventory_hostname }}"
-      state=started
-      delay=10
-      port="{{ openshift.master.api_port }}"
-  when: inventory_hostname in groups.oo_masters_to_config
+- include: restart.yml
+  when: not skip_docker_restart | default(False) | bool

+ 0 - 4
playbooks/common/openshift-cluster/upgrades/rpm_upgrade.yml

@@ -6,7 +6,3 @@
 - name: Ensure python-yaml present for config upgrade
   package: name=PyYAML state=present
   when: not openshift.common.is_atomic | bool
-
-- name: Restart node service
-  service: name="{{ openshift.common.service_type }}-node" state=restarted
-  when: component == "node"

+ 1 - 0
playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml

@@ -29,6 +29,7 @@
 
 - name: Backup etcd
   include: ./etcd/backup.yml
+  # Back up etcd by default; skip only when the operator explicitly sets
+  # openshift_upgrade_skip_etcd_backup to true.
+  when: not openshift_upgrade_skip_etcd_backup | default(false) | bool
 
 - name: Upgrade master packages
   hosts: oo_masters_to_config

+ 26 - 1
playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml

@@ -44,8 +44,13 @@
       {{ hostvars[groups.oo_first_master.0].openshift.common.client_binary }} adm manage-node {{ openshift.node.nodename | lower }} --evacuate --force
     delegate_to: "{{ groups.oo_first_master.0 }}"
     when: inventory_hostname in groups.oo_nodes_to_upgrade
+
   tasks:
+
   - include: docker/upgrade.yml
+    vars:
+      # We will restart Docker ourselves after everything is ready:
+      skip_docker_restart: True
     when: l_docker_upgrade is defined and l_docker_upgrade | bool and not openshift.common.is_atomic | bool
 
   - include: "{{ node_config_hook }}"
@@ -57,11 +62,31 @@
        openshift_version: "{{ openshift_pkg_version | default('') }}"
     when: inventory_hostname in groups.oo_nodes_to_upgrade and not openshift.common.is_containerized | bool
 
+  - name: Remove obsolete docker-sdn-ovs.conf
+    file: path=/etc/systemd/system/docker.service.d/docker-sdn-ovs.conf state=absent
+    when: (deployment_type == 'openshift-enterprise' and openshift_release | version_compare('3.4', '>=')) or (deployment_type == 'origin' and openshift_release | version_compare('1.4', '>='))
+
   - include: containerized_node_upgrade.yml
     when: inventory_hostname in groups.oo_nodes_to_upgrade and openshift.common.is_containerized | bool
 
-  - meta: flush_handlers
+  - name: Ensure containerized services stopped before Docker restart
+    service: name={{ item }} state=stopped
+    with_items:
+      - etcd_container
+      - openvswitch
+      - "{{ openshift.common.service_type }}-master"
+      - "{{ openshift.common.service_type }}-master-api"
+      - "{{ openshift.common.service_type }}-master-controllers"
+      - "{{ openshift.common.service_type }}-node"
+    failed_when: false
+    when: openshift.common.is_containerized | bool
+
+  # Mandatory Docker restart, ensure all containerized services are running:
+  - include: docker/restart.yml
 
+  - name: Restart rpm node service
+    service: name="{{ openshift.common.service_type }}-node" state=restarted
+    when: inventory_hostname in groups.oo_nodes_to_upgrade and not openshift.common.is_containerized | bool
   - name: Set node schedulability
     command: >
       {{ hostvars[groups.oo_first_master.0].openshift.common.client_binary }} adm manage-node {{ openshift.node.nodename | lower }} --schedulable=true

+ 1 - 0
roles/openshift_facts/library/openshift_facts.py

@@ -1309,6 +1309,7 @@ def get_openshift_version(facts):
 
     # No need to run this method repeatedly on a system if we already know the
     # version
+    # TODO: We need a way to force reload this after upgrading bits.
     if 'common' in facts:
         if 'version' in facts['common'] and facts['common']['version'] is not None:
             return chomp_commit_offset(facts['common']['version'])

+ 3 - 3
roles/openshift_node/handlers/main.yml

@@ -1,14 +1,14 @@
 ---
 - name: restart openvswitch
   systemd: name=openvswitch state=restarted
-  when: not (ovs_service_status_changed | default(false) | bool) and openshift.common.use_openshift_sdn | bool
+  when: (not skip_node_svc_handlers | default(False) | bool) and not (ovs_service_status_changed | default(false) | bool) and openshift.common.use_openshift_sdn | bool
   notify:
   - restart openvswitch pause
 
 - name: restart openvswitch pause
   pause: seconds=15
-  when: openshift.common.is_containerized | bool
+  when: (not skip_node_svc_handlers | default(False) | bool) and openshift.common.is_containerized | bool
 
 - name: restart node
   systemd: name={{ openshift.common.service_type }}-node state=restarted
-  when: not (node_service_status_changed | default(false) | bool)
+  when: (not skip_node_svc_handlers | default(False) | bool) and not (node_service_status_changed | default(false) | bool)