
Refactor 3.2 upgrade to avoid killing nodes without evac.

We now handle the two pieces of the upgrade that require a node evac
(Docker, and the node service itself) in the same play.
Devan Goodwin, 8 years ago
commit 82f4e4eaea
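
In outline, the consolidated play (shown in full in the upgrade.yml diff below) performs a single evacuation per host and runs every evac-requiring step inside it. An abbreviated sketch, with the trailing node-service tasks summarized as a comment:

    - name: Perform upgrades that may require node evacuation
      hosts: oo_masters_to_config:oo_etcd_to_config:oo_nodes_to_config
      serial: 1
      any_errors_fatal: true
      tasks:
      - name: Evacuate Node for Kubelet upgrade
        command: >
          {{ openshift.common.admin_binary }} manage-node {{ openshift.common.hostname | lower }} --evacuate --force
        delegate_to: "{{ groups.oo_first_master.0 }}"
        when: inventory_hostname in groups.oo_nodes_to_config
      - include: ../docker/upgrade_check.yml    # sets the docker_upgrade fact
      - include: ../docker/upgrade.yml          # gated on that fact
        when: docker_upgrade is defined and docker_upgrade | bool
      # ...followed by the node service upgrade and restoring schedulability,
      # exactly as in the diff below.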

+ 3 - 85
playbooks/byo/openshift-cluster/upgrades/docker/docker_upgrade.yml

@@ -4,6 +4,7 @@
   roles:
   - openshift_facts
   tasks:
+
   - set_fact:
       repoquery_cmd: "{{ 'dnf repoquery --latest-limit 1 -d 0' if ansible_pkg_mgr == 'dnf' else 'repoquery' }}"
 
@@ -11,42 +12,7 @@
       msg: Cannot upgrade Docker on Atomic hosts
     when: openshift.common.is_atomic | bool
 
-  - name: Determine available Docker version
-    script: ../../../../common/openshift-cluster/upgrades/files/rpm_versions.sh docker
-    register: g_docker_version_result
-
-  - name: Check if Docker is installed
-    command: rpm -q docker
-    register: pkg_check
-    failed_when: pkg_check.rc > 1
-    changed_when: no
-
-  - name: Get current version of Docker
-    command: "{{ repoquery_cmd }} --installed --qf '%{version}' docker"
-    register: curr_docker_version
-    changed_when: false
-
-  - name: Get latest available version of Docker
-    command: >
-      {{ repoquery_cmd }} --qf '%{version}' "docker"
-    register: avail_docker_version
-    failed_when: false
-    changed_when: false
-
-  - fail:
-      msg: This playbook requires access to Docker 1.10 or later
-    # Disable the 1.10 requirement if the user set a specific Docker version
-    when: avail_docker_version.stdout | version_compare('1.10','<') and docker_version is not defined
-
-  - name: Flag for upgrade if Docker version does not equal latest
-    set_fact:
-      docker_upgrade: true
-    when: docker_version is not defined and pkg_check.rc == 0 and curr_docker_version.stdout | version_compare(avail_docker_version.stdout,'<')
-
-  - name: Flag for upgrade if Docker version does not equal requested version
-    set_fact:
-      docker_upgrade: true
-    when: docker_version is defined and pkg_check.rc == 0 and curr_docker_version.stdout | version_compare(docker_version,'<')
+  - include: ../../../../common/openshift-cluster/upgrades/docker/upgrade_check.yml
 
 
 # If a node fails, halt everything, the admin will need to clean up and we
@@ -69,55 +35,7 @@
     delegate_to: "{{ groups.oo_first_master.0 }}"
     when: docker_upgrade is defined and docker_upgrade | bool and inventory_hostname in groups.oo_nodes_to_config
 
-  - name: Stop containerized services
-    service: name={{ item }} state=stopped
-    with_items:
-      - "{{ openshift.common.service_type }}-master"
-      - "{{ openshift.common.service_type }}-master-api"
-      - "{{ openshift.common.service_type }}-master-controllers"
-      - "{{ openshift.common.service_type }}-node"
-      - etcd_container
-      - openvswitch
-    failed_when: false
-    when: docker_upgrade is defined and docker_upgrade | bool and openshift.common.is_containerized | bool
-
-  - name: Remove all containers and images
-    script: files/nuke_images.sh docker
-    register: nuke_images_result
-    when: docker_upgrade is defined and docker_upgrade | bool
-
-  # TODO: should we use the docker role to actually do the upgrade?
-  - name: Upgrade to specified Docker version
-    action: "{{ ansible_pkg_mgr }} name=docker{{ '-' + docker_version }} state=present"
-    register: docker_upgrade_result
-    when: docker_upgrade is defined and docker_upgrade | bool and docker_version is defined
-
-  - name: Upgrade to latest Docker version
-    action: "{{ ansible_pkg_mgr }} name=docker state=latest"
-    register: docker_upgrade_result
-    when: docker_upgrade is defined and docker_upgrade | bool and docker_version is not defined
-
-  - name: Restart containerized services
-    service: name={{ item }} state=started
-    with_items:
-      - etcd_container
-      - openvswitch
-      - "{{ openshift.common.service_type }}-master"
-      - "{{ openshift.common.service_type }}-master-api"
-      - "{{ openshift.common.service_type }}-master-controllers"
-      - "{{ openshift.common.service_type }}-node"
-    failed_when: false
-    when: docker_upgrade is defined and docker_upgrade | bool and openshift.common.is_containerized | bool
-
-  - name: Wait for master API to come back online
-    become: no
-    local_action:
-      module: wait_for
-        host="{{ inventory_hostname }}"
-        state=started
-        delay=10
-        port="{{ openshift.master.api_port }}"
-    when: docker_upgrade is defined and docker_upgrade | bool and inventory_hostname in groups.oo_masters_to_config
+  - include: ../../../../common/openshift-cluster/upgrades/docker/upgrade.yml
 
   - name: Set node schedulability
     command: >

+ 52 - 0
playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml

@@ -0,0 +1,52 @@
+---
+# We need docker service up to remove all the images, but these services will keep
+# trying to re-start and thus re-pull the images we're trying to delete.
+- name: Stop containerized services
+  service: name={{ item }} state=stopped
+  with_items:
+    - "{{ openshift.common.service_type }}-master"
+    - "{{ openshift.common.service_type }}-master-api"
+    - "{{ openshift.common.service_type }}-master-controllers"
+    - "{{ openshift.common.service_type }}-node"
+    - etcd_container
+    - openvswitch
+  failed_when: false
+  when: docker_upgrade is defined and docker_upgrade | bool and openshift.common.is_containerized | bool
+
+- name: Remove all containers and images
+  script: nuke_images.sh docker
+  register: nuke_images_result
+  when: docker_upgrade is defined and docker_upgrade | bool
+
+# TODO: should we use the docker role to actually do the upgrade?
+- name: Upgrade to specified Docker version
+  action: "{{ ansible_pkg_mgr }} name=docker{{ '-' + docker_version }} state=present"
+  register: docker_upgrade_result
+  when: docker_upgrade is defined and docker_upgrade | bool and docker_version is defined
+
+- name: Upgrade to latest Docker version
+  action: "{{ ansible_pkg_mgr }} name=docker state=latest"
+  register: docker_upgrade_result
+  when: docker_upgrade is defined and docker_upgrade | bool and docker_version is not defined
+
+- name: Restart containerized services
+  service: name={{ item }} state=started
+  with_items:
+    - etcd_container
+    - openvswitch
+    - "{{ openshift.common.service_type }}-master"
+    - "{{ openshift.common.service_type }}-master-api"
+    - "{{ openshift.common.service_type }}-master-controllers"
+    - "{{ openshift.common.service_type }}-node"
+  failed_when: false
+  when: docker_upgrade is defined and docker_upgrade | bool and openshift.common.is_containerized | bool
+
+- name: Wait for master API to come back online
+  become: no
+  local_action:
+    module: wait_for
+      host="{{ inventory_hostname }}"
+      state=started
+      delay=10
+      port="{{ openshift.master.api_port }}"
+  when: docker_upgrade is defined and docker_upgrade | bool and inventory_hostname in groups.oo_masters_to_config
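
The local_action above keeps the old inline key=value continuation style used elsewhere in these playbooks. For reference, an equivalent written with explicit module arguments (same variables, same behavior) would be:

    - name: Wait for master API to come back online
      become: no
      local_action:
        module: wait_for
        host: "{{ inventory_hostname }}"
        state: started
        delay: 10
        port: "{{ openshift.master.api_port }}"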

+ 37 - 0
playbooks/common/openshift-cluster/upgrades/docker/upgrade_check.yml

@@ -0,0 +1,37 @@
+---
+- name: Determine available Docker version
+  script: ../../../../common/openshift-cluster/upgrades/files/rpm_versions.sh docker
+  register: g_docker_version_result
+
+- name: Check if Docker is installed
+  command: rpm -q docker
+  register: pkg_check
+  failed_when: pkg_check.rc > 1
+  changed_when: no
+
+- name: Get current version of Docker
+  command: "{{ repoquery_cmd }} --installed --qf '%{version}' docker"
+  register: curr_docker_version
+  changed_when: false
+
+- name: Get latest available version of Docker
+  command: >
+    {{ repoquery_cmd }} --qf '%{version}' "docker"
+  register: avail_docker_version
+  failed_when: false
+  changed_when: false
+
+- fail:
+    msg: This playbook requires access to Docker 1.10 or later
+  # Disable the 1.10 requirement if the user set a specific Docker version
+  when: avail_docker_version.stdout | version_compare('1.10','<') and docker_version is not defined
+
+- name: Flag for upgrade if Docker version does not equal latest
+  set_fact:
+    docker_upgrade: true
+  when: docker_version is not defined and pkg_check.rc == 0 and curr_docker_version.stdout | version_compare(avail_docker_version.stdout,'<')
+
+- name: Flag for upgrade if Docker version does not equal requested version
+  set_fact:
+    docker_upgrade: true
+  when: docker_version is defined and pkg_check.rc == 0 and curr_docker_version.stdout | version_compare(docker_version,'<')
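
upgrade_check.yml only sets a fact; callers gate the disruptive work on it. Both entry points touched by this commit follow the same two-step pattern (the relative path varies by caller):

    - include: ../docker/upgrade_check.yml   # sets docker_upgrade: true when an upgrade is needed

    - include: ../docker/upgrade.yml
      when: docker_upgrade is defined and docker_upgrade | bool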

playbooks/byo/openshift-cluster/upgrades/docker/files/nuke_images.sh → playbooks/common/openshift-cluster/upgrades/files/nuke_images.sh


playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/containerized_upgrade.yml → playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/containerized_node_upgrade.yml


+ 0 - 14
playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/docker_upgrade.yml

@@ -1,14 +0,0 @@
-- name: Check if Docker is installed
-  command: rpm -q docker
-  register: pkg_check
-  failed_when: pkg_check.rc > 1
-  changed_when: no
-
-- name: Upgrade Docker
-  command: "{{ ansible_pkg_mgr}} update -y docker"
-  when: pkg_check.rc == 0 and g_docker_version.curr_version | version_compare('1.9','<')
-  register: docker_upgrade
-
-- name: Restart Docker
-  command: systemctl restart docker
-  when: docker_upgrade | changed

+ 0 - 24
playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/node_upgrade.yml

@@ -1,24 +0,0 @@
-- name: Prepare for Node evacuation
-  command: >
-    {{ openshift.common.admin_binary }} manage-node {{ openshift.common.hostname | lower }} --schedulable=false
-  delegate_to: "{{ groups.oo_first_master.0 }}"
-
-- name: Evacuate Node for Kubelet upgrade
-  command: >
-    {{ openshift.common.admin_binary }} manage-node {{ openshift.common.hostname | lower }} --evacuate --force
-  delegate_to: "{{ groups.oo_first_master.0 }}"
-
-- include: rpm_upgrade.yml
-  vars:
-     component: "node"
-     openshift_version: "{{ openshift_pkg_version | default('') }}"
-  when: not openshift.common.is_containerized | bool
-
-- include: containerized_upgrade.yml
-  when: openshift.common.is_containerized | bool
-
-- name: Set node schedulability
-  command: >
-    {{ openshift.common.admin_binary }} manage-node {{ openshift.common.hostname | lower }} --schedulable=true
-  delegate_to: "{{ groups.oo_first_master.0 }}"
-  when: openshift.node.schedulable | bool

+ 1 - 1
playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/pre.yml

@@ -3,7 +3,7 @@
 # Evaluate host groups and gather facts
 ###############################################################################
 
-- include: ../../common/openshift-cluster/initialize_facts.yml
+- include: ../../initialize_facts.yml
 
 - name: Update repos
   hosts: oo_masters_to_config:oo_nodes_to_config:oo_etcd_to_config:oo_lb_to_config

+ 37 - 35
playbooks/common/openshift-cluster/upgrades/v3_1_to_v3_2/upgrade.yml

@@ -3,19 +3,6 @@
 # The restart playbook should be run after this playbook completes.
 ###############################################################################
 
-- name: Upgrade docker
-  hosts: oo_masters_to_config:oo_nodes_to_config:oo_etcd_to_config
-  roles:
-  - openshift_facts
-  tasks:
-  - include: docker_upgrade.yml
-    when: not openshift.common.is_atomic | bool
-  - name: Set post docker install facts
-    openshift_facts:
-      role: "{{ item.role }}"
-    with_items:
-    - role: docker
-
 ###############################################################################
 # Upgrade Masters
 ###############################################################################
@@ -68,36 +55,51 @@
 ###############################################################################
 # Upgrade Nodes
 ###############################################################################
-- name: Upgrade nodes
-  hosts: oo_nodes_to_config
+
+# Here we handle all tasks that might require a node evac (upgrading Docker, and the node service itself).
+- name: Perform upgrades that may require node evacuation
+  hosts: oo_masters_to_config:oo_etcd_to_config:oo_nodes_to_config
   serial: 1
+  any_errors_fatal: true
   roles:
   - openshift_facts
   handlers:
   - include: ../../../../../roles/openshift_node/handlers/main.yml
   tasks:
-  - include: node_upgrade.yml
+  # TODO: To better handle re-trying failed upgrades, it would be nice to check if the node
+  # or docker actually needs an upgrade before proceeding.
+  - name: Mark unschedulable if host is a node
+    command: >
+      {{ openshift.common.admin_binary }} manage-node {{ openshift.common.hostname | lower }} --schedulable=false
+    delegate_to: "{{ groups.oo_first_master.0 }}"
+    when: inventory_hostname in groups.oo_nodes_to_config
 
-  - set_fact:
-      node_update_complete: True
+  - name: Evacuate Node for Kubelet upgrade
+    command: >
+      {{ openshift.common.admin_binary }} manage-node {{ openshift.common.hostname | lower }} --evacuate --force
+    delegate_to: "{{ groups.oo_first_master.0 }}"
+    when: inventory_hostname in groups.oo_nodes_to_config
+
+  - include: ../docker/upgrade_check.yml
+
+  - include: ../docker/upgrade.yml
+    when: docker_upgrade is defined and docker_upgrade | bool
+
+  - include: rpm_upgrade.yml
+    vars:
+       component: "node"
+       openshift_version: "{{ openshift_pkg_version | default('') }}"
+    when: inventory_hostname in groups.oo_nodes_to_config and not openshift.common.is_containerized | bool
+
+  - include: containerized_node_upgrade.yml
+    when: inventory_hostname in groups.oo_nodes_to_config and openshift.common.is_containerized | bool
+
+  - name: Set node schedulability
+    command: >
+      {{ openshift.common.admin_binary }} manage-node {{ openshift.common.hostname | lower }} --schedulable=true
+    delegate_to: "{{ groups.oo_first_master.0 }}"
+    when: inventory_hostname in groups.oo_nodes_to_config and openshift.node.schedulable | bool
 
-##############################################################################
-# Gate on nodes update
-##############################################################################
-- name: Gate on nodes update
-  hosts: localhost
-  connection: local
-  become: no
-  tasks:
-  - set_fact:
-      node_update_completed: "{{ hostvars
-                                 | oo_select_keys(groups.oo_nodes_to_config)
-                                 | oo_collect('inventory_hostname', {'node_update_complete': true}) }}"
-  - set_fact:
-      node_update_failed: "{{ groups.oo_nodes_to_config | difference(node_update_completed) }}"
-  - fail:
-      msg: "Upgrade cannot continue. The following nodes did not finish updating: {{ node_update_failed | join(',') }}"
-    when: node_update_failed | length > 0
 
 ###############################################################################
 # Reconcile Cluster Roles, Cluster Role Bindings and Security Context Constraints