Browse Source

Refactor node upgrade to include fewer serial tasks

This commit moves the pulling of images, packages,
and updating config files into a non-serialized play.

The serialized play is now in charge of marking unschedulable,
draining, stopping and restarting services, and marking
schedulable.

If rpm install / container download takes 60s per host,
this will save 3 hours and 10 minutes at 200 hosts per cluster
and forks of 20 hosts.
Michael Gugino 7 years ago
parent
commit
a6d5c62595

+ 22 - 11
playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml

@@ -1,16 +1,25 @@
 ---
+# This play has no `serial` setting, so image pulls, package installs and
+# config file updates happen before the serialized drain/restart play.
+- name: Prepull images and rpms before doing rolling restart
+  hosts: oo_nodes_to_upgrade:!oo_masters_to_config
+  roles:
+  - openshift_facts
+  tasks:
+  - vars:
+      openshift_node_upgrade_in_progress: true
+    include_role:
+      tasks_from: upgrade_pre.yml
+      name: openshift_node
+
 - name: Drain and upgrade nodes
   hosts: oo_nodes_to_upgrade:!oo_masters_to_config
   # This var must be set with -e on invocation, as it is not a per-host inventory var
   # and is evaluated early. Values such as "20%" can also be used.
   serial: "{{ openshift_upgrade_nodes_serial | default(1) }}"
   max_fail_percentage: "{{ openshift_upgrade_nodes_max_fail_percentage | default(0) }}"
-
+  roles:
+  - lib_openshift
+  - openshift_facts
   pre_tasks:
-  - name: Load lib_openshift modules
-    import_role:
-      name: lib_openshift
-
   # TODO: To better handle re-trying failed upgrades, it would be nice to check if the node
   # or docker actually needs an upgrade before proceeding. Perhaps best to save this until
   # we merge upgrade functionality into the base roles and a normal config.yml playbook run.
@@ -33,18 +42,12 @@
     retries: 60
     delay: 60
 
-  roles:
-  - openshift_facts
   post_tasks:
   - include_role:
       name: openshift_node
       tasks_from: upgrade.yml
     vars:
       openshift_node_upgrade_in_progress: True
-  - include_role:
-      name: openshift_excluder
-    vars:
-      r_openshift_excluder_action: enable
   - name: Set node schedulability
     oc_adm_manage_node:
       node: "{{ openshift.node.nodename | lower }}"
@@ -55,3 +58,11 @@
     register: node_schedulable
     until: node_schedulable|succeeded
     when: node_unschedulable|changed
+
+# Runs the openshift_excluder role with the `enable` action as its own play,
+# after the node upgrade play.
+- name: Re-enable excluders
+  hosts: oo_nodes_to_upgrade:!oo_masters_to_config
+  tasks:
+  - vars:
+      r_openshift_excluder_action: enable
+    include_role:
+      name: openshift_excluder

+ 6 - 0
roles/openshift_node/handlers/main.yml

@@ -4,11 +4,15 @@
     name: NetworkManager
     state: restarted
     enabled: True
+  when:
+  - (not skip_node_svc_handlers | default(False) | bool)
 
 - name: restart dnsmasq
   systemd:
     name: dnsmasq
     state: restarted
+  when:
+  - (not skip_node_svc_handlers | default(False) | bool)
 
 - name: restart openvswitch
   systemd:
@@ -47,3 +51,5 @@
 
 - name: reload systemd units
   command: systemctl daemon-reload
+  when:
+  - (not skip_node_svc_handlers | default(False) | bool)

+ 0 - 43
roles/openshift_node/tasks/dnsmasq.yml

@@ -1,43 +1,4 @@
 ---
-- name: Check for NetworkManager service
-  command: >
-    systemctl show NetworkManager
-  register: nm_show
-  changed_when: false
-  ignore_errors: True
-
-- name: Set fact using_network_manager
-  set_fact:
-    network_manager_active: "{{ True if 'ActiveState=active' in nm_show.stdout else False }}"
-
-- name: Install dnsmasq
-  package: name=dnsmasq state=installed
-  when: not openshift.common.is_atomic | bool
-  register: result
-  until: result | success
-
-- name: ensure origin/node directory exists
-  file:
-    state: directory
-    path: "{{ item }}"
-    owner: root
-    group: root
-    mode: '0700'
-  with_items:
-  - /etc/origin
-  - /etc/origin/node
-
-# this file is copied to /etc/dnsmasq.d/ when the node starts and is removed
-# when the node stops. A dbus-message is sent to dnsmasq to add the same entries
-# so that dnsmasq doesn't need to be restarted. Once we can use dnsmasq 2.77 or
-# newer we can use --server-file option to update the servers dynamically and
-# reload them by sending dnsmasq a SIGHUP. We write the file in case someone else
-# triggers a restart of dnsmasq but not a node restart.
-- name: Install node-dnsmasq.conf
-  template:
-    src: node-dnsmasq.conf.j2
-    dest: /etc/origin/node/node-dnsmasq.conf
-
 - name: Install dnsmasq configuration
   template:
     src: origin-dns.conf.j2
@@ -63,7 +24,3 @@
 # Dynamic NetworkManager based dispatcher
 - include_tasks: dnsmasq/network-manager.yml
   when: network_manager_active | bool
-
-# Relies on ansible in order to configure static config
-- include_tasks: dnsmasq/no-network-manager.yml
-  when: not network_manager_active | bool

+ 43 - 0
roles/openshift_node/tasks/dnsmasq_install.yml

@@ -0,0 +1,43 @@
+---
+# Probe the NetworkManager unit. A failing probe is tolerated and the task
+# never reports a change; the output is registered as nm_show.
+- name: Check for NetworkManager service
+  command: systemctl show NetworkManager
+  ignore_errors: true
+  changed_when: false
+  register: nm_show
+
+# Record whether NetworkManager reported ActiveState=active. The probe runs
+# with ignore_errors, so stdout may be absent when it failed outright;
+# default('') keeps this task from erroring and yields False in that case.
+- name: Set fact using_network_manager
+  set_fact:
+    network_manager_active: "{{ 'ActiveState=active' in (nm_show.stdout | default('')) }}"
+
+# Skipped on Atomic hosts (openshift.common.is_atomic). Retried until the
+# package module reports success. Uses `state: present` rather than the
+# deprecated `installed` alias, with native YAML module arguments.
+- name: Install dnsmasq
+  package:
+    name: dnsmasq
+    state: present
+  when: not openshift.common.is_atomic | bool
+  register: result
+  until: result | success
+
+# Create /etc/origin and /etc/origin/node as root-owned directories with
+# mode 0700.
+- name: ensure origin/node directory exists
+  with_items:
+  - /etc/origin
+  - /etc/origin/node
+  file:
+    path: "{{ item }}"
+    state: directory
+    mode: '0700'
+    owner: root
+    group: root
+
+# The node copies this file into /etc/dnsmasq.d/ when it starts and removes
+# it when it stops. The same entries are also sent to dnsmasq as a dbus
+# message, so dnsmasq does not have to be restarted. With dnsmasq 2.77 or
+# newer, the --server-file option plus a SIGHUP could update the servers
+# dynamically instead. The file is written here in case something other than
+# a node restart triggers a dnsmasq restart.
+- name: Install node-dnsmasq.conf
+  template:
+    dest: /etc/origin/node/node-dnsmasq.conf
+    src: node-dnsmasq.conf.j2
+
+# When NetworkManager is not active, ansible itself lays down the static
+# configuration.
+- when: not network_manager_active | bool
+  include_tasks: dnsmasq/no-network-manager.yml

+ 0 - 27
roles/openshift_node/tasks/docker/upgrade.yml

@@ -1,27 +0,0 @@
----
-# input variables:
-# - openshift_service_type
-# - openshift.common.is_containerized
-# - docker_version
-# - skip_docker_restart
-
-- name: Check Docker image count
-  shell: "docker images -aq | wc -l"
-  register: docker_image_count
-
-- debug: var=docker_image_count.stdout
-
-- service:
-    name: docker
-    state: stopped
-  register: l_openshift_node_upgrade_docker_stop_result
-  until: not l_openshift_node_upgrade_docker_stop_result | failed
-  retries: 3
-  delay: 30
-
-- name: Upgrade Docker
-  package: name=docker{{ '-' + docker_version }} state=present
-  register: result
-  until: result | success
-
-# starting docker happens back in ../main.yml where it calls ../restart.yml

+ 1 - 0
roles/openshift_node/tasks/main.yml

@@ -6,6 +6,7 @@
     - deployment_type == 'openshift-enterprise'
     - not openshift_use_crio
 
+- include_tasks: dnsmasq_install.yml
 - include_tasks: dnsmasq.yml
 
 - name: setup firewall

+ 19 - 108
roles/openshift_node/tasks/upgrade.yml

@@ -10,8 +10,6 @@
 
 # tasks file for openshift_node_upgrade
 
-- include_tasks: registry_auth.yml
-
 - name: Stop node and openvswitch services
   service:
     name: "{{ item }}"
@@ -21,58 +19,6 @@
   - openvswitch
   failed_when: false
 
-- name: Stop additional containerized services
-  service:
-    name: "{{ item }}"
-    state: stopped
-  with_items:
-  - "{{ openshift_service_type }}-master-controllers"
-  - "{{ openshift_service_type }}-master-api"
-  - etcd_container
-  failed_when: false
-  when: openshift.common.is_containerized | bool
-
-- name: Pre-pull node image
-  command: >
-    docker pull {{ openshift.node.node_image }}:{{ openshift_image_tag }}
-  register: pull_result
-  changed_when: "'Downloaded newer image' in pull_result.stdout"
-  when: openshift.common.is_containerized | bool
-
-- name: Pre-pull openvswitch image
-  command: >
-    docker pull {{ openshift.node.ovs_image }}:{{ openshift_image_tag }}
-  register: pull_result
-  changed_when: "'Downloaded newer image' in pull_result.stdout"
-  when:
-  - openshift.common.is_containerized | bool
-  - openshift_use_openshift_sdn | bool
-
-- include_tasks: docker/upgrade.yml
-  vars:
-    # We will restart Docker ourselves after everything is ready:
-    skip_docker_restart: True
-  when:
-  - l_docker_upgrade is defined
-  - l_docker_upgrade | bool
-
-- include_tasks: "{{ node_config_hook }}"
-  when: node_config_hook is defined
-
-- include_tasks: upgrade/rpm_upgrade.yml
-  vars:
-    component: "node"
-    openshift_version: "{{ openshift_pkg_version | default('') }}"
-  when: not openshift.common.is_containerized | bool
-
-- name: Remove obsolete docker-sdn-ovs.conf
-  file:
-    path: "/etc/systemd/system/docker.service.d/docker-sdn-ovs.conf"
-    state: absent
-
-- include_tasks: upgrade/containerized_node_upgrade.yml
-  when: openshift.common.is_containerized | bool
-
 - name: Ensure containerized services stopped before Docker restart
   service:
     name: "{{ item }}"
@@ -86,6 +32,17 @@
   failed_when: false
   when: openshift.common.is_containerized | bool
 
+- service:
+    name: docker
+    state: stopped
+  register: l_openshift_node_upgrade_docker_stop_result
+  until: not l_openshift_node_upgrade_docker_stop_result | failed
+  retries: 3
+  delay: 30
+  when:
+  - l_docker_upgrade is defined
+  - l_docker_upgrade | bool
+
 - name: Stop rpm based services
   service:
     name: "{{ item }}"
@@ -96,56 +53,19 @@
   failed_when: false
   when: not openshift.common.is_containerized | bool
 
+- include_tasks: "{{ node_config_hook }}"
+  when: node_config_hook is defined
+
 # https://bugzilla.redhat.com/show_bug.cgi?id=1513054
 - name: Clean up dockershim data
   file:
     path: "/var/lib/dockershim/sandbox/"
     state: absent
 
-- name: Upgrade openvswitch
-  package:
-    name: openvswitch
-    state: latest
-  when: not openshift.common.is_containerized | bool
-  register: result
-  until: result | success
-
-- name: Update oreg value
-  yedit:
-    src: "{{ openshift.common.config_base }}/node/node-config.yaml"
-    key: 'imageConfig.format'
-    value: "{{ oreg_url | default(oreg_url_node) }}"
-  when: oreg_url is defined or oreg_url_node is defined
-
-# https://docs.openshift.com/container-platform/3.4/admin_guide/overcommit.html#disabling-swap-memory
-- name: Check for swap usage
-  command: grep "^[^#].*swap" /etc/fstab
-  # grep: match any lines which don't begin with '#' and contain 'swap'
-  changed_when: false
-  failed_when: false
-  register: swap_result
-
-  # Disable Swap Block
-- block:
-
-  - name: Disable swap
-    command: swapoff --all
-
-  - name: Remove swap entries from /etc/fstab
-    replace:
-      dest: /etc/fstab
-      regexp: '(^[^#].*swap.*)'
-      replace: '# \1'
-      backup: yes
-
-  - name: Add notice about disabling swap
-    lineinfile:
-      dest: /etc/fstab
-      line: '# OpenShift-Ansible Installer disabled swap per overcommit guidelines'
-      state: present
-
+- name: Disable swap
+  command: swapoff --all
   when:
-  - swap_result.stdout_lines | length > 0
+  - openshift_node_upgrade_swap_result | default(False) | bool
   - openshift_disable_swap | default(true) | bool
   # End Disable Swap Block
 
@@ -155,17 +75,6 @@
   - ansible_selinux is defined
   - ansible_selinux.status == 'enabled'
 
-- name: Apply 3.6 dns config changes
-  yedit:
-    src: /etc/origin/node/node-config.yaml
-    key: "{{ item.key }}"
-    value: "{{ item.value }}"
-  with_items:
-  - key: "dnsBindAddress"
-    value: "127.0.0.1:53"
-  - key: "dnsRecursiveResolvConf"
-    value: "/etc/origin/node/resolv.conf"
-
 # Restart all services
 - include_tasks: upgrade/restart.yml
 
@@ -182,3 +91,5 @@
   delay: 5
 
 - include_tasks: dnsmasq.yml
+
+- meta: flush_handlers

+ 0 - 11
roles/openshift_node/tasks/upgrade/containerized_node_upgrade.yml

@@ -1,14 +1,3 @@
 ---
-# This is a hack to allow us to use systemd_units.yml, but skip the handlers which
-# restart services. We will unconditionally restart all containerized services
-# because we have to unconditionally restart Docker:
-- set_fact:
-    skip_node_svc_handlers: True
-
 - name: Update systemd units
   include_tasks: ../systemd_units.yml
-
-# This is a no-op because of skip_node_svc_handlers, but lets us trigger it before end of
-# play when the node has already been marked schedulable again. (this would look strange
-# in logs otherwise)
-- meta: flush_handlers

+ 9 - 0
roles/openshift_node/tasks/upgrade/restart.yml

@@ -13,6 +13,15 @@
 - name: Reload systemd to ensure latest unit files
   command: systemctl daemon-reload
 
+- name: Restart support services
+  service:
+    name: "{{ item }}"
+    state: restarted
+    enabled: True
+  with_items:
+    - NetworkManager
+    - dnsmasq
+
 - name: Restart container runtime
   service:
     name: "{{ openshift_docker_service_name }}"

+ 118 - 0
roles/openshift_node/tasks/upgrade_pre.yml

@@ -0,0 +1,118 @@
+---
+# Hack: setting this fact lets the tasks below update components without
+# restarting services. The fact persists into the upgrade play as well, so
+# every service has to be restarted by hand there.
+- set_fact:
+    skip_node_svc_handlers: true
+
+- include_tasks: registry_auth.yml
+
+# These three tasks run only when l_docker_upgrade is defined and true.
+- name: Check Docker image count
+  shell: "docker images -aq | wc -l"
+  register: docker_image_count
+  # Read-only query: never report it as a change.
+  changed_when: false
+  when:
+  - l_docker_upgrade is defined
+  - l_docker_upgrade | bool
+
+- debug:
+    var: docker_image_count.stdout
+  when:
+  - l_docker_upgrade is defined
+  - l_docker_upgrade | bool
+
+# Installs the docker package pinned to docker_version; retried until the
+# package module reports success.
+- name: Upgrade Docker
+  package:
+    name: "docker{{ '-' + docker_version }}"
+    state: present
+  register: result
+  until: result | success
+  when:
+  - l_docker_upgrade is defined
+  - l_docker_upgrade | bool
+
+# Containerized installs only: pull the images here, outside the serialized
+# play. A pull is a network operation against a registry, so it is retried
+# (Ansible's default retry count) until it succeeds, matching the package
+# tasks in this file.
+- name: Pre-pull node image
+  command: >
+    docker pull {{ openshift.node.node_image }}:{{ openshift_image_tag }}
+  register: pull_result
+  until: pull_result | success
+  changed_when: "'Downloaded newer image' in pull_result.stdout"
+  when: openshift.common.is_containerized | bool
+
+# The openvswitch image is pulled only when openshift_use_openshift_sdn is set.
+- name: Pre-pull openvswitch image
+  command: >
+    docker pull {{ openshift.node.ovs_image }}:{{ openshift_image_tag }}
+  register: pull_result
+  until: pull_result | success
+  changed_when: "'Downloaded newer image' in pull_result.stdout"
+  when:
+  - openshift.common.is_containerized | bool
+  - openshift_use_openshift_sdn | bool
+
+# Non-containerized hosts: run the rpm upgrade tasks for the "node" component.
+- include_tasks: upgrade/rpm_upgrade.yml
+  when: not openshift.common.is_containerized | bool
+  vars:
+    openshift_version: "{{ openshift_pkg_version | default('') }}"
+    component: node
+
+- name: Remove obsolete docker-sdn-ovs.conf
+  file:
+    state: absent
+    path: /etc/systemd/system/docker.service.d/docker-sdn-ovs.conf
+
+# Containerized hosts: run the containerized node upgrade tasks.
+- when: openshift.common.is_containerized | bool
+  include_tasks: upgrade/containerized_node_upgrade.yml
+
+# Non-containerized hosts only; retried until the package module succeeds.
+- name: Upgrade openvswitch
+  when: not openshift.common.is_containerized | bool
+  package:
+    state: latest
+    name: openvswitch
+  register: result
+  until: result | success
+
+# Write imageConfig.format into node-config.yaml from oreg_url, falling back
+# to oreg_url_node; skipped when neither variable is defined.
+- name: Update oreg value
+  when: oreg_url is defined or oreg_url_node is defined
+  yedit:
+    key: imageConfig.format
+    value: "{{ oreg_url | default(oreg_url_node) }}"
+    src: "{{ openshift.common.config_base }}/node/node-config.yaml"
+
+# https://docs.openshift.com/container-platform/3.4/admin_guide/overcommit.html#disabling-swap-memory
+- name: Check for swap usage
+  command: grep "^[^#].*swap" /etc/fstab
+  # grep: match any lines which don't begin with '#' and contain 'swap'
+  changed_when: false
+  failed_when: false
+  register: swap_result
+
+# Set this fact here so we can use it during the next play, which is serial.
+# The comparison is parenthesized because Jinja2 filters bind tighter than
+# `>`: unparenthesized, `| bool` would apply to the literal 0 alone.
+# default([]) covers a registered result that carries no stdout_lines.
+- name: set_fact swap_result
+  set_fact:
+    openshift_node_upgrade_swap_result: "{{ (swap_result.stdout_lines | default([]) | length > 0) | bool }}"
+
+# Disable Swap Block (pre)
+# Both tasks edit /etc/fstab only: active swap entries are commented out and
+# a notice line is added. They run when swap entries were found and
+# openshift_disable_swap is not turned off.
+- when:
+  - openshift_node_upgrade_swap_result | default(False) | bool
+  - openshift_disable_swap | default(true) | bool
+  block:
+  - name: Remove swap entries from /etc/fstab
+    replace:
+      backup: true
+      dest: /etc/fstab
+      regexp: '(^[^#].*swap.*)'
+      replace: '# \1'
+
+  - name: Add notice about disabling swap
+    lineinfile:
+      state: present
+      dest: /etc/fstab
+      line: '# OpenShift-Ansible Installer disabled swap per overcommit guidelines'
+  # End Disable Swap Block
+
+# Set the two DNS keys in node-config.yaml, one yedit call per key.
+- name: Apply 3.6 dns config changes
+  with_items:
+  - value: "127.0.0.1:53"
+    key: dnsBindAddress
+  - value: /etc/origin/node/resolv.conf
+    key: dnsRecursiveResolvConf
+  yedit:
+    src: /etc/origin/node/node-config.yaml
+    value: "{{ item.value }}"
+    key: "{{ item.key }}"
+
+- include_tasks: dnsmasq_install.yml