Browse Source

Merge pull request #8661 from sdodson/crio-upgrade

Fix cri-o 3.9 to 3.10 upgrade
Scott Dodson 6 years ago
parent
commit
8c2c90db1c

+ 4 - 0
playbooks/openshift-master/private/upgrade.yml

@@ -67,6 +67,8 @@
     failed_when:
     - l_pb_upgrade_control_plane_pre_upgrade_storage.rc != 0
     - openshift_upgrade_pre_storage_migration_fatal | default(true) | bool
+    retries: 2
+    delay: 30
 
   - name: Migrate legacy HPA scale target refs
     command: >
@@ -200,6 +202,8 @@
     failed_when:
     - l_pb_upgrade_control_plane_post_upgrade_storage.rc != 0
     - openshift_upgrade_post_storage_migration_fatal | default(false) | bool
+    retries: 2
+    delay: 30
 
   - set_fact:
       reconcile_complete: True

+ 25 - 0
roles/openshift_control_plane/files/scripts/crio/master-exec

@@ -0,0 +1,25 @@
+#!/bin/bash
+set -euo pipefail
+
+# Exec a file in the named component by component name and container name.
+# Remaining arguments are passed to the command. If no static pods have been
+# created yet this will execute on the host.
+if [[ -z "${1-}" || -z "${2-}" ]]; then
+  echo "A component name like 'api', 'etcd', or 'controllers' must be specified along with the container name within that component." 1>&2
+  exit 1
+fi
+
+# We haven't started using static pods yet, assume this command is to be direct executed
+if [[ ! -d /etc/origin/node/pods || -z "$( ls -A /etc/origin/node/pods )" ]]; then
+  exec "${@:3}"
+fi
+# With no pod ID the inspect is skipped (it would fail and `set -e` would abort) so the check below can report it.
+pod=$(crictl pods -l -q --label "openshift.io/component=${1}" --label "io.kubernetes.container.name=POD" 2>/dev/null)
+uid=$([[ -z "${pod}" ]] || crictl inspectp "${pod}" 2>/dev/null | python -c 'import sys, json; print(json.load(sys.stdin)["status"]["labels"]["io.kubernetes.pod.uid"])')
+
+if [[ -z "${uid}" ]]; then
+  echo "Component ${1} is stopped or not running" 1>&2
+  exit 0
+fi
+container=$(crictl ps -l -q --label "io.kubernetes.pod.uid=${uid}" --label "io.kubernetes.container.name=${2}" 2>/dev/null)
+exec crictl exec "${container}" "${@:3}"

+ 28 - 0
roles/openshift_control_plane/files/scripts/crio/master-logs

@@ -0,0 +1,28 @@
+#!/bin/bash
+set -euo pipefail
+
+# Return the logs for a given static pod by component name and container name. Remaining arguments are passed to the
+# current container runtime.
+if [[ -z "${1-}" || -z "${2-}" ]]; then
+  echo "A component name like 'api', 'etcd', or 'controllers' must be specified along with the container name within that component." 1>&2
+  exit 1
+fi
+
+# container name is ignored for services
+types=( "atomic-openshift" "origin" )
+for type in "${types[@]}"; do
+  if systemctl cat "${type}-master-${1}.service" &>/dev/null; then
+    journalctl -u "${type}-master-${1}.service" "${@:3}"
+    exit 0
+  fi
+done
+# With no pod ID the inspect is skipped (it would fail and `set -e` would abort) so the check below can report it.
+pod=$(crictl pods -l -q --label "openshift.io/component=${1}" --label "io.kubernetes.container.name=POD" 2>/dev/null)
+uid=$([[ -z "${pod}" ]] || crictl inspectp "${pod}" 2>/dev/null | python -c 'import sys, json; print(json.load(sys.stdin)["status"]["labels"]["io.kubernetes.pod.uid"])')
+
+if [[ -z "${uid}" ]]; then
+  echo "Component ${1} is stopped or not running" 1>&2
+  exit 0
+fi
+container=$(crictl ps -l -q --label "io.kubernetes.pod.uid=${uid}" --label "io.kubernetes.container.name=${2}" 2>/dev/null)
+exec crictl logs "${@:3}" "${container}"

+ 25 - 0
roles/openshift_control_plane/files/scripts/crio/master-restart

@@ -0,0 +1,25 @@
+#!/bin/bash
+set -euo pipefail
+
+# Bounce one control plane component; its name is the only argument.
+component="${1-}"
+if [[ -z "${component}" ]]; then
+  echo "A component name like 'api', 'etcd', or 'controllers' must be specified." 1>&2
+  exit 1
+fi
+
+# A matching systemd unit, when one exists, is restarted and nothing else is done.
+for prefix in "atomic-openshift" "origin"; do
+  if systemctl cat "${prefix}-master-${component}.service" &>/dev/null; then
+    systemctl restart "${prefix}-master-${component}.service"
+    exit 0
+  fi
+done
+# No unit found: look up the component's pod through crictl and stop it.
+sandbox=$(crictl pods -l -q --label "openshift.io/component=${component}" --label "io.kubernetes.container.name=POD" 2>/dev/null)
+if [[ -z "${sandbox}" ]]; then
+  echo "Component ${component} is already stopped" 1>&2
+  exit 0
+fi
+# TODO(runcom): expose timeout in the CRI
+crictl stopp "${sandbox}" >/dev/null

+ 11 - 3
roles/openshift_control_plane/tasks/static_shim.yml

@@ -6,6 +6,14 @@
     dest: "/usr/local/bin/"
     mode: 0500
   with_items:
-  - scripts/docker/master-exec
-  - scripts/docker/master-logs
-  - scripts/docker/master-restart
+  - "scripts/{{ l_runtime }}/master-exec"
+  - "scripts/{{ l_runtime }}/master-logs"
+  - "scripts/{{ l_runtime }}/master-restart"
+  vars:
+    l_runtime: "{{ 'crio' if openshift_use_crio | default(False) else 'docker' }}"
+
+- name: Ensure cri-tools installed  # presumably the package that ships crictl, which the crio shim scripts call
+  package:
+    name: cri-tools
+    state: present
+  when: openshift_use_crio | default(False)  # skipped unless openshift_use_crio is set

+ 24 - 0
roles/openshift_node/files/clean-up-crio-pods.sh

@@ -0,0 +1,24 @@
+#!/bin/bash
+# Best-effort teardown of CRI-O containers, their mounts and container storage.
+# Deletes runc containers whose state has the given ContainerType; every listed
+# ID, whatever its type, also gets the mounts that mention it unmounted.
+cleanup_by_type() {
+        local wanted="$1" c m output
+        for c in $(runc list -q); do
+                output=$(runc state "$c" | grep io.kubernetes.cri-o.ContainerType)
+                if [[ "$output" =~ "$wanted" ]]; then
+                        runc delete -f "$c"
+                fi
+                for m in $(mount | grep -F -- "$c" | awk '{print $3}'); do
+                        umount -R "$m"
+                done
+        done
+}
+cleanup_by_type container  # workload containers first
+cleanup_by_type sandbox    # then the pod sandboxes
+# Leftover overlay mounts: failures are ignored; -r skips umount when none are found.
+mount | grep overlay | awk '{print $3}' | xargs -r umount || true
+umount -R /var/lib/containers/storage/overlay
+umount -R /var/lib/containers/storage
+rm -rf /var/run/containers/storage/*
+rm -rf /var/lib/containers/storage/*

+ 13 - 8
roles/openshift_node/tasks/upgrade.yml

@@ -22,19 +22,24 @@
 
 - name: Ensure cri-o is updated
   package:
-    name: cri-o
+    name: "{{ crio_pkgs | join (',') }}"  # comma-joined list built from crio_pkgs below
     state: latest
   when:
   - openshift_use_crio | default(False)
   register: crio_update  # consulted by the CNI configuration removal task that follows
+  vars:
+    crio_pkgs:  # task-scoped list of packages brought to the latest version
+    - "cri-o"
+    - "cri-tools"
 
-- name: Restart cri-o
-  systemd:
-    name: cri-o
-    state: restarted
-  when:
-  - openshift_use_crio | default(False)
-  - crio_update is changed
+- name: Remove CRI-O default configuration files
+  file:
+    path: "{{ item }}"
+    state: absent
+  with_items:
+  - "/etc/cni/net.d/200-loopback.conf"
+  - "/etc/cni/net.d/100-crio-bridge.conf"
+  when: crio_update is changed  # only when the cri-o package task reported a change
 
 - name: install pre-pulled rpms.
   import_tasks: upgrade/rpm_upgrade_install.yml

+ 6 - 0
roles/openshift_node/tasks/upgrade/restart.yml

@@ -30,6 +30,12 @@
   retries: 3
   delay: 30
 
+- name: Restart cri-o
+  service:
+    name: cri-o
+    state: started  # NOTE(review): despite the task name this requests "started", not "restarted"
+  when: openshift_use_crio | default(False)
+
 - name: Start node service
   service:
     name: "{{ openshift_service_type }}-node"

+ 13 - 8
roles/openshift_node/tasks/upgrade/stop_services.yml

@@ -8,14 +8,6 @@
   - "{{ openshift_service_type }}-node"
   failed_when: false
 
-- name: Ensure static containerized services stopped before Docker restart
-  command: /usr/local/bin/master-restart "{{ item }}"
-  with_items:
-  - api
-  - controllers
-  - etcd
-  failed_when: false
-
 - service:
     name: docker
     state: stopped
@@ -26,3 +18,16 @@
   when:
   - l_docker_upgrade is defined
   - l_docker_upgrade | bool
+
+- name: Stop crio
+  service:
+    name: cri-o  # service unit is named cri-o even though the task name says crio
+    state: stopped
+  when: openshift_use_crio | default(False)  # skipped unless openshift_use_crio is set
+
+# TODO: Determine whether this is needed long term or only for 3.9 to 3.10.
+# Upgrading cri-o, at least from 1.9 to 1.10, requires that all
+# pods be stopped.
+- name: Clean up cri-o pods
+  script: clean-up-crio-pods.sh  # runs the clean-up shell script by file name
+  when: openshift_use_crio | default(False)

+ 5 - 1
roles/openshift_node/tasks/upgrade_pre.yml

@@ -35,11 +35,15 @@
   - l_docker_upgrade | bool
 
 - name: Stage cri-o updates
-  command: "{{ ansible_pkg_mgr }} install -y --downloadonly cri-o"
+  command: "{{ ansible_pkg_mgr }} install -y --downloadonly {{ crio_pkgs | join(' ') }}"
   register: result
   until: result is succeeded  # no explicit retries/delay are set on this task
   when:
   - openshift_use_crio | default(False)
+  vars:
+    crio_pkgs:  # space-joined into the package manager command line above
+    - "cri-o"
+    - "cri-tools"
 
 - import_tasks: upgrade/rpm_upgrade.yml
   when: not openshift_is_atomic | bool