Browse Source

Mask and disable etcd service and remove etcd system container

* Fix etcd runtime detection when setting up etcd
* During upgrade, the etcd runtime should be detected from the status of
the systemd services
* Mask, disable and stop services before removing service files
* The system container should be removed explicitly, as stopping the service
alone does not appear to be sufficient
* The etcd cluster health check now waits for the etcd static pod container
to start
Vadim Rutkovsky 7 years ago
parent
commit
44b0efaf3c

+ 7 - 0
playbooks/openshift-etcd/private/upgrade_main.yml

@@ -38,6 +38,13 @@
 # mirrored packages on your own because only the GA and latest versions are
 # available in the repos. So for Fedora we'll simply skip this, sorry.
 
+- name: Detect etcd runtime
+  hosts: oo_etcd_hosts_to_upgrade
+  tasks:
+  - import_role:
+      name: etcd
+      tasks_from: runtime.yml
+
 - name: Backup etcd before upgrading anything
   import_playbook: upgrade_backup.yml
   vars:

+ 5 - 3
roles/etcd/defaults/main.yaml

@@ -2,12 +2,14 @@
 r_etcd_common_backup_tag: ''
 r_etcd_common_backup_sufix_name: ''
 
+l_etcd_bootstrapped: '{{ openshift.node.bootstrapped }}'
+
 l_is_etcd_system_container: "{{ (openshift_use_etcd_system_container | default(openshift_use_system_containers | default(false)) | bool) }}"
 
-l_etcd_static_pod: "{{ not (r_etcd_common_skip_command_shim is defined and r_etcd_common_skip_command_shim) or openshift.node.bootstrapped }}"
+l_etcd_static_pod: "{{ not (r_etcd_common_skip_command_shim is defined and r_etcd_common_skip_command_shim) or l_etcd_bootstrapped }}"
 
-# runc, docker, host
-r_etcd_common_etcd_runtime: "{{ 'static_pod' if l_etcd_static_pod else ('runc' if l_is_etcd_system_container else ('docker' if openshift_is_containerized else 'host')) }}"
+# runc, docker, static pod, host
+r_etcd_common_etcd_runtime: "{{ 'runc' if l_is_etcd_system_container else ('static_pod' if l_etcd_static_pod else ('docker' if openshift_is_containerized else 'host')) }}"
 
 r_etcd_default_version: "3.2.15"
 osm_etcd_image: "registry.access.redhat.com/rhel7/etcd:{{ r_etcd_upgrade_version | default(r_etcd_default_version) }}"

+ 24 - 0
roles/etcd/tasks/runtime.yml

@@ -0,0 +1,24 @@
+---
+# Detect which runtime etcd currently uses from the state of its systemd units.
+- name: Check if etcd service exists
+  systemd:
+    name: "etcd"
+  ignore_errors: true
+  register: etcd_service
+
+# Compare ActiveState as a string ('"active" | bool' never matches a state) and
+# require 'status', which may be missing when the ignored lookup above failed.
+- set_fact:
+    r_etcd_common_etcd_runtime: host
+  when: etcd_service.status is defined and etcd_service.status['ActiveState'] == "active"
+
+- name: Check if etcd_container service exists
+  systemd:
+    name: "etcd_container"
+  ignore_errors: true
+  register: etcd_container_service
+
+# An active etcd_container unit is runc for system containers, docker otherwise.
+- set_fact:
+    r_etcd_common_etcd_runtime: "{{ 'runc' if l_is_etcd_system_container else 'docker' }}"
+  when: etcd_container_service.status is defined and etcd_container_service.status['ActiveState'] == "active"

+ 19 - 17
roles/etcd/tasks/upgrade_static.yml

@@ -2,44 +2,46 @@
 # PREREQ Node service is ready to run static pods
 
 # INPUT r_etcd_upgrade_version
+
+- include_tasks: runtime.yml
+
 - name: Verify cluster is healthy pre-upgrade
   command: "{{ etcdctlv2 }} cluster-health"
 
-- name: Remove old etcd service files
-  file:
-    path: "{{ item }}"
-    state: absent
-  with_items:
-  - "/etc/systemd/system/etcd.service"
-  - "/etc/systemd/system/etcd_container.service"
-
 # We removed the ability to detect what was previously 'containerized'
 # Need to stop and disable this service, but might not be present.
-- name: Stop, disable old etcd services
+- name: Stop, disable and mask old etcd service
   systemd:
     name: "{{ item }}"
     state: stopped
     enabled: no
-  failed_when: False
+    masked: yes
+    daemon_reload: yes
   with_items:
   - etcd
   - etcd_container
+  failed_when: False
 
-- name: Mask old etcd services
-  command: "systemctl mask {{ etcd_service }}"
+- name: Remove old etcd service files
+  file:
+    path: "{{ item }}"
+    state: absent
   with_items:
-  - etcd
-  - etcd_container
+  - "/etc/systemd/system/etcd.service"
+  - "/etc/systemd/system/etcd_container.service"
 
-- name: Reload systemd daemon
-  command: "systemctl daemon-reload"
+- name: Remove nonexistent services
+  command: "systemctl reset-failed"
 
 - name: Configure static definition
   import_tasks: static.yml
 
+- set_fact:
+    r_etcd_common_etcd_runtime: static_pod
+
 - name: Verify cluster is healthy
   command: "{{ etcdctlv2 }} cluster-health"
   register: etcdctl
-  until: etcdctl.rc == 0
+  until: etcdctl.rc == 0 and 'stopped' not in etcdctl.stderr
   retries: 30
   delay: 10