Browse Source

Merge pull request #2562 from sdodson/etcd3

etcd upgrade playbooks
Scott Dodson 8 years ago
parent
commit
ae34dc20b9

+ 5 - 5
README_CONTAINERIZED_INSTALLATION.md

@@ -31,7 +31,7 @@ native clients.
 The wrapper scripts mount a limited subset of paths, _~/.kube_, _/etc/origin/_,
 and _/tmp_. Be mindful of this when passing in files to be processed by `oc` or
  `oadm`. You may find it easier to redirect input like this:
- 
+
  `oc create -f - < my_file.json`
 
 ## Technical Notes
@@ -48,18 +48,18 @@ before attempting to pull any of the following images.
         openshift/origin
         openshift/node (node + openshift-sdn + openvswitch rpm for client tools)
         openshift/openvswitch (centos7 + openvswitch rpm, runs ovsdb ovsctl processes)
-        registry.access.redhat.com/rhel7/etcd
+        registry.access.redhat.com/rhel7/etcd3
     OpenShift Enterprise
         openshift3/ose
         openshift3/node
         openshift3/openvswitch
-        registry.access.redhat.com/rhel7/etcd
+        registry.access.redhat.com/rhel7/etcd3
     Atomic Enterprise Platform
         aep3/aep
         aep3/node
         aep3/openvswitch
-        registry.access.redhat.com/rhel7/etcd
-        
+        registry.access.redhat.com/rhel7/etcd3
+
   * note openshift3/* and aep3/* images come from registry.access.redhat.com and
 rely on the --additional-repository flag being set appropriately.
 

+ 3 - 0
playbooks/adhoc/uninstall.yml

@@ -338,6 +338,7 @@
     failed_when: False
     with_items:
     - etcd
+    - etcd3
     - firewalld
 
   - name: Stop additional atomic services
@@ -352,6 +353,7 @@
     when: not is_atomic | bool
     with_items:
     - etcd
+    - etcd3
 
   - shell: systemctl reset-failed
     changed_when: False
@@ -365,6 +367,7 @@
     - /etc/ansible/facts.d/openshift.fact
     - /etc/etcd
     - /etc/systemd/system/etcd_container.service
+    - /etc/profile.d/etcdctl.sh
 
  # Intentionally using rm command over file module because if someone had mounted a filesystem
  # at /var/lib/etcd then the contents were not removed correctly

+ 26 - 0
playbooks/byo/openshift-cluster/upgrades/upgrade_etcd.yml

@@ -0,0 +1,26 @@
+---
+- include: ../../../common/openshift-cluster/verify_ansible_version.yml
+
+- name: Create initial host groups for localhost
+  hosts: localhost
+  connection: local
+  become: no
+  gather_facts: no
+  tags:
+  - always
+  tasks:
+  - include_vars: ../cluster_hosts.yml
+  - add_host:
+      name: "{{ item }}"
+      groups: l_oo_all_hosts
+    with_items: "{{ g_all_hosts | default([]) }}"
+
+- name: Create initial host groups for all hosts
+  hosts: l_oo_all_hosts
+  gather_facts: no
+  tags:
+  - always
+  tasks:
+  - include_vars: ../cluster_hosts.yml
+
+- include: ../../../common/openshift-cluster/upgrades/etcd/main.yml

+ 73 - 0
playbooks/common/openshift-cluster/upgrades/etcd/backup.yml

@@ -0,0 +1,73 @@
+- name: Backup etcd
+  hosts: etcd_hosts_to_backup
+  vars:
+    embedded_etcd: "{{ hostvars[groups.oo_first_master.0].openshift.master.embedded_etcd }}"
+    timestamp: "{{ lookup('pipe', 'date +%Y%m%d%H%M%S') }}"
+  roles:
+  - openshift_facts
+  tasks:
+  # Ensure we persist the etcd role for this host in openshift_facts
+  - openshift_facts:
+      role: etcd
+      local_facts: {}
+    when: "'etcd' not in openshift"
+
+  - stat: path=/var/lib/openshift
+    register: var_lib_openshift
+
+  - stat: path=/var/lib/origin
+    register: var_lib_origin
+
+  - name: Create origin symlink if necessary
+    file: src=/var/lib/openshift/ dest=/var/lib/origin state=link
+    when: var_lib_openshift.stat.exists == True and var_lib_origin.stat.exists == False
+
+  # TODO: replace shell module with command and update later checks
+  # We assume to be using the data dir for all backups.
+  - name: Check available disk space for etcd backup
+    shell: df --output=avail -k {{ openshift.common.data_dir }} | tail -n 1
+    register: avail_disk
+
+  # TODO: replace shell module with command and update later checks
+  - name: Check current embedded etcd disk usage
+    shell: du -k {{ openshift.etcd.etcd_data_dir }} | tail -n 1 | cut -f1
+    register: etcd_disk_usage
+    when: embedded_etcd | bool
+
+  - name: Abort if insufficient disk space for etcd backup
+    fail:
+      msg: >
+        {{ etcd_disk_usage.stdout }} Kb disk space required for etcd backup,
+        {{ avail_disk.stdout }} Kb available.
+    when: (embedded_etcd | bool) and (etcd_disk_usage.stdout|int > avail_disk.stdout|int)
+
+  - name: Install etcd (for etcdctl)
+    action: "{{ ansible_pkg_mgr }} name=etcd state=present"
+    when: not openshift.common.is_atomic | bool
+
+  - name: Generate etcd backup
+    command: >
+      etcdctl backup --data-dir={{ openshift.etcd.etcd_data_dir }}
+      --backup-dir={{ openshift.common.data_dir }}/etcd-backup-{{ backup_tag | default('') }}{{ timestamp }}
+
+  - set_fact:
+      etcd_backup_complete: True
+
+  - name: Display location of etcd backup
+    debug:
+      msg: "Etcd backup created in {{ openshift.common.data_dir }}/etcd-backup-{{ backup_tag | default('') }}{{ timestamp }}"
+
+- name: Gate on etcd backup
+  hosts: localhost
+  connection: local
+  become: no
+  tasks:
+  - set_fact:
+      etcd_backup_completed: "{{ hostvars
+                                 | oo_select_keys(groups.etcd_hosts_to_backup)
+                                 | oo_collect('inventory_hostname', {'etcd_backup_complete': true}) }}"
+  - set_fact:
+      etcd_backup_failed: "{{ groups.etcd_hosts_to_backup | difference(etcd_backup_completed) }}"
+  - fail:
+      msg: "Upgrade cannot continue. The following hosts did not complete etcd backup: {{ etcd_backup_failed | join(',') }}"
+    when: etcd_backup_failed | length > 0

+ 47 - 0
playbooks/common/openshift-cluster/upgrades/etcd/containerized_tasks.yml

@@ -0,0 +1,47 @@
+---
+- name: Verify cluster is healthy pre-upgrade
+  command: "etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt -C https://{{ openshift.common.hostname }}:2379 cluster-health"
+
+- name: Get current image
+  shell: grep 'ExecStart=' /etc/systemd/system/etcd_container.service | awk '{print $NF}'
+  register: current_image
+
+- name: Set new_etcd_image
+  set_fact:
+    new_etcd_image: "{{ current_image.stdout | regex_replace('/etcd.*$','/etcd3:' ~ upgrade_version ) if upgrade_version | version_compare('3.0','>=')
+                        else current_image.stdout.split(':')[0] ~ ':' ~ upgrade_version }}"
+
+- name: Pull new etcd image
+  command: "docker pull {{ new_etcd_image }}"
+
+- name: Update to latest etcd image
+  replace:
+    dest: /etc/systemd/system/etcd_container.service
+    regexp: "{{ current_image.stdout }}$"
+    replace: "{{ new_etcd_image }}"
+
+- name: Restart etcd_container
+  systemd:
+    name: etcd_container
+    daemon_reload: yes
+    state: restarted
+
+## TODO: probably should just move this into the backup playbooks, also this
+## will fail on atomic host. We need to revisit how to do etcd backups there as
+## the container may be newer than etcdctl on the host. Assumes etcd3 obsoletes etcd (7.3.1)
+- name: Upgrade etcd for etcdctl when not atomic
+  action: "{{ ansible_pkg_mgr }} name=etcd state=latest"
+  when: not openshift.common.is_atomic | bool
+
+- name: Verify cluster is healthy
+  command: "etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt -C https://{{ openshift.common.hostname }}:2379 cluster-health"
+  register: etcdctl
+  until: etcdctl.rc == 0
+  retries: 3
+  delay: 10
+
+- name: Store new etcd_image
+  openshift_facts:
+    role: etcd
+    local_facts:
+      etcd_image: "{{ new_etcd_image }}"

+ 23 - 0
playbooks/common/openshift-cluster/upgrades/etcd/fedora_tasks.yml

@@ -0,0 +1,23 @@
+---
+# F23 GA'd with etcd 2.0, currently has 2.2 in updates
+# F24 GA'd with etcd-2.2, currently has 2.2 in updates
+# F25 Beta currently has etcd 3.0
+- name: Verify cluster is healthy pre-upgrade
+  command: "etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt -C https://{{ openshift.common.hostname }}:2379 cluster-health"
+
+- name: Update etcd
+  package:
+    name: "etcd"
+    state: "latest"
+
+- name: Restart etcd
+  service:
+    name: etcd
+    state: restarted
+
+- name: Verify cluster is healthy
+  command: "etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt -C https://{{ openshift.common.hostname }}:2379 cluster-health"
+  register: etcdctl
+  until: etcdctl.rc == 0
+  retries: 3
+  delay: 10

+ 1 - 0
playbooks/common/openshift-cluster/upgrades/etcd/files/etcdctl.sh

@@ -0,0 +1 @@
+../roles/etcd/files/etcdctl.sh

+ 1 - 0
playbooks/common/openshift-cluster/upgrades/etcd/filter_plugins

@@ -0,0 +1 @@
+../../../../../filter_plugins

+ 1 - 0
playbooks/common/openshift-cluster/upgrades/etcd/lookup_plugins

@@ -0,0 +1 @@
+../../../../../lookup_plugins

+ 122 - 0
playbooks/common/openshift-cluster/upgrades/etcd/main.yml

@@ -0,0 +1,122 @@
+---
+# For 1.4/3.4 we want to upgrade everyone to etcd-3.0. etcd docs say to
+# upgrade from 2.0.x to 2.1.x to 2.2.x to 2.3.x to 3.0.x. While this is a tedious
+# task for RHEL and CENTOS it's simply not possible in Fedora unless you've
+# mirrored packages on your own because only the GA and latest versions are
+# available in the repos. So for Fedora we'll simply skip this, sorry.
+
+- include: ../../evaluate_groups.yml
+  tags:
+  - always
+
+- name: Evaluate additional groups for upgrade
+  hosts: localhost
+  connection: local
+  become: no
+  tasks:
+  - name: Evaluate etcd_hosts_to_upgrade
+    add_host:
+      name: "{{ item }}"
+      groups: etcd_hosts_to_upgrade, etcd_hosts_to_backup
+    with_items: "{{ groups.oo_etcd_to_config if groups.oo_etcd_to_config is defined and groups.oo_etcd_to_config | length > 0 else groups.oo_first_master }}"
+
+- name: Backup etcd before upgrading anything
+  include: backup.yml
+  vars:
+    backup_tag: "pre-upgrade-"
+
+- name: Drop etcdctl profiles
+  hosts: etcd_hosts_to_upgrade
+  tasks:
+  - include: roles/etcd/tasks/etcdctl.yml
+
+- name: Determine etcd version
+  hosts: etcd_hosts_to_upgrade
+  tasks:
+  - name: Record RPM based etcd version
+    command: rpm -qa --qf '%{version}' etcd\*
+    register: etcd_installed_version
+    failed_when: false
+    when: not openshift.common.is_containerized | bool
+  - name: Record containerized etcd version
+    command: docker exec etcd_container rpm -qa --qf '%{version}' etcd\*
+    register: etcd_installed_version
+    failed_when: false
+    when: openshift.common.is_containerized | bool
+
+# I really dislike this copy/pasta but I wasn't able to find a way to get it to loop
+# through hosts, then loop through tasks only when appropriate
+- name: Upgrade to 2.1
+  hosts: etcd_hosts_to_upgrade
+  serial: 1
+  vars:
+    upgrade_version: '2.1'
+  tasks:
+  - include: rhel_tasks.yml
+    when: etcd_installed_version.stdout | default('99') | version_compare('2.1','<') and ansible_distribution == 'RedHat' and not openshift.common.is_containerized | bool
+
+- name: Upgrade RPM hosts to 2.2
+  hosts: etcd_hosts_to_upgrade
+  serial: 1
+  vars:
+    upgrade_version: '2.2'
+  tasks:
+  - include: rhel_tasks.yml
+    when: etcd_installed_version.stdout | default('99') | version_compare('2.2','<') and ansible_distribution == 'RedHat' and not openshift.common.is_containerized | bool
+
+- name: Upgrade containerized hosts to 2.2.5
+  hosts: etcd_hosts_to_upgrade
+  serial: 1
+  vars:
+    upgrade_version: 2.2.5
+  tasks:
+  - include: containerized_tasks.yml
+    when: etcd_installed_version.stdout | default('99') | version_compare('2.2','<') and openshift.common.is_containerized | bool
+
+- name: Upgrade RPM hosts to 2.3
+  hosts: etcd_hosts_to_upgrade
+  serial: 1
+  vars:
+    upgrade_version: '2.3'
+  tasks:
+  - include: rhel_tasks.yml
+    when: etcd_installed_version.stdout | default('99') | version_compare('2.3','<') and ansible_distribution == 'RedHat' and not openshift.common.is_containerized | bool
+
+- name: Upgrade containerized hosts to 2.3.7
+  hosts: etcd_hosts_to_upgrade
+  serial: 1
+  vars:
+    upgrade_version: 2.3.7
+  tasks:
+  - include: containerized_tasks.yml
+    when: etcd_installed_version.stdout | default('99') | version_compare('2.3','<') and openshift.common.is_containerized | bool
+
+- name: Upgrade RPM hosts to 3.0
+  hosts: etcd_hosts_to_upgrade
+  serial: 1
+  vars:
+    upgrade_version: '3.0'
+  tasks:
+  - include: rhel_tasks.yml
+    when: etcd_installed_version.stdout | default('99') | version_compare('3.0','<') and ansible_distribution == 'RedHat' and not openshift.common.is_containerized | bool
+
+- name: Upgrade containerized hosts to etcd3 image
+  hosts: etcd_hosts_to_upgrade
+  serial: 1
+  vars:
+    upgrade_version: 3.0.3
+  tasks:
+  - include: containerized_tasks.yml
+    when: etcd_installed_version.stdout | default('99') | version_compare('3.0','<') and openshift.common.is_containerized | bool
+
+- name: Upgrade fedora to latest
+  hosts: etcd_hosts_to_upgrade
+  serial: 1
+  tasks:
+  - include: fedora_tasks.yml
+    when: ansible_distribution == 'Fedora' and not openshift.common.is_containerized | bool
+
+- name: Backup etcd
+  include: backup.yml
+  vars:
+    backup_tag: "post-3.0-"

+ 23 - 0
playbooks/common/openshift-cluster/upgrades/etcd/rhel_tasks.yml

@@ -0,0 +1,23 @@
+---
+- name: Verify cluster is healthy pre-upgrade
+  command: "etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt -C https://{{ openshift.common.hostname }}:2379 cluster-health"
+
+- name: Update etcd package but exclude etcd3
+  command: "{{ ansible_pkg_mgr }} install -y etcd-{{ upgrade_version }}\\* --exclude etcd3"
+  when: upgrade_version | version_compare('3.0','<')
+
+- name: Update etcd package not excluding etcd3
+  command: "{{ ansible_pkg_mgr }} install -y etcd3-{{ upgrade_version }}\\*"
+  when: not upgrade_version | version_compare('3.0','<')
+
+- name: Restart etcd
+  service:
+    name: etcd
+    state: restarted
+
+- name: Verify cluster is healthy
+  command: "etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt -C https://{{ openshift.common.hostname }}:2379 cluster-health"
+  register: etcdctl
+  until: etcdctl.rc == 0
+  retries: 3
+  delay: 10

+ 1 - 0
playbooks/common/openshift-cluster/upgrades/etcd/roles

@@ -0,0 +1 @@
+../../../../../roles

+ 1 - 73
playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml

@@ -28,79 +28,7 @@
         debug_level: "{{ openshift_master_debug_level | default(openshift.common.debug_level | default(2)) }}"
 
 - name: Backup etcd
-  hosts: etcd_hosts_to_backup
-  vars:
-    embedded_etcd: "{{ groups.oo_etcd_to_config | default([]) | length == 0 }}"
-    timestamp: "{{ lookup('pipe', 'date +%Y%m%d%H%M%S') }}"
-  roles:
-  - openshift_facts
-  tasks:
-  # Ensure we persist the etcd role for this host in openshift_facts
-  - openshift_facts:
-      role: etcd
-      local_facts: {}
-    when: "'etcd' not in openshift"
-
-  - stat: path=/var/lib/openshift
-    register: var_lib_openshift
-
-  - stat: path=/var/lib/origin
-    register: var_lib_origin
-
-  - name: Create origin symlink if necessary
-    file: src=/var/lib/openshift/ dest=/var/lib/origin state=link
-    when: var_lib_openshift.stat.exists == True and var_lib_origin.stat.exists == False
-
-  # TODO: replace shell module with command and update later checks
-  # We assume to be using the data dir for all backups.
-  - name: Check available disk space for etcd backup
-    shell: df --output=avail -k {{ openshift.common.data_dir }} | tail -n 1
-    register: avail_disk
-
-  # TODO: replace shell module with command and update later checks
-  - name: Check current embedded etcd disk usage
-    shell: du -k {{ openshift.etcd.etcd_data_dir }} | tail -n 1 | cut -f1
-    register: etcd_disk_usage
-    when: embedded_etcd | bool
-
-  - name: Abort if insufficient disk space for etcd backup
-    fail:
-      msg: >
-        {{ etcd_disk_usage.stdout }} Kb disk space required for etcd backup,
-        {{ avail_disk.stdout }} Kb available.
-    when: (embedded_etcd | bool) and (etcd_disk_usage.stdout|int > avail_disk.stdout|int)
-
-  - name: Install etcd (for etcdctl)
-    action: "{{ ansible_pkg_mgr }} name=etcd state=installed"
-    when: not openshift.common.is_atomic | bool
-
-  - name: Generate etcd backup
-    command: >
-      etcdctl backup --data-dir={{ openshift.etcd.etcd_data_dir }}
-      --backup-dir={{ openshift.common.data_dir }}/etcd-backup-{{ timestamp }}
-
-  - set_fact:
-      etcd_backup_complete: True
-
-  - name: Display location of etcd backup
-    debug:
-      msg: "Etcd backup created in {{ openshift.common.data_dir }}/etcd-backup-{{ timestamp }}"
-
-
-- name: Gate on etcd backup
-  hosts: localhost
-  connection: local
-  become: no
-  tasks:
-  - set_fact:
-      etcd_backup_completed: "{{ hostvars
-                                 | oo_select_keys(groups.etcd_hosts_to_backup)
-                                 | oo_collect('inventory_hostname', {'etcd_backup_complete': true}) }}"
-  - set_fact:
-      etcd_backup_failed: "{{ groups.etcd_hosts_to_backup | difference(etcd_backup_completed) }}"
-  - fail:
-      msg: "Upgrade cannot continue. The following hosts did not complete etcd backup: {{ etcd_backup_failed | join(',') }}"
-    when: etcd_backup_failed | length > 0
+  include: ./etcd/backup.yml
 
 - name: Upgrade master packages
   hosts: oo_masters_to_config

+ 11 - 0
roles/etcd/etcdctl.sh

@@ -0,0 +1,11 @@
+#!/bin/bash
+# Sets up handy aliases for etcd, need etcdctl2 and etcdctl3 because
+# command flags are different between the two. Should work on stand
+# alone etcd hosts and master + etcd hosts too because we use the peer keys.
+etcdctl2() {
+ /usr/bin/etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt -C https://`hostname`:2379 ${@}
+}
+
+etcdctl3() {
+ ETCDCTL_API=3 /usr/bin/etcdctl --cert /etc/etcd/peer.crt --key /etc/etcd/peer.key --cacert /etc/etcd/ca.crt --endpoints https://`hostname`:2379 ${@}
+}

+ 11 - 0
roles/etcd/files/etcdctl.sh

@@ -0,0 +1,11 @@
+#!/bin/bash
+# Sets up handy aliases for etcd, need etcdctl2 and etcdctl3 because
+# command flags are different between the two. Should work on stand
+# alone etcd hosts and master + etcd hosts too because we use the peer keys.
+etcdctl2() {
+ /usr/bin/etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt -C https://`hostname`:2379 ${@}
+}
+
+etcdctl3() {
+ ETCDCTL_API=3 /usr/bin/etcdctl --cert /etc/etcd/peer.crt --key /etc/etcd/peer.key --cacert /etc/etcd/ca.crt --endpoints https://`hostname`:2379 ${@}
+}

+ 11 - 0
roles/etcd/tasks/etcdctl.yml

@@ -0,0 +1,11 @@
+- name: Install etcd for etcdctl
+  action: "{{ ansible_pkg_mgr }} name=etcd state=present"
+  when: not openshift.common.is_atomic | bool
+
+- name: Configure etcd profile.d aliases
+  copy:
+    src: etcdctl.sh
+    dest: /etc/profile.d/etcdctl.sh
+    mode: 0755
+    owner: root
+    group: root

+ 3 - 0
roles/etcd/tasks/main.yml

@@ -74,5 +74,8 @@
     enabled: yes
   register: start_result
 
+- include: etcdctl.yml
+  when: openshift_etcd_etcdctl_profile | default(true) | bool
+
 - set_fact:
     etcd_service_status_changed: "{{ start_result | changed }}"

+ 3 - 3
roles/openshift_facts/library/openshift_facts.py

@@ -1595,7 +1595,7 @@ def set_container_facts_if_unset(facts):
         cli_image = master_image
         node_image = 'openshift3/node'
         ovs_image = 'openshift3/openvswitch'
-        etcd_image = 'registry.access.redhat.com/rhel7/etcd'
+        etcd_image = 'registry.access.redhat.com/rhel7/etcd3'
         pod_image = 'openshift3/ose-pod'
         router_image = 'openshift3/ose-haproxy-router'
         registry_image = 'openshift3/ose-docker-registry'
@@ -1605,7 +1605,7 @@ def set_container_facts_if_unset(facts):
         cli_image = master_image
         node_image = 'aep3_beta/node'
         ovs_image = 'aep3_beta/openvswitch'
-        etcd_image = 'registry.access.redhat.com/rhel7/etcd'
+        etcd_image = 'registry.access.redhat.com/rhel7/etcd3'
         pod_image = 'aep3_beta/aep-pod'
         router_image = 'aep3_beta/aep-haproxy-router'
         registry_image = 'aep3_beta/aep-docker-registry'
@@ -1615,7 +1615,7 @@ def set_container_facts_if_unset(facts):
         cli_image = master_image
         node_image = 'openshift/node'
         ovs_image = 'openshift/openvswitch'
-        etcd_image = 'registry.access.redhat.com/rhel7/etcd'
+        etcd_image = 'registry.access.redhat.com/rhel7/etcd3'
         pod_image = 'openshift/origin-pod'
         router_image = 'openshift/origin-haproxy-router'
         registry_image = 'openshift/origin-docker-registry'