Przeglądaj źródła

Switch to migrating one host and forming a new cluster

With large datasets where there are many keys with TTLs, the expiry was
creating a data inconsistency problem. The hope is that by performing
the migration once and then forming a new cluster, this is avoided.

Fixes https://bugzilla.redhat.com/show_bug.cgi?id=1475351
Scott Dodson 7 lat temu
rodzic
commit
4b5d8d2dc2

+ 55 - 10
playbooks/common/openshift-etcd/migrate.yml

@@ -17,18 +17,14 @@
   tags:
   - always
 
+# TODO: This will be different for release-3.6 branch
 - name: Prepare masters for etcd data migration
   hosts: oo_masters_to_config
   tasks:
   - set_fact:
       master_services:
-      - "{{ openshift.common.service_type + '-master' }}"
-  - set_fact:
-      master_services:
       - "{{ openshift.common.service_type + '-master-controllers' }}"
       - "{{ openshift.common.service_type + '-master-api' }}"
-    when:
-    - (openshift_master_cluster_method is defined and openshift_master_cluster_method == "native") or openshift.common.is_master_system_container | bool
   - debug:
       msg: "master service name: {{ master_services }}"
   - name: Stop masters
@@ -67,16 +63,59 @@
     when:
     - etcd_backup_failed | length > 0
 
-- name: Migrate etcd data from v2 to v3
+- name: Stop etcd
   hosts: oo_etcd_to_migrate
   gather_facts: no
   tags:
   - always
+  pre_tasks:
+  - set_fact:
+      l_etcd_service: "{{ 'etcd_container' if openshift.common.is_containerized else 'etcd' }}"
+  - name: Disable etcd members
+    service:
+      name: "{{ l_etcd_service }}"
+      state: stopped
+
+- name: Migrate data on first etcd
+  hosts: oo_etcd_to_migrate[0]
+  gather_facts: no
+  tags:
+  - always
   roles:
   - role: etcd_migrate
     r_etcd_migrate_action: migrate
     r_etcd_common_embedded_etcd: "{{ groups.oo_etcd_to_config | default([]) | length == 0 }}"
     etcd_peer: "{{ ansible_default_ipv4.address }}"
+    etcd_url_scheme: "https"
+    etcd_peer_url_scheme: "https"
+
+- name: Clean data stores on remaining etcd hosts
+  hosts: oo_etcd_to_migrate[1:]
+  gather_facts: no
+  tags:
+  - always
+  roles:
+  - role: etcd_migrate
+    r_etcd_migrate_action: clean_data
+    r_etcd_common_embedded_etcd: "{{ groups.oo_etcd_to_config | default([]) | length == 0 }}"
+    etcd_peer: "{{ ansible_default_ipv4.address }}"
+    etcd_url_scheme: "https"
+    etcd_peer_url_scheme: "https"
+  post_tasks:
+  - name: Add etcd hosts
+    delegate_to: localhost
+    add_host:
+      name: "{{ item }}"
+      groups: oo_new_etcd_to_config
+      ansible_ssh_user: "{{ g_ssh_user | default(omit) }}"
+      ansible_become: "{{ g_sudo | default(omit) }}"
+    with_items: "{{ groups.oo_etcd_to_migrate[1:] | default([]) }}"
+    changed_when: no
+  - name: Set success
+    set_fact:
+      r_etcd_migrate_success: true
+
+- include: ./scaleup.yml
 
 - name: Gate on etcd migration
   hosts: oo_masters_to_config
@@ -89,6 +128,16 @@
   - set_fact:
       etcd_migration_failed: "{{ groups.oo_etcd_to_migrate | difference(etcd_migration_completed) }}"
 
+- name: Add TTLs on the first master
+  hosts: oo_first_master[0]
+  roles:
+  - role: etcd_migrate
+    r_etcd_migrate_action: add_ttls
+    etcd_peer: "{{ hostvars[groups.oo_etcd_to_migrate.0].ansible_default_ipv4.address }}"
+    etcd_url_scheme: "https"
+    etcd_peer_url_scheme: "https"
+    when: etcd_migration_failed | length == 0
+
 - name: Configure masters if etcd data migration is succesfull
   hosts: oo_masters_to_config
   roles:
@@ -100,10 +149,6 @@
       msg: "Skipping master re-configuration since migration failed."
     when:
     - etcd_migration_failed | length > 0
-
-- name: Start masters after etcd data migration
-  hosts: oo_masters_to_config
-  tasks:
   - name: Start master services
     service:
       name: "{{ item }}"

+ 13 - 0
playbooks/common/openshift-etcd/scaleup.yml

@@ -24,6 +24,9 @@
                        member add {{ etcd_hostname }} {{ etcd_peer_url_scheme }}://{{ etcd_ip }}:{{ etcd_peer_port }}
     delegate_to: "{{ etcd_ca_host }}"
     register: etcd_add_check
+    retries: 3
+    delay: 10
+    until: etcd_add_check.rc == 0
   roles:
   - role: openshift_etcd
     when: etcd_add_check.rc == 0
@@ -36,3 +39,13 @@
     r_etcd_common_etcd_runtime: "{{ openshift.common.etcd_runtime }}"
   - role: nickhammond.logrotate
     when: etcd_add_check.rc == 0
+  post_tasks:
+  # Runs `etcdctl cluster-health` against the etcd CA host after the roles
+  # in this play have been applied.
+  # `retries`/`delay` only take effect together with `until`, so the command
+  # result is registered and polled until etcdctl exits 0 instead of failing
+  # on the first unhealthy response. The retry count mirrors the health wait
+  # used by the etcd_migrate role.
+  - name: Verify cluster is stable
+    command: >
+      /usr/bin/etcdctl --cert-file {{ etcd_peer_cert_file }}
+                       --key-file {{ etcd_peer_key_file }}
+                       --ca-file {{ etcd_peer_ca_file }}
+                       -C {{ etcd_peer_url_scheme }}://{{ hostvars[etcd_ca_host].etcd_hostname }}:{{ etcd_client_port }}
+                       cluster-health
+    register: l_etcd_cluster_health
+    until: l_etcd_cluster_health.rc == 0
+    retries: 3
+    delay: 30

+ 10 - 0
roles/etcd_common/defaults/main.yml

@@ -63,3 +63,13 @@ etcd_client_port: 2379
 etcd_peer_port: 2380
 etcd_url_scheme: http
 etcd_peer_url_scheme: http
+
+etcd_initial_cluster_state: new
+etcd_initial_cluster_token: etcd-cluster-1
+
+etcd_initial_advertise_peer_urls: "{{ etcd_peer_url_scheme }}://{{ etcd_ip }}:{{ etcd_peer_port }}"
+etcd_listen_peer_urls: "{{ etcd_peer_url_scheme }}://{{ etcd_ip }}:{{ etcd_peer_port }}"
+etcd_advertise_client_urls: "{{ etcd_url_scheme }}://{{ etcd_ip }}:{{ etcd_client_port }}"
+etcd_listen_client_urls: "{{ etcd_url_scheme }}://{{ etcd_ip }}:{{ etcd_client_port }}"
+
+etcd_systemd_dir: "/etc/systemd/system/{{ etcd_service }}.service.d"

+ 33 - 0
roles/etcd_migrate/tasks/add_ttls.yml

@@ -0,0 +1,33 @@
+---
+# To be executed on first master
+#
+# Reads the token/lease lifetimes from master-config.yaml and runs
+# `oadm migrate etcd-ttl` once per key prefix so that keys which relied on
+# TTLs get a lease of the matching duration attached.
+- name: Read the master configuration
+  slurp:
+    src: "{{ openshift.common.config_base }}/master/master-config.yaml"
+  register: g_master_config_output
+
+# Each value falls back to the literal default when the key is missing from
+# the parsed configuration.
+- name: Determine lease durations from the master configuration
+  set_fact:
+    accessTokenMaxAgeSeconds: "{{ (g_master_config_output.content|b64decode|from_yaml).oauthConfig.tokenConfig.accessTokenMaxAgeSeconds | default(86400) }}"
+    # The lookup key must be spelled `authorizeTokenMaxAgeSeconds`; a misspelt
+    # key never matches, which would make the default apply unconditionally.
+    authorizeTokenMaxAgeSeconds: "{{ (g_master_config_output.content|b64decode|from_yaml).oauthConfig.tokenConfig.authorizeTokenMaxAgeSeconds | default(500) }}"
+    controllerLeaseTTL: "{{ (g_master_config_output.content|b64decode|from_yaml).controllerLeaseTTL | default(30) }}"
+
+# NOTE: the prefix is read with item['keys'] rather than item.keys because
+#       Jinja2 dot-lookup resolves attributes first and would return the
+#       dict's built-in `keys` method instead of the `keys` entry.
+# NOTE: /usr/local/bin and /var/usrlocal/bin are prepended to PATH explicitly
+#       for this task via `environment`.
+- name: Re-introduce leases (as a replacement for key TTLs)
+  command: >
+    oadm migrate etcd-ttl \
+    --cert {{ r_etcd_common_master_peer_cert_file }} \
+    --key {{ r_etcd_common_master_peer_key_file }} \
+    --cacert {{ r_etcd_common_master_peer_ca_file }} \
+    --etcd-address 'https://{{ etcd_peer }}:{{ etcd_client_port }}' \
+    --ttl-keys-prefix {{ item['keys'] }} \
+    --lease-duration {{ item.ttl }}
+  environment:
+    # Environment values are strings; quote so the YAML parser keeps it one.
+    ETCDCTL_API: "3"
+    PATH: "/usr/local/bin:/var/usrlocal/bin:{{ ansible_env.PATH }}"
+  with_items:
+    - keys: "/kubernetes.io/events"
+      ttl: "1h"
+    - keys: "/kubernetes.io/masterleases"
+      ttl: "10s"
+    - keys: "/openshift.io/oauth/accesstokens"
+      ttl: "{{ accessTokenMaxAgeSeconds }}s"
+    - keys: "/openshift.io/oauth/authorizetokens"
+      ttl: "{{ authorizeTokenMaxAgeSeconds }}s"
+    - keys: "/openshift.io/leases/controllers"
+      ttl: "{{ controllerLeaseTTL }}s"

+ 0 - 3
roles/etcd_migrate/tasks/check.yml

@@ -1,7 +1,4 @@
 ---
-- fail:
-    msg: "Currently etcd v3 migration is unsupported while we test it more thoroughly"
-  when: not openshift_enable_unsupported_configurations | default(false) | bool
 
 # Check the cluster is healthy
 - include: check_cluster_health.yml

+ 5 - 0
roles/etcd_migrate/tasks/clean_data.yml

@@ -0,0 +1,5 @@
+---
+# Deletes the `member` directory under the etcd data directory on this host.
+# The path is derived from etcd_data_dir (the same variable the migrate task
+# passes to `etcdctl migrate --data-dir`) so that a non-default data
+# directory is cleaned as well.
+- name: Remove member data
+  file:
+    path: "{{ etcd_data_dir }}/member"
+    state: absent

+ 2 - 2
roles/etcd_migrate/tasks/main.yml

@@ -1,8 +1,8 @@
 ---
 - name: Fail if invalid r_etcd_migrate_action provided
   fail:
-    msg: "etcd_migrate role can only be called with 'check' or 'migrate' or 'configure'"
-  when: r_etcd_migrate_action not in ['check', 'migrate', 'configure']
+    msg: "etcd_migrate role can only be called with 'check', 'migrate', 'configure', 'add_ttls', or 'clean_data'"
+  when: r_etcd_migrate_action not in ['check', 'migrate', 'configure', 'add_ttls', 'clean_data']
 
 - name: Include main action task file
   include: "{{ r_etcd_migrate_action }}.yml"

+ 16 - 33
roles/etcd_migrate/tasks/migrate.yml

@@ -3,62 +3,45 @@
 - set_fact:
     l_etcd_service: "{{ 'etcd_container' if openshift.common.is_containerized else 'etcd' }}"
 
-- name: Disable etcd members
-  service:
-    name: "{{ l_etcd_service }}"
-    state: stopped
-
-# Should we skip all TTL keys? https://bugzilla.redhat.com/show_bug.cgi?id=1389773
 - name: Migrate etcd data
   command: >
     etcdctl migrate --data-dir={{ etcd_data_dir }}
   environment:
     ETCDCTL_API: 3
   register: l_etcdctl_migrate
-
 # TODO(jchaloup): If any of the members fails, we need to restore all members to v2 from the pre-migrate backup
 - name: Check the etcd v2 data are correctly migrated
   fail:
     msg: "Failed to migrate a member"
   when: "'finished transforming keys' not in l_etcdctl_migrate.stdout and 'no v2 keys to migrate' not in l_etcdctl_migrate.stdout"
-
 - name: Migration message
   debug:
     msg: "Etcd migration finished with: {{ l_etcdctl_migrate.stdout }}"
-
-- name: Enable etcd member
-  service:
+- name: Set ETCD_FORCE_NEW_CLUSTER=true on first etcd host
+  lineinfile:
+    line: "ETCD_FORCE_NEW_CLUSTER=true"
+    dest: /etc/etcd/etcd.conf
+- name: Start etcd
+  systemd:
     name: "{{ l_etcd_service }}"
     state: started
+- name: Unset ETCD_FORCE_NEW_CLUSTER=true on first etcd host
+  lineinfile:
+    line: "ETCD_FORCE_NEW_CLUSTER=true"
+    dest: /etc/etcd/etcd.conf
+    state: absent
+- name: Restart first etcd host
+  systemd:
+    name: "{{ l_etcd_service }}"
+    state: restarted
 
-- name: Wait for cluster to become healthy after migration
+- name: Wait for cluster to become healthy after bringing up first member
   command: >
     etcdctl --cert-file {{ etcd_peer_cert_file }} --key-file {{ etcd_peer_key_file }} --ca-file {{ etcd_peer_ca_file }} --endpoint https://{{ etcd_peer }}:{{ etcd_client_port }} cluster-health
   register: l_etcd_migrate_health
   until: l_etcd_migrate_health.rc == 0
   retries: 3
   delay: 30
-  run_once: true
-
-# NOTE: /usr/local/bin may be removed from the PATH by ansible hence why
-#       it's added to the environment in this task.
-- name: Re-introduce leases (as a replacement for key TTLs)
-  command: >
-    oadm migrate etcd-ttl \
-    --cert {{ r_etcd_common_master_peer_cert_file }} \
-    --key {{ r_etcd_common_master_peer_key_file }} \
-    --cacert {{ r_etcd_common_master_peer_ca_file }} \
-    --etcd-address 'https://{{ etcd_peer }}:{{ etcd_client_port }}' \
-    --ttl-keys-prefix {{ item }} \
-    --lease-duration 1h
-  environment:
-    ETCDCTL_API: 3
-    PATH: "/usr/local/bin:/var/usrlocal/bin:{{ ansible_env.PATH }}"
-  with_items:
-  - "/kubernetes.io/events"
-  - "/kubernetes.io/masterleases"
-  delegate_to: "{{ groups.oo_first_master[0] }}"
-  run_once: true
 
 - set_fact:
     r_etcd_migrate_success: true