Просмотр исходного кода

Merge pull request #1121 from abutcher/rolling-restarts-pacemaker

Rolling restart playbook for masters
Brenton Leanhardt 9 лет назад
Родитель
Commit
965c614859

+ 27 - 1
filter_plugins/openshift_master.py

@@ -463,6 +463,32 @@ class FilterModule(object):
         IdentityProviderBase.validate_idp_list(idp_list)
         return yaml.safe_dump([idp.to_dict() for idp in idp_list], default_flow_style=False)
 
+    @staticmethod
+    def validate_pcs_cluster(data, masters=None):
+        ''' Validates output from "pcs status", ensuring that each master
+            provided is online.
+            Ex: data = ('...',
+                        'PCSD Status:',
+                        'master1.example.com: Online',
+                        'master2.example.com: Online',
+                        'master3.example.com: Online',
+                        '...')
+                masters = ['master1.example.com',
+                           'master2.example.com',
+                           'master3.example.com']
+               returns True
+        '''
+        if not issubclass(type(data), str):
+            raise errors.AnsibleFilterError("|failed expects data is a string")
+        if not issubclass(type(masters), list):
+            raise errors.AnsibleFilterError("|failed expects masters is a list")
+        valid = True
+        for master in masters:
+            if "{0}: Online".format(master) not in data:
+                valid = False
+        return valid
+
     def filters(self):
         ''' returns a mapping of filters to methods '''
-        return {"translate_idps": self.translate_idps}
+        return {"translate_idps": self.translate_idps,
+                "validate_pcs_cluster": self.validate_pcs_cluster}

+ 1 - 0
playbooks/byo/openshift-master/filter_plugins

@@ -0,0 +1 @@
+../../../filter_plugins

+ 1 - 0
playbooks/byo/openshift-master/lookup_plugins

@@ -0,0 +1 @@
+../../../lookup_plugins

+ 4 - 0
playbooks/byo/openshift-master/restart.yml

@@ -0,0 +1,4 @@
+---
+- include: ../../common/openshift-master/restart.yml
+  vars_files:
+  - ../../byo/openshift-cluster/cluster_hosts.yml

+ 1 - 0
playbooks/byo/openshift-master/roles

@@ -0,0 +1 @@
+../../../roles

+ 141 - 0
playbooks/common/openshift-master/restart.yml

@@ -0,0 +1,141 @@
+---
+- include: ../openshift-cluster/evaluate_groups.yml
+
+- name: Validate configuration for rolling restart
+  hosts: oo_masters_to_config
+  roles:
+  - openshift_facts
+  tasks:
+  - fail:
+      msg: "openshift_rolling_restart_mode must be set to either 'services' or 'system'"
+    when: openshift_rolling_restart_mode is defined and openshift_rolling_restart_mode not in ["services", "system"]
+  - openshift_facts:
+      role: "{{ item.role }}"
+      local_facts: "{{ item.local_facts }}"
+    with_items:
+      - role: common
+        local_facts:
+          rolling_restart_mode: "{{ openshift_rolling_restart_mode | default('services') }}"
+      - role: master
+        local_facts:
+          cluster_method: "{{ openshift_master_cluster_method | default(None) }}"
+
+# Creating a temp file on localhost, we then check each system that will
+# be rebooted to see if that file exists, if so we know we're running
+# ansible on a machine that needs a reboot, and we need to error out.
+- name: Create temp file on localhost
+  hosts: localhost
+  connection: local
+  become: no
+  gather_facts: no
+  tasks:
+  - local_action: command mktemp
+    register: mktemp
+    changed_when: false
+
+- name: Check if temp file exists on any masters
+  hosts: oo_masters_to_config
+  tasks:
+  - stat: path="{{ hostvars.localhost.mktemp.stdout }}"
+    register: exists
+    changed_when: false
+
+- name: Cleanup temp file on localhost
+  hosts: localhost
+  connection: local
+  become: no
+  gather_facts: no
+  tasks:
+  - file: path="{{ hostvars.localhost.mktemp.stdout }}" state=absent
+    changed_when: false
+
+- name: Warn if restarting the system where ansible is running
+  hosts: oo_masters_to_config
+  tasks:
+  - pause:
+      prompt: >
+        Warning: Running playbook from a host that will be restarted!
+        Press CTRL+C and A to abort playbook execution. You may
+        continue by pressing ENTER but the playbook will stop
+        executing once this system restarts and services must be
+        manually verified.
+    when: exists.stat.exists and openshift.common.rolling_restart_mode == 'system'
+  - set_fact:
+      current_host: "{{ exists.stat.exists }}"
+    when: openshift.common.rolling_restart_mode == 'system'
+
+- name: Determine which masters are currently active
+  hosts: oo_masters_to_config
+  tasks:
+  - name: Check master service status
+    command: >
+      systemctl is-active {{ openshift.common.service_type }}-master
+    register: active_check_output
+    when: openshift.master.cluster_method == 'pacemaker'
+    failed_when: active_check_output.stdout not in ['active', 'inactive']
+    changed_when: false
+  - set_fact:
+      is_active: "{{ active_check_output.stdout == 'active' }}"
+    when: openshift.master.cluster_method == 'pacemaker'
+
+- name: Evaluate master groups
+  hosts: localhost
+  become: no
+  tasks:
+  - name: Evaluate oo_active_masters
+    add_host:
+      name: "{{ item }}"
+      groups: oo_active_masters
+      ansible_ssh_user: "{{ g_ssh_user | default(omit) }}"
+      ansible_sudo: "{{ g_sudo | default(omit) }}"
+    with_items: "{{ groups.oo_masters_to_config | default([]) }}"
+    when: (hostvars[item]['is_active'] | default(false)) | bool
+  - name: Evaluate oo_current_masters
+    add_host:
+      name: "{{ item }}"
+      groups: oo_current_masters
+      ansible_ssh_user: "{{ g_ssh_user | default(omit) }}"
+      ansible_sudo: "{{ g_sudo | default(omit) }}"
+    with_items: "{{ groups.oo_masters_to_config | default([]) }}"
+    when: (hostvars[item]['current_host'] | default(false)) | bool
+
+- name: Validate pacemaker cluster
+  hosts: oo_active_masters
+  tasks:
+  - name: Retrieve pcs status
+    command: pcs status
+    register: pcs_status_output
+    changed_when: false
+  - fail:
+      msg: >
+        Pacemaker cluster validation failed. One or more nodes are not online.
+    when: not (pcs_status_output.stdout | validate_pcs_cluster(groups.oo_masters_to_config)) | bool
+
+- name: Restart masters
+  hosts: oo_masters_to_config:!oo_active_masters:!oo_current_masters
+  vars:
+    openshift_master_ha: "{{ groups.oo_masters_to_config | length > 1 }}"
+  serial: 1
+  tasks:
+  - include: restart_hosts.yml
+    when: openshift.common.rolling_restart_mode == 'system'
+  - include: restart_services.yml
+    when: openshift.common.rolling_restart_mode == 'services'
+
+- name: Restart active masters
+  hosts: oo_active_masters
+  serial: 1
+  tasks:
+  - include: restart_hosts_pacemaker.yml
+    when: openshift.common.rolling_restart_mode == 'system'
+  - include: restart_services_pacemaker.yml
+    when: openshift.common.rolling_restart_mode == 'services'
+
+- name: Restart current masters
+  hosts: oo_current_masters
+  serial: 1
+  tasks:
+  - include: restart_hosts.yml
+    when: openshift.common.rolling_restart_mode == 'system'
+  - include: restart_services.yml
+    when: openshift.common.rolling_restart_mode == 'services'

+ 39 - 0
playbooks/common/openshift-master/restart_hosts.yml

@@ -0,0 +1,39 @@
+- name: Restart master system
+  # https://github.com/ansible/ansible/issues/10616
+  shell: sleep 2 && shutdown -r now "OpenShift Ansible master rolling restart"
+  async: 1
+  poll: 0
+  ignore_errors: true
+  become: yes
+# When cluster_method != pacemaker we can ensure the api_port is
+# available.
+- name: Wait for master API to come back online
+  become: no
+  local_action:
+    module: wait_for
+      host="{{ inventory_hostname }}"
+      state=started
+      delay=10
+      port="{{ openshift.master.api_port }}"
+  when: openshift.master.cluster_method != 'pacemaker'
+- name: Wait for master to start
+  become: no
+  local_action:
+    module: wait_for
+      host="{{ inventory_hostname }}"
+      state=started
+      delay=10
+      port=22
+  when: openshift.master.cluster_method == 'pacemaker'
+- name: Wait for master to become available
+  command: pcs status
+  register: pcs_status_output
+  until: pcs_status_output.stdout | validate_pcs_cluster([inventory_hostname]) | bool
+  retries: 15
+  delay: 2
+  changed_when: false
+  when: openshift.master.cluster_method == 'pacemaker'
+- fail:
+    msg: >
+      Pacemaker cluster validation failed, {{ inventory_hostname }} is not online.
+  when: openshift.master.cluster_method == 'pacemaker' and not (pcs_status_output.stdout | validate_pcs_cluster([inventory_hostname])) | bool

+ 25 - 0
playbooks/common/openshift-master/restart_hosts_pacemaker.yml

@@ -0,0 +1,25 @@
+- name: Fail over master resource
+  command: >
+    pcs resource move master {{ hostvars | oo_select_keys(groups['oo_masters_to_config']) | oo_collect('openshift.common.hostname', {'is_active': 'False'}) | list | first }}
+- name: Wait for master API to come back online
+  become: no
+  local_action:
+    module: wait_for
+      host="{{ openshift.master.cluster_hostname }}"
+      state=started
+      delay=10
+      port="{{ openshift.master.api_port }}"
+- name: Restart master system
+  # https://github.com/ansible/ansible/issues/10616
+  shell: sleep 2 && shutdown -r now "OpenShift Ansible master rolling restart"
+  async: 1
+  poll: 0
+  ignore_errors: true
+  become: yes
+- name: Wait for master to start
+  become: no
+  local_action:
+    module: wait_for
+      host="{{ inventory_hostname }}"
+      state=started
+      delay=10
+      port=22
+ 27 - 0
playbooks/common/openshift-master/restart_services.yml

@@ -0,0 +1,27 @@
+- name: Restart master
+  service:
+    name: "{{ openshift.common.service_type }}-master"
+    state: restarted
+  when: not openshift_master_ha | bool
+- name: Restart master API
+  service:
+    name: "{{ openshift.common.service_type }}-master-api"
+    state: restarted
+  when: openshift_master_ha | bool and openshift.master.cluster_method != 'pacemaker'
+- name: Wait for master API to come back online
+  become: no
+  local_action:
+    module: wait_for
+      host="{{ inventory_hostname }}"
+      state=started
+      delay=10
+      port="{{ openshift.master.api_port }}"
+  when: openshift_master_ha | bool and openshift.master.cluster_method != 'pacemaker'
+- name: Restart master controllers
+  service:
+    name: "{{ openshift.common.service_type }}-master-controllers"
+    state: restarted
+  # Ignore errors since it is possible that type != simple for
+  # pre-3.1.1 installations.
+  ignore_errors: true
+  when: openshift_master_ha | bool and openshift.master.cluster_method != 'pacemaker'

+ 10 - 0
playbooks/common/openshift-master/restart_services_pacemaker.yml

@@ -0,0 +1,10 @@
+- name: Restart master services
+  command: pcs resource restart master
+- name: Wait for master API to come back online
+  become: no
+  local_action:
+    module: wait_for
+      host="{{ openshift.master.cluster_hostname }}"
+      state=started
+      delay=10
+      port="{{ openshift.master.api_port }}"

+ 1 - 0
roles/openshift_facts/tasks/main.yml

@@ -10,6 +10,7 @@
   shell: ls /run/ostree-booted
   ignore_errors: yes
   failed_when: false
+  changed_when: false
   register: ostree_output
 
 # Locally setup containerized facts for now