Jelajahi Sumber

Merge remote-tracking branch 'origin/upgradeFix2' into upgradeFix2

Jason DeTiberus 9 tahun lalu
induk
melakukan
8b3006399a

+ 185 - 0
playbooks/adhoc/upgrades/files/pre-upgrade-check

@@ -0,0 +1,185 @@
+#!/usr/bin/env python
+"""
+Pre-upgrade checks that must be run on a master before proceeding with upgrade.
+"""
+# This is a script not a python module:
+# pylint: disable=invalid-name
+
+# NOTE: This script should not require any python libs other than what is
+# in the standard library.
+
+__license__ = "ASL 2.0"
+
+import json
+import os
+import subprocess
+import re
+
+# The maximum length of container.ports.name
+ALLOWED_LENGTH = 15
+# The valid structure of container.ports.name
+ALLOWED_CHARS = re.compile('^[a-z0-9][a-z0-9\\-]*[a-z0-9]$')
+AT_LEAST_ONE_LETTER = re.compile('[a-z]')
+# Look at the OC_PATH environment variable for the full path. Defaults to 'oc'.
+OC_PATH = os.getenv('OC_PATH', 'oc')
+
+
+def validate(value):
+    """
+    validate verifies that value matches required conventions
+
+    Rules of container.ports.name validation:
+
+    * must be less than 16 chars
+    * at least one letter
+    * only a-z0-9-
+    * hyphens can not be leading or trailing or next to each other
+
+    :Parameters:
+       - `value`: Value to validate
+    """
+    if len(value) > ALLOWED_LENGTH:
+        return False
+
+    if '--' in value:
+        return False
+
+    # We search since it can be anywhere
+    if not AT_LEAST_ONE_LETTER.search(value):
+        return False
+
+    # We match because it must start at the beginning
+    if not ALLOWED_CHARS.match(value):
+        return False
+    return True
+
+
+def list_items(kind):
+    """
+    list_items returns a list of items from the api
+
+    :Parameters:
+       - `kind`: Kind of item to access
+    """
+    response = subprocess.check_output([OC_PATH, 'get', '--all-namespaces', '-o', 'json', kind])
+    items = json.loads(response)
+    return items.get("items", [])
+
+
+def get(obj, *paths):
+    """
+    Gets a nested value from obj, or [] if any key is missing or None
+
+    :Parameters:
+       - `obj`: A dictionary structure
+       - `paths`: All other non-keyword arguments; keys followed in order
+    """
+    ret_obj = obj
+    for path in paths:
+        if ret_obj.get(path, None) is None:
+            return []
+        ret_obj = ret_obj[path]
+    return ret_obj
+
+
+# pylint: disable=too-many-arguments
+def pretty_print_errors(namespace, kind, item_name, container_name, port_name, valid):
+    """
+    Prints out results in human friendly way.
+
+    :Parameters:
+       - `namespace`: Namespace of the resource
+       - `kind`: Kind of the resource
+       - `item_name`: Name of the resource
+       - `container_name`: Name of the container. May be "" when kind=Service.
+       - `port_name`: Name of the port
+       - `valid`: True if the port is valid
+    """
+    if not valid:
+        if len(container_name) > 0:
+            print('%s/%s -n %s (Container="%s" Port="%s")' % (
+                kind, item_name, namespace, container_name, port_name))
+        else:
+            print('%s/%s -n %s (Port="%s")' % (
+                kind, item_name, namespace, port_name))
+
+
+def print_validation_header():
+    """
+    Prints the error header. Should run on the first error to avoid
+    overwhelming the user.
+    """
+    print """\
+At least one port name does not validate. Valid port names:
+
+    * must be less that 16 chars
+    * have at least one letter
+    * only a-z0-9-
+    * do not start or end with -
+    * Dashes may not be next to eachother ('--')
+"""
+
+
+def main():
+    """
+    main is the main entry point to this script
+    """
+    try:
+        # the comma at the end suppresses the newline
+        print "Checking for oc ...",
+        subprocess.check_output([OC_PATH, 'whoami'])
+        print "found"
+    except:
+        print(
+            'Can not find oc (%s). Override the path with the '
+            'OC_PATH environment variable. Exiting...' % OC_PATH)
+        raise SystemExit(1)
+
+    # Where the magic happens
+    first_error = True
+    for kind, path in [
+            ('replicationcontrollers', ("spec", "template", "spec", "containers")),
+            ('pods', ("spec", "containers")),
+            ('deploymentconfigs', ("spec", "template", "spec", "containers"))]:
+        for item in list_items(kind):
+            namespace = item["metadata"]["namespace"]
+            item_name = item["metadata"]["name"]
+            for container in get(item, *path):
+                container_name = container["name"]
+                for port in get(container, "ports"):
+                    port_name = port.get("name", None)
+                    if not port_name:
+                        # Unnamed ports are OK
+                        continue
+                    valid = validate(port_name)
+                    if not valid and first_error:
+                        first_error = False
+                        print_validation_header()
+                    pretty_print_errors(
+                        namespace, kind, item_name,
+                        container_name, port_name, valid)
+
+    # Services follow a different flow
+    for item in list_items('services'):
+        namespace = item["metadata"]["namespace"]
+        item_name = item["metadata"]["name"]
+        for port in get(item, "spec", "ports"):
+            port_name = port.get("targetPort", None)
+            if isinstance(port_name, int) or port_name is None:
+                # Integer only or unnamed ports are OK
+                continue
+            valid = validate(port_name)
+            if not valid and first_error:
+                first_error = False
+                print_validation_header()
+            pretty_print_errors(
+                namespace, "services", item_name, "", port_name, valid)
+
+    # If we had at least 1 error then exit with 1
+    if not first_error:
+        raise SystemExit(1)
+
+
+if __name__ == '__main__':
+    main()
+

+ 58 - 3
playbooks/adhoc/upgrades/upgrade.yml

@@ -6,12 +6,24 @@
 
 - name: Verify upgrade can proceed
   hosts: masters[0]
+  vars:
+    openshift_master_ha: "{{ groups['masters'] | length > 1 }}"
   gather_facts: no
   tasks:
     # Checking the global deployment type rather than host facts, this is about
     # what the user is requesting.
     - fail: msg="Deployment type enterprise not supported for upgrade"
       when: deployment_type == "enterprise"
+    # Pacemaker is currently the only supported upgrade path for multiple masters
+    - fail: msg="openshift_master_cluster_method must be set to 'pacemaker'"
+      when: openshift_master_ha | bool and ((openshift_master_cluster_method is not defined) or (openshift_master_cluster_method is defined and openshift_master_cluster_method != "pacemaker"))
+
+- name: Run pre-upgrade checks on first master
+  hosts: masters[0]
+  tasks:
+  # If this script errors out ansible will show the default stdout/stderr
+  # which contains details for the user:
+  - script: files/pre-upgrade-check
 
 - name: Evaluate etcd_hosts
   hosts: localhost
@@ -182,8 +194,6 @@
     command: >
       tar -czvf {{ master_generated_certs_dir }}/{{ item.master_cert_subdir }}.tgz
         -C {{ master_generated_certs_dir }}/{{ item.master_cert_subdir }} .
-    args:
-      creates: "{{ master_generated_certs_dir }}/{{ item.master_cert_subdir }}.tgz"
     with_items: masters_needing_certs
 
   - name: Retrieve the master cert tarball from the master
@@ -195,11 +205,11 @@
       validate_checksum: yes
     with_items: masters_needing_certs
 
-
 - name: Sync certs and restart masters post configuration change
   hosts: masters
   vars:
     sync_tmpdir: "{{ hostvars.localhost.g_master_mktemp.stdout }}"
+    openshift_master_ha: "{{ groups['masters'] | length > 1 }}"
   tasks:
   - name: Unarchive the tarball on the master
     unarchive:
@@ -209,7 +219,41 @@
 
   - name: Restart master services
     service: name="{{ openshift.common.service_type}}-master" state=restarted
+    when: not openshift_master_ha | bool
+
+- name: Destroy cluster
+  hosts: masters[0]
+  vars:
+    openshift_master_ha: "{{ groups['masters'] | length > 1 }}"
+    openshift_deployment_type: "{{ deployment_type }}"
+  pre_tasks:
+  - name: Check for configured cluster
+    stat:
+      path: /etc/corosync/corosync.conf
+    register: corosync_conf
+    when: openshift_master_ha | bool
+  - name: Destroy cluster
+    command: pcs cluster destroy --all
+    when: openshift_master_ha | bool and corosync_conf.stat.exists == true
+
+- name: Start pcsd on masters
+  hosts: masters
+  vars:
+    openshift_master_ha: "{{ groups['masters'] | length > 1 }}"
+  tasks:
+  - name: Start pcsd
+    service: name=pcsd state=started
+    when: openshift_master_ha | bool
 
+- name: Re-create cluster
+  hosts: masters[0]
+  vars:
+    openshift_master_ha: "{{ groups['masters'] | length > 1 }}"
+    openshift_deployment_type: "{{ deployment_type }}"
+    omc_cluster_hosts: "{{ groups.masters | join(' ') }}"
+  roles:
+  - role: openshift_master_cluster
+    when: openshift_master_ha | bool
 
 - name: Delete temporary directory on localhost
   hosts: localhost
@@ -255,10 +299,21 @@
 
 - name: Restart masters post reconcile
   hosts: masters
+  vars:
+    openshift_master_ha: "{{ groups['masters'] | length > 1 }}"
   tasks:
     - name: Restart master services
       service: name="{{ openshift.common.service_type}}-master" state=restarted
+      when: not openshift_master_ha | bool
 
+- name: Restart cluster post reconcile
+  hosts: masters[0]
+  vars:
+    openshift_master_ha: "{{ groups['masters'] | length > 1 }}"
+  tasks:
+    - name: Restart master cluster
+      command: pcs resource restart master
+      when: openshift_master_ha | bool
 
 - name: Upgrade default router and registry
   hosts: masters[0]

+ 5 - 0
roles/openshift_master/tasks/main.yml

@@ -140,22 +140,27 @@
     src: atomic-openshift-master-api.service.j2
     dest: /usr/lib/systemd/system/{{ openshift.common.service_type }}-master-api.service
     force: no
+  when: openshift_master_ha | bool and openshift_master_cluster_method == "native"
 - name: Create the controllers service file
   template:
     src: atomic-openshift-master-controllers.service.j2
     dest: /usr/lib/systemd/system/{{ openshift.common.service_type }}-master-controllers.service
     force: no
+  when: openshift_master_ha | bool and openshift_master_cluster_method == "native"
 - name: Create the api env file
   template:
     src: atomic-openshift-master-api.j2
     dest: /etc/sysconfig/{{ openshift.common.service_type }}-master-api
     force: no
+  when: openshift_master_ha | bool and openshift_master_cluster_method == "native"
 - name: Create the controllers env file
   template:
     src: atomic-openshift-master-controllers.j2
     dest: /etc/sysconfig/{{ openshift.common.service_type }}-master-controllers
     force: no
+  when: openshift_master_ha | bool and openshift_master_cluster_method == "native"
 - command: systemctl daemon-reload
+  when: openshift_master_ha | bool and openshift_master_cluster_method == "native"
 # end workaround for missing systemd unit files
 
 - name: Create session secrets file

+ 3 - 1
utils/src/ooinstall/cli_installer.py

@@ -323,6 +323,8 @@ def get_installed_hosts(hosts, callback_facts):
             installed_hosts.append(host)
     return installed_hosts
 
+# pylint: disable=too-many-branches
+# This pylint error will be corrected shortly in separate PR.
 def get_hosts_to_run_on(oo_cfg, callback_facts, unattended, force, verbose):
 
     # Copy the list of existing hosts so we can remove any already installed nodes.
@@ -383,7 +385,7 @@ def get_hosts_to_run_on(oo_cfg, callback_facts, unattended, force, verbose):
 
                     openshift_ansible.set_config(oo_cfg)
                     click.echo('Gathering information from hosts...')
-                    callback_facts, error = openshift_ansible.default_facts(oo_cfg.hosts)
+                    callback_facts, error = openshift_ansible.default_facts(oo_cfg.hosts, verbose)
                     if error:
                         click.echo("There was a problem fetching the required information. " \
                                    "See {} for details.".format(oo_cfg.settings['ansible_log_path']))