
Merge pull request #6335 from kwoodson/node_groups_refactor

Node group management update.
Scott Dodson · 7 years ago
commit 01ae634a0b

+ 2 - 2
playbooks/aws/openshift-cluster/accept.yml

@@ -18,7 +18,7 @@
       name: lib_openshift
 
   - name: fetch masters
-    ec2_remote_facts:
+    ec2_instance_facts:
       region: "{{ openshift_aws_region | default('us-east-1') }}"
       filters:
         "tag:clusterid": "{{ openshift_aws_clusterid | default('default') }}"
@@ -30,7 +30,7 @@
     until: "'instances' in mastersout and mastersout.instances|length > 0"
 
   - name: fetch new node instances
-    ec2_remote_facts:
+    ec2_instance_facts:
       region: "{{ openshift_aws_region | default('us-east-1') }}"
       filters:
         "tag:clusterid": "{{ openshift_aws_clusterid | default('default') }}"

+ 21 - 14
playbooks/common/openshift-cluster/upgrades/upgrade_scale_group.yml

@@ -11,25 +11,19 @@
       msg: "Ensure that new scale groups were provisioned before proceeding to update."
     when:
     - "'oo_sg_new_nodes' not in groups or groups.oo_sg_new_nodes|length == 0"
+    - "'oo_sg_current_nodes' not in groups or groups.oo_sg_current_nodes|length == 0"
+    - groups.oo_sg_current_nodes == groups.oo_sg_new_nodes
 
 - name: initialize upgrade bits
   import_playbook: init.yml
 
-- name: Drain and upgrade nodes
+- name: unschedule nodes
   hosts: oo_sg_current_nodes
-  # This var must be set with -e on invocation, as it is not a per-host inventory var
-  # and is evaluated early. Values such as "20%" can also be used.
-  serial: "{{ openshift_upgrade_nodes_serial | default(1) }}"
-  max_fail_percentage: "{{ openshift_upgrade_nodes_max_fail_percentage | default(0) }}"
-
-  pre_tasks:
+  tasks:
   - name: Load lib_openshift modules
-    include_role:
+    import_role:
       name: ../roles/lib_openshift
 
-  # TODO: To better handle re-trying failed upgrades, it would be nice to check if the node
-  # or docker actually needs an upgrade before proceeding. Perhaps best to save this until
-  # we merge upgrade functionality into the base roles and a normal config.yml playbook run.
   - name: Mark node unschedulable
     oc_adm_manage_node:
       node: "{{ openshift.node.nodename | lower }}"
@@ -40,14 +34,27 @@
     register: node_unschedulable
     until: node_unschedulable|succeeded
 
+- name: Drain nodes
+  hosts: oo_sg_current_nodes
+  # This var must be set with -e on invocation, as it is not a per-host inventory var
+  # and is evaluated early. Values such as "20%" can also be used.
+  serial: "{{ openshift_upgrade_nodes_serial | default(1) }}"
+  max_fail_percentage: "{{ openshift_upgrade_nodes_max_fail_percentage | default(0) }}"
+  tasks:
   - name: Drain Node for Kubelet upgrade
     command: >
-      {{ hostvars[groups.oo_first_master.0].openshift.common.client_binary }} adm drain {{ openshift.node.nodename | lower }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig --force --delete-local-data --ignore-daemonsets
+      {{ hostvars[groups.oo_first_master.0].openshift.common.client_binary }} adm drain {{ openshift.node.nodename | lower }}
+      --config={{ openshift.common.config_base }}/master/admin.kubeconfig
+      --force --delete-local-data --ignore-daemonsets
+      --timeout={{ openshift_upgrade_nodes_drain_timeout | default(0) }}s
     delegate_to: "{{ groups.oo_first_master.0 }}"
     register: l_upgrade_nodes_drain_result
     until: not l_upgrade_nodes_drain_result | failed
-    retries: 60
-    delay: 60
+    retries: "{{ 1 if openshift_upgrade_nodes_drain_timeout | default(0) == '0' else 0  | int }}"
+    delay: 5
+    failed_when:
+    - l_upgrade_nodes_drain_result | failed
+    - openshift_upgrade_nodes_drain_timeout | default(0) == '0'
 
 # Alright, let's clean up!
 - name: clean up the old scale group

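A note on the drain task's new failure handling: oc adm drain now receives --timeout={{ openshift_upgrade_nodes_drain_timeout | default(0) }}s (0 meaning wait indefinitely), and the failed_when clause only treats a failed drain as fatal when no timeout was configured. As written, the Jinja expression compares an integer default against the string '0', so the standalone Python sketch below shows the apparent intent rather than guaranteed template behaviour; the function name and sample values are illustrative, not part of the playbook.

def drain_play_failed(drain_failed, drain_timeout='0'):
    # Mirrors the task's failed_when: a failed drain is fatal only when
    # openshift_upgrade_nodes_drain_timeout is left at its default ('0').
    return drain_failed and drain_timeout == '0'

print(drain_play_failed(True, '0'))     # True  -> no timeout set; a failed drain stops the upgrade
print(drain_play_failed(True, '600'))   # False -> drain hit its 600s limit; the upgrade proceeds anyway
print(drain_play_failed(False, '600'))  # False -> drain succeeded
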
+ 15 - 26
roles/openshift_aws/defaults/main.yml

@@ -6,9 +6,7 @@ openshift_aws_create_security_groups: True
 openshift_aws_create_launch_config: True
 openshift_aws_create_scale_group: True
 
-openshift_aws_current_version: ''
-openshift_aws_new_version: ''
-
+openshift_aws_node_group_upgrade: False
 openshift_aws_wait_for_ssh: True
 
 openshift_aws_clusterid: default
@@ -19,7 +17,6 @@ openshift_aws_build_ami_group: "{{ openshift_aws_clusterid }}"
 openshift_aws_iam_cert_name: "{{ openshift_aws_clusterid }}-master-external"
 openshift_aws_iam_cert_path: ''
 openshift_aws_iam_cert_key_path: ''
-openshift_aws_scale_group_basename: "{{ openshift_aws_clusterid }} openshift"
 
 openshift_aws_iam_role_name: openshift_node_describe_instances
 openshift_aws_iam_role_policy_json: "{{ lookup('file', 'describeinstances.json') }}"
@@ -34,14 +31,12 @@ openshift_aws_ami_name: openshift-gi
 openshift_aws_base_ami_name: ami_base
 
 openshift_aws_launch_config_bootstrap_token: ''
-openshift_aws_launch_config_basename: "{{ openshift_aws_clusterid }}"
 
 openshift_aws_users: []
 
 openshift_aws_ami_tags:
   bootstrap: "true"
   openshift-created: "true"
-  clusterid: "{{ openshift_aws_clusterid }}"
   parent: "{{ openshift_aws_base_ami | default('unknown') }}"
 
 openshift_aws_s3_mode: create
@@ -124,6 +119,20 @@ openshift_aws_ami_map:
   infra: "{{ openshift_aws_ami }}"
   compute: "{{ openshift_aws_ami }}"
 
+openshift_aws_master_group:
+- name: "{{ openshift_aws_clusterid }} master group"
+  group: master
+
+openshift_aws_node_groups:
+- name: "{{ openshift_aws_clusterid }} compute group"
+  group: compute
+- name: "{{ openshift_aws_clusterid }} infra group"
+  group: infra
+
+openshift_aws_created_asgs: []
+openshift_aws_current_asgs: []
+
+# these will be used during upgrade
 openshift_aws_master_group_config:
   # The 'master' key is always required here.
   master:
@@ -139,7 +148,6 @@ openshift_aws_master_group_config:
       host-type: master
       sub-host-type: default
       runtime: docker
-      version: "{{ openshift_aws_new_version }}"
     wait_for_instances: True
     termination_policy: "{{ openshift_aws_node_group_termination_policy }}"
     replace_all_instances: "{{ openshift_aws_node_group_replace_all_instances }}"
@@ -163,7 +171,6 @@ openshift_aws_node_group_config:
       host-type: node
       sub-host-type: compute
       runtime: docker
-      version: "{{ openshift_aws_new_version }}"
     termination_policy: "{{ openshift_aws_node_group_termination_policy }}"
     replace_all_instances: "{{ openshift_aws_node_group_replace_all_instances }}"
     iam_role: "{{ openshift_aws_iam_role_name }}"
@@ -183,7 +190,6 @@ openshift_aws_node_group_config:
       host-type: node
       sub-host-type: infra
       runtime: docker
-      version: "{{ openshift_aws_new_version }}"
     termination_policy: "{{ openshift_aws_node_group_termination_policy }}"
     replace_all_instances: "{{ openshift_aws_node_group_replace_all_instances }}"
     iam_role: "{{ openshift_aws_iam_role_name }}"
@@ -283,21 +289,4 @@ openshift_aws_node_run_bootstrap_startup: True
 openshift_aws_node_user_data: ''
 openshift_aws_node_config_namespace: openshift-node
 
-openshift_aws_node_groups: nodes
-
 openshift_aws_masters_groups: masters,etcd,nodes
-
-# If creating extra node groups, you'll need to define all of the following
-
-# The format is the same as openshift_aws_node_group_config, but the top-level
-# key names should be different (ie, not == master or infra).
-# openshift_aws_node_group_config_extra: {}
-
-# This variable should look like openshift_aws_launch_config_security_groups
-# and contain a one-to-one mapping of top level keys that are defined in
-# openshift_aws_node_group_config_extra.
-# openshift_aws_launch_config_security_groups_extra: {}
-
-# openshift_aws_node_security_groups_extra: {}
-
-# openshift_aws_ami_map_extra: {}

+ 34 - 1
roles/openshift_aws/filter_plugins/openshift_aws_filters.py

@@ -4,11 +4,43 @@
 Custom filters for use in openshift_aws
 '''
 
+from ansible import errors
+
 
 class FilterModule(object):
     ''' Custom ansible filters for use by openshift_aws role'''
 
     @staticmethod
+    def scale_groups_serial(scale_group_info, upgrade=False):
+        ''' This function will determine what the deployment serial should be and return it
+
+          Search through the tags and find the deployment_serial tag. Once found,
+          determine if an increment is needed during an upgrade.
+          if upgrade is true then increment the serial and return it
+          else return the serial
+        '''
+        if scale_group_info == []:
+            return 1
+
+        scale_group_info = scale_group_info[0]
+
+        if not isinstance(scale_group_info, dict):
+            raise errors.AnsibleFilterError("|filter plugin failed: Expected scale_group_info to be a dict")
+
+        serial = None
+
+        for tag in scale_group_info['tags']:
+            if tag['key'] == 'deployment_serial':
+                serial = int(tag['value'])
+                if upgrade:
+                    serial += 1
+                break
+        else:
+            raise errors.AnsibleFilterError("|filter plugin failed: deployment_serial tag was not found")
+
+        return serial
+
+    @staticmethod
     def scale_groups_match_capacity(scale_group_info):
         ''' This function will verify that the scale group instance count matches
             the scale group desired capacity
@@ -38,4 +70,5 @@ class FilterModule(object):
     def filters(self):
         ''' returns a mapping of filters to methods '''
         return {'build_instance_tags': self.build_instance_tags,
-                'scale_groups_match_capacity': self.scale_groups_match_capacity}
+                'scale_groups_match_capacity': self.scale_groups_match_capacity,
+                'scale_groups_serial': self.scale_groups_serial}

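To make the new filter concrete, here is a standalone sketch that mirrors scale_groups_serial as added above (it raises ValueError instead of AnsibleFilterError so it runs outside Ansible); the sample scale-group facts are made up.

def scale_groups_serial(scale_group_info, upgrade=False):
    # No existing scale group for this node group: start at serial 1.
    if scale_group_info == []:
        return 1
    group = scale_group_info[0]
    for tag in group['tags']:
        if tag['key'] == 'deployment_serial':
            serial = int(tag['value'])
            return serial + 1 if upgrade else serial
    raise ValueError("deployment_serial tag was not found")

existing = [{'tags': [{'key': 'deployment_serial', 'value': '3'}]}]
print(scale_groups_serial([]))               # 1 -> first deployment of this node group
print(scale_groups_serial(existing))         # 3 -> reuse the current serial
print(scale_groups_serial(existing, True))   # 4 -> an upgrade bumps the serial
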
+ 8 - 3
roles/openshift_aws/tasks/accept_nodes.yml

@@ -1,6 +1,6 @@
 ---
 - name: fetch masters
-  ec2_remote_facts:
+  ec2_instance_facts:
     region: "{{ openshift_aws_region | default('us-east-1') }}"
     filters:
       "{{ {'tag:kubernetes.io/cluster/' ~ openshift_aws_clusterid: openshift_aws_clusterid,
@@ -11,7 +11,7 @@
   until: "'instances' in mastersout and mastersout.instances|length > 0"
 
 - name: fetch new node instances
-  ec2_remote_facts:
+  ec2_instance_facts:
     region: "{{ openshift_aws_region }}"
     filters:
       "{{ {'tag:kubernetes.io/cluster/' ~ openshift_aws_clusterid: openshift_aws_clusterid,
@@ -22,9 +22,14 @@
   delay: 3
   until: "'instances' in instancesout and instancesout.instances|length > 0"
 
-- debug:
+- name: Dump the private dns names
+  debug:
     msg: "{{ instancesout.instances|map(attribute='private_dns_name') | list }}"
 
+- name: Dump the master public ip address
+  debug:
+    msg: "{{ mastersout.instances[0].public_ip_address }}"
+
 - name: approve nodes
   oc_adm_csr:
     #approve_all: True

+ 29 - 2
roles/openshift_aws/tasks/build_node_group.yml

@@ -1,6 +1,4 @@
 ---
-# This task file expects l_nodes_to_build to be passed in.
-
 # When openshift_aws_use_custom_ami is '' then
 # we retrieve the latest build AMI.
 # Then set openshift_aws_ami to the ami.
@@ -26,6 +24,35 @@
 # Need to set epoch time in one place to use for launch_config and scale_group
 - set_fact:
     l_epoch_time: "{{ ansible_date_time.epoch }}"
+#
+# query asg's and determine if we need to create the others.
+# if we find more than 1 for each type, then exit
+- name: query all asg's for this cluster
+  ec2_asg_facts:
+    region: "{{ openshift_aws_region }}"
+    tags: "{{ {'kubernetes.io/cluster/' ~ openshift_aws_clusterid: openshift_aws_clusterid} | combine(l_node_group_config[openshift_aws_node_group.group].tags) }}"
+  register: asgs
+
+- fail:
+    msg: "Found more than 1 auto scaling group that matches the query for group: {{ openshift_aws_node_group }}"
+  when:
+  - asgs.results|length > 1
+
+- debug:
+    msg: "{{ asgs }}"
+
+- name: set the value for the deployment_serial and the current asgs
+  set_fact:
+    l_deployment_serial: "{{  openshift_aws_node_group_deployment_serial if openshift_aws_node_group_deployment_serial is defined else asgs.results | scale_groups_serial(openshift_aws_node_group_upgrade) }}"
+    openshift_aws_current_asgs: "{{ asgs.results | map(attribute='auto_scaling_group_name') | list | union(openshift_aws_current_asgs) }}"
+
+- name: dump deployment serial
+  debug:
+    msg: "Deployment serial: {{ l_deployment_serial }}"
+
+- name: dump current_asgs
+  debug:
+    msg: "openshift_aws_current_asgs: {{ openshift_aws_current_asgs }}"
 
 - when: openshift_aws_create_iam_role
   include_tasks: iam_role.yml

+ 6 - 8
roles/openshift_aws/tasks/iam_role.yml

@@ -13,11 +13,10 @@
 #####
 - name: Create an iam role
   iam_role:
-    name: "{{ item.value.iam_role }}"
+    name: "{{ l_node_group_config[openshift_aws_node_group.group].iam_role }}"
     assume_role_policy_document: "{{ lookup('file','trustpolicy.json') }}"
     state: "{{ openshift_aws_iam_role_state | default('present') }}"
-  when: item.value.iam_role is defined
-  with_dict: "{{ l_nodes_to_build }}"
+  when: l_node_group_config[openshift_aws_node_group.group].iam_role is defined
 
 #####
 # The second part of this task file is linking the role to a policy
@@ -28,9 +27,8 @@
 - name: create an iam policy
   iam_policy:
     iam_type: role
-    iam_name: "{{ item.value.iam_role }}"
-    policy_json: "{{ item.value.policy_json }}"
-    policy_name: "{{ item.value.policy_name }}"
+    iam_name: "{{ l_node_group_config[openshift_aws_node_group.group].iam_role }}"
+    policy_json: "{{ l_node_group_config[openshift_aws_node_group.group].policy_json }}"
+    policy_name: "{{ l_node_group_config[openshift_aws_node_group.group].policy_name }}"
     state: "{{ openshift_aws_iam_role_state | default('present') }}"
-  when: item.value.iam_role is defined
-  with_dict: "{{ l_nodes_to_build }}"
+  when: "'iam_role' in l_node_group_config[openshift_aws_node_group.group]"

+ 24 - 13
roles/openshift_aws/tasks/launch_config.yml

@@ -1,15 +1,26 @@
 ---
-- fail:
-    msg: "Ensure that an AMI value is defined for openshift_aws_ami or openshift_aws_launch_config_custom_image."
-  when:
-  - openshift_aws_ami is undefined
+- name: fetch the security groups for launch config
+  ec2_group_facts:
+    filters:
+      group-name: "{{ openshift_aws_launch_config_security_groups[openshift_aws_node_group.group] }}"
+      vpc-id: "{{ vpcout.vpcs[0].id }}"
+    region: "{{ openshift_aws_region }}"
+  register: ec2sgs
 
-- fail:
-    msg: "Ensure that openshift_deployment_type is defined."
-  when:
-  - openshift_deployment_type is undefined
-
-- include_tasks: launch_config_create.yml
-  with_dict: "{{ l_nodes_to_build }}"
-  loop_control:
-    loop_var: launch_config_item
+# Create the scale group config
+- name: Create the node scale group launch config
+  ec2_lc:
+    name: "{{ openshift_aws_node_group.name }}-{{ openshift_aws_ami_map[openshift_aws_node_group.group] | default(openshift_aws_ami) }}-{{ l_epoch_time }}"
+    region: "{{ openshift_aws_region }}"
+    image_id: "{{ openshift_aws_ami_map[openshift_aws_node_group.group] | default(openshift_aws_ami) }}"
+    instance_type: "{{ l_node_group_config[openshift_aws_node_group.group].instance_type }}"
+    security_groups: "{{ openshift_aws_launch_config_security_group_id  | default(ec2sgs.security_groups | map(attribute='group_id')| list) }}"
+    instance_profile_name: "{{ l_node_group_config[openshift_aws_node_group.group].iam_role if l_node_group_config[openshift_aws_node_group.group].iam_role is defined and
+                                                                           l_node_group_config[openshift_aws_node_group.group].iam_role != '' and
+                                                                           openshift_aws_create_iam_role
+                                                                        else omit }}"
+    user_data: "{{ lookup('template', 'user_data.j2') }}"
+    key_name: "{{ openshift_aws_ssh_key_name }}"
+    ebs_optimized: False
+    volumes: "{{ l_node_group_config[openshift_aws_node_group.group].volumes }}"
+    assign_public_ip: True

+ 0 - 26
roles/openshift_aws/tasks/launch_config_create.yml

@@ -1,26 +0,0 @@
----
-- name: fetch the security groups for launch config
-  ec2_group_facts:
-    filters:
-      group-name: "{{ l_launch_config_security_groups[launch_config_item.key] }}"
-      vpc-id: "{{ vpcout.vpcs[0].id }}"
-    region: "{{ openshift_aws_region }}"
-  register: ec2sgs
-
-# Create the scale group config
-- name: Create the node scale group launch config
-  ec2_lc:
-    name: "{{ openshift_aws_launch_config_basename }}-{{ launch_config_item.key }}{{'-' ~ openshift_aws_new_version if openshift_aws_new_version != '' else '' }}"
-    region: "{{ openshift_aws_region }}"
-    image_id: "{{ l_aws_ami_map[launch_config_item.key] | default(openshift_aws_ami) }}"
-    instance_type: "{{ launch_config_item.value.instance_type }}"
-    security_groups: "{{ openshift_aws_launch_config_security_group_id  | default(ec2sgs.security_groups | map(attribute='group_id')| list) }}"
-    instance_profile_name: "{{ launch_config_item.value.iam_role if launch_config_item.value.iam_role is defined and
-                                                                    launch_config_item.value.iam_role != '' and
-                                                                    openshift_aws_create_iam_role
-                                                                 else omit }}"
-    user_data: "{{ lookup('template', 'user_data.j2') }}"
-    key_name: "{{ openshift_aws_ssh_key_name }}"
-    ebs_optimized: False
-    volumes: "{{ launch_config_item.value.volumes }}"
-    assign_public_ip: True

+ 5 - 4
roles/openshift_aws/tasks/provision.yml

@@ -20,13 +20,14 @@
 
 - name: include scale group creation for master
   include_tasks: build_node_group.yml
+  with_items: "{{ openshift_aws_master_group }}"
   vars:
-    l_nodes_to_build: "{{ openshift_aws_master_group_config }}"
-    l_launch_config_security_groups: "{{ openshift_aws_launch_config_security_groups }}"
-    l_aws_ami_map: "{{ openshift_aws_ami_map }}"
+    l_node_group_config: "{{ openshift_aws_master_group_config }}"
+  loop_control:
+    loop_var: openshift_aws_node_group
 
 - name: fetch newly created instances
-  ec2_remote_facts:
+  ec2_instance_facts:
     region: "{{ openshift_aws_region }}"
     filters:
       "tag:clusterid": "{{ openshift_aws_clusterid }}"

+ 1 - 1
roles/openshift_aws/tasks/provision_instance.yml

@@ -27,7 +27,7 @@
       Name: "{{ openshift_aws_base_ami_name }}"
 
 - name: fetch newly created instances
-  ec2_remote_facts:
+  ec2_instance_facts:
     region: "{{ openshift_aws_region }}"
     filters:
       "tag:Name": "{{ openshift_aws_base_ami_name }}"

+ 7 - 12
roles/openshift_aws/tasks/provision_nodes.yml

@@ -3,7 +3,7 @@
 # bootstrap should be created on first master
 # need to fetch it and shove it into cloud data
 - name: fetch master instances
-  ec2_remote_facts:
+  ec2_instance_facts:
     region: "{{ openshift_aws_region }}"
     filters:
       "tag:clusterid": "{{ openshift_aws_clusterid }}"
@@ -31,20 +31,15 @@
 
 - name: include build compute and infra node groups
   include_tasks: build_node_group.yml
+  with_items: "{{ openshift_aws_node_groups }}"
   vars:
-    l_nodes_to_build: "{{ openshift_aws_node_group_config }}"
-    l_launch_config_security_groups: "{{ openshift_aws_launch_config_security_groups }}"
-    l_aws_ami_map: "{{ openshift_aws_ami_map }}"
-
-- name: include build node group for extra nodes
-  include_tasks: build_node_group.yml
-  when: openshift_aws_node_group_config_extra is defined
-  vars:
-    l_nodes_to_build: "{{ openshift_aws_node_group_config_extra | default({}) }}"
-    l_launch_config_security_groups: "{{ openshift_aws_launch_config_security_groups_extra }}"
-    l_aws_ami_map: "{{ openshift_aws_ami_map_extra }}"
+    l_node_group_config: "{{ openshift_aws_node_group_config }}"
+  loop_control:
+    loop_var: openshift_aws_node_group
 
 # instances aren't scaling fast enough here, we need to wait for them
 - when: openshift_aws_wait_for_ssh | bool
   name: wait for our new nodes to come up
   include_tasks: wait_for_groups.yml
+  vars:
+    created_asgs: "{{ openshift_aws_created_asgs }}"

+ 6 - 3
roles/openshift_aws/tasks/remove_scale_group.yml

@@ -1,10 +1,13 @@
 ---
+# FIGURE OUT HOW TO REMOVE SCALE GROUPS
+# use openshift_aws_current_asgs??
 - name: fetch the scale groups
   ec2_asg_facts:
     region: "{{ openshift_aws_region }}"
+    name: "^{{ item }}$"
     tags:
-      "{{ {'kubernetes.io/cluster/' ~ openshift_aws_clusterid: openshift_aws_clusterid,
-           'version': openshift_aws_current_version} }}"
+      "{{ {'kubernetes.io/cluster/' ~ openshift_aws_clusterid: openshift_aws_clusterid} }}"
+  with_items: "{{ openshift_aws_current_asgs if openshift_aws_current_asgs != [] else openshift_aws_asgs_to_remove }}"
   register: qasg
 
 - name: remove non-master scale groups
@@ -14,7 +17,7 @@
     name: "{{ item.auto_scaling_group_name }}"
   when: "'master'  not in item.auto_scaling_group_name"
   register: asg_results
-  with_items: "{{ qasg.results }}"
+  with_items: "{{ qasg | json_query('results[*]') | sum(attribute='results', start=[]) }}"
   async: 600
   poll: 0
 

+ 1 - 1
roles/openshift_aws/tasks/s3.yml

@@ -1,6 +1,6 @@
 ---
 - name: Create an s3 bucket
-  s3:
+  aws_s3:
     bucket: "{{ openshift_aws_s3_bucket_name }}"
     mode: "{{ openshift_aws_s3_mode }}"
     region: "{{ openshift_aws_region }}"

+ 23 - 13
roles/openshift_aws/tasks/scale_group.yml

@@ -1,20 +1,30 @@
 ---
+- name: set node group name
+  set_fact:
+    l_node_group_name: "{{ openshift_aws_node_group.name }} {{ l_deployment_serial }}"
+
 - name: Create the scale group
   ec2_asg:
-    name: "{{ openshift_aws_scale_group_basename }} {{ item.key }}"
-    launch_config_name: "{{ openshift_aws_launch_config_basename }}-{{ item.key }}{{ '-' ~ openshift_aws_new_version if openshift_aws_new_version != '' else '' }}"
-    health_check_period: "{{ item.value.health_check.period }}"
-    health_check_type: "{{ item.value.health_check.type }}"
-    min_size: "{{ item.value.min_size }}"
-    max_size: "{{ item.value.max_size }}"
-    desired_capacity: "{{ item.value.desired_size }}"
+    name: "{{ l_node_group_name }}"
+    launch_config_name: "{{ openshift_aws_node_group.name }}-{{ openshift_aws_ami_map[openshift_aws_node_group.group] | default(openshift_aws_ami) }}-{{ l_epoch_time }}"
+    health_check_period: "{{ l_node_group_config[openshift_aws_node_group.group].health_check.period }}"
+    health_check_type: "{{ l_node_group_config[openshift_aws_node_group.group].health_check.type }}"
+    min_size: "{{ l_node_group_config[openshift_aws_node_group.group].min_size }}"
+    max_size: "{{ l_node_group_config[openshift_aws_node_group.group].max_size }}"
+    desired_capacity: "{{ l_node_group_config[openshift_aws_node_group.group].desired_size }}"
     region: "{{ openshift_aws_region }}"
-    termination_policies: "{{ item.value.termination_policy if 'termination_policy' in  item.value else omit }}"
-    load_balancers: "{{ item.value.elbs if 'elbs' in item.value else omit }}"
-    wait_for_instances: "{{ item.value.wait_for_instances | default(False)}}"
+    termination_policies: "{{ l_node_group_config[openshift_aws_node_group.group].termination_policy if 'termination_policy' in  l_node_group_config[openshift_aws_node_group.group] else omit }}"
+    load_balancers: "{{ l_node_group_config[openshift_aws_node_group.group].elbs if 'elbs' in l_node_group_config[openshift_aws_node_group.group] else omit }}"
+    wait_for_instances: "{{ l_node_group_config[openshift_aws_node_group.group].wait_for_instances | default(False)}}"
     vpc_zone_identifier: "{{ subnetout.subnets[0].id }}"
     replace_instances: "{{ openshift_aws_node_group_replace_instances if openshift_aws_node_group_replace_instances != [] else omit }}"
-    replace_all_instances: "{{ omit if openshift_aws_node_group_replace_instances != [] else (item.value.replace_all_instances | default(omit)) }}"
+    replace_all_instances: "{{ omit if openshift_aws_node_group_replace_instances != []
+                                    else (l_node_group_config[openshift_aws_node_group.group].replace_all_instances | default(omit)) }}"
     tags:
-    - "{{ openshift_aws_node_group_config_tags | combine(item.value.tags) }}"
-  with_dict: "{{ l_nodes_to_build }}"
+    - "{{ openshift_aws_node_group_config_tags
+          | combine(l_node_group_config[openshift_aws_node_group.group].tags)
+          | combine({'deployment_serial': l_deployment_serial, 'ami': openshift_aws_ami_map[openshift_aws_node_group.group] | default(openshift_aws_ami)}) }}"
+
+- name: append the asg name to the openshift_aws_created_asgs fact
+  set_fact:
+    openshift_aws_created_asgs: "{{ [l_node_group_name] | union(openshift_aws_created_asgs) | list }}"

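An illustration of the naming scheme the reworked scale_group.yml produces: the auto scaling group name is now the node group name plus the deployment serial, and the serial and AMI are written into the tags in place of the old version suffix. All values below (clusterid, AMI id, tag subset) are hypothetical.

node_group = {'name': 'mycluster compute group', 'group': 'compute'}  # one entry of openshift_aws_node_groups
l_deployment_serial = 2                                               # from the scale_groups_serial filter
ami = 'ami-0123456789abcdef0'                                         # hypothetical AMI id

asg_name = "%s %s" % (node_group['name'], l_deployment_serial)
tags = {
    # a subset of the group's config tags, combined with the new keys
    'host-type': 'node',
    'sub-host-type': 'compute',
    'deployment_serial': l_deployment_serial,
    'ami': ami,
}
print(asg_name)  # mycluster compute group 2
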
+ 3 - 3
roles/openshift_aws/tasks/seal_ami.yml

@@ -1,6 +1,6 @@
 ---
 - name: fetch newly created instances
-  ec2_remote_facts:
+  ec2_instance_facts:
     region: "{{ openshift_aws_region }}"
     filters:
       "tag:Name": "{{ openshift_aws_base_ami_name }}"
@@ -12,7 +12,7 @@
 
 - name: bundle ami
   ec2_ami:
-    instance_id: "{{ instancesout.instances.0.id }}"
+    instance_id: "{{ instancesout.instances.0.instance_id }}"
     region: "{{ openshift_aws_region }}"
     state: present
     description: "This was provisioned {{ ansible_date_time.iso8601 }}"
@@ -46,4 +46,4 @@
   ec2:
     state: absent
     region: "{{ openshift_aws_region }}"
-    instance_ids: "{{ instancesout.instances.0.id }}"
+    instance_ids: "{{ instancesout.instances.0.instance_id }}"

+ 23 - 7
roles/openshift_aws/tasks/security_group.yml

@@ -6,11 +6,27 @@
       "tag:Name": "{{ openshift_aws_clusterid }}"
   register: vpcout
 
-- include_tasks: security_group_create.yml
-  vars:
-    l_security_groups: "{{ openshift_aws_node_security_groups }}"
+- name: create the node group sgs
+  oo_ec2_group:
+    name: "{{ item.value.name}}"
+    description: "{{ item.value.desc }}"
+    rules: "{{ item.value.rules if 'rules' in item.value else [] }}"
+    region: "{{ openshift_aws_region }}"
+    vpc_id: "{{ vpcout.vpcs[0].id }}"
+  with_dict: "{{ openshift_aws_node_security_groups }}"
+
+- name: create the k8s sgs for the node group
+  oo_ec2_group:
+    name: "{{ item.value.name }}_k8s"
+    description: "{{ item.value.desc }} for k8s"
+    region: "{{ openshift_aws_region }}"
+    vpc_id: "{{ vpcout.vpcs[0].id }}"
+  with_dict: "{{ openshift_aws_node_security_groups }}"
+  register: k8s_sg_create
 
-- include_tasks: security_group_create.yml
-  when: openshift_aws_node_security_groups_extra is defined
-  vars:
-    l_security_groups: "{{ openshift_aws_node_security_groups_extra | default({}) }}"
+- name: tag sg groups with proper tags
+  ec2_tag:
+    tags: "{{ openshift_aws_security_groups_tags }}"
+    resource: "{{ item.group_id }}"
+    region: "{{ openshift_aws_region }}"
+  with_items: "{{ k8s_sg_create.results }}"

+ 0 - 25
roles/openshift_aws/tasks/security_group_create.yml

@@ -1,25 +0,0 @@
----
-- name: create the node group sgs
-  oo_ec2_group:
-    name: "{{ item.value.name}}"
-    description: "{{ item.value.desc }}"
-    rules: "{{ item.value.rules if 'rules' in item.value else [] }}"
-    region: "{{ openshift_aws_region }}"
-    vpc_id: "{{ vpcout.vpcs[0].id }}"
-  with_dict: "{{ l_security_groups }}"
-
-- name: create the k8s sgs for the node group
-  oo_ec2_group:
-    name: "{{ item.value.name }}_k8s"
-    description: "{{ item.value.desc }} for k8s"
-    region: "{{ openshift_aws_region }}"
-    vpc_id: "{{ vpcout.vpcs[0].id }}"
-  with_dict: "{{ l_security_groups }}"
-  register: k8s_sg_create
-
-- name: tag sg groups with proper tags
-  ec2_tag:
-    tags: "{{ openshift_aws_security_groups_tags }}"
-    resource: "{{ item.group_id }}"
-    region: "{{ openshift_aws_region }}"
-  with_items: "{{ k8s_sg_create.results }}"

+ 4 - 2
roles/openshift_aws/tasks/setup_master_group.yml

@@ -8,7 +8,7 @@
     msg: "openshift_aws_region={{ openshift_aws_region }}"
 
 - name: fetch newly created instances
-  ec2_remote_facts:
+  ec2_instance_facts:
     region: "{{ openshift_aws_region }}"
     filters:
       "tag:clusterid": "{{ openshift_aws_clusterid }}"
@@ -19,11 +19,13 @@
   delay: 3
   until: instancesout.instances|length > 0
 
+- debug: var=instancesout
+
 - name: add new master to masters group
   add_host:
     groups: "{{ openshift_aws_masters_groups }}"
     name: "{{ item.public_dns_name }}"
-    hostname: "{{ openshift_aws_clusterid }}-master-{{ item.id[:-5] }}"
+    hostname: "{{ openshift_aws_clusterid }}-master-{{ item.instance_id[:-5] }}"
   with_items: "{{ instancesout.instances }}"
 
 - name: wait for ssh to become available

+ 17 - 5
roles/openshift_aws/tasks/setup_scale_group_facts.yml

@@ -1,11 +1,15 @@
 ---
-- name: group scale group nodes
-  ec2_remote_facts:
+- name: fetch all created instances
+  ec2_instance_facts:
     region: "{{ openshift_aws_region }}"
     filters:
-      "{{ {'tag:kubernetes.io/cluster/' ~ openshift_aws_clusterid: openshift_aws_clusterid }}}"
+      "{{ {'tag:kubernetes.io/cluster/' ~ openshift_aws_clusterid: openshift_aws_clusterid,
+           'instance-state-name': 'running'} }}"
   register: qinstances
 
+# The building of new and current groups is dependent of having a list of the current asgs and the created ones
+# that can be found in the variables: openshift_aws_created_asgs, openshift_aws_current_asgs.  If these do not exist, we cannot determine which hosts are
+# new and which hosts are current.
 - name: Build new node group
   add_host:
     groups: oo_sg_new_nodes
@@ -13,10 +17,16 @@
     name: "{{ item.public_dns_name }}"
     hostname: "{{ item.public_dns_name }}"
   when:
-  - (item.tags.version | default(False)) == openshift_aws_new_version
+  - openshift_aws_created_asgs != []
+  - "'aws:autoscaling:groupName' in item.tags"
+  - item.tags['aws:autoscaling:groupName'] in openshift_aws_created_asgs
   - "'node' in item.tags['host-type']"
   with_items: "{{ qinstances.instances }}"
 
+- name: dump openshift_aws_current_asgs
+  debug:
+    msg: "{{ openshift_aws_current_asgs }}"
+
 - name: Build current node group
   add_host:
     groups: oo_sg_current_nodes
@@ -24,7 +34,9 @@
     name: "{{ item.public_dns_name }}"
     hostname: "{{ item.public_dns_name }}"
   when:
-  - (item.tags.version | default('')) == openshift_aws_current_version
+  - openshift_aws_current_asgs != []
+  - "'aws:autoscaling:groupName' in item.tags"
+  - item.tags['aws:autoscaling:groupName'] in openshift_aws_current_asgs
   - "'node' in item.tags['host-type']"
   with_items: "{{ qinstances.instances }}"
 

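The new grouping logic no longer compares a version tag; it matches each instance's aws:autoscaling:groupName tag against the openshift_aws_created_asgs / openshift_aws_current_asgs lists. A standalone sketch of that partition, with made-up instance data:

created_asgs = ['default compute group 2']
current_asgs = ['default compute group 1']

instances = [
    {'public_dns_name': 'ec2-a.example.com',
     'tags': {'host-type': 'node', 'aws:autoscaling:groupName': 'default compute group 2'}},
    {'public_dns_name': 'ec2-b.example.com',
     'tags': {'host-type': 'node', 'aws:autoscaling:groupName': 'default compute group 1'}},
]

def in_group(instance, asgs):
    # Mirrors the add_host when conditions: non-empty asg list, groupName tag
    # present and matching, and a node host-type.
    tags = instance['tags']
    return (asgs != []
            and 'aws:autoscaling:groupName' in tags
            and tags['aws:autoscaling:groupName'] in asgs
            and 'node' in tags.get('host-type', ''))

oo_sg_new_nodes = [i['public_dns_name'] for i in instances if in_group(i, created_asgs)]
oo_sg_current_nodes = [i['public_dns_name'] for i in instances if in_group(i, current_asgs)]
print(oo_sg_new_nodes)      # ['ec2-a.example.com']
print(oo_sg_current_nodes)  # ['ec2-b.example.com']
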
+ 14 - 4
roles/openshift_aws/tasks/upgrade_node_group.yml

@@ -1,12 +1,22 @@
 ---
-- fail:
-    msg: 'Please ensure the current_version and new_version variables are not the same.'
+- include_tasks: provision_nodes.yml
+  vars:
+    openshift_aws_node_group_upgrade: True
   when:
-  - openshift_aws_current_version == openshift_aws_new_version
+  - openshift_aws_upgrade_provision_nodes | default(True)
 
-- include_tasks: provision_nodes.yml
+- debug: var=openshift_aws_current_asgs
+- debug: var=openshift_aws_created_asgs
+
+- name: fail if asg variables aren't set
+  fail:
+    msg: "Please ensure that openshift_aws_created_asgs and openshift_aws_current_asgs are defined."
+  when:
+  - openshift_aws_created_asgs == []
+  - openshift_aws_current_asgs == []
 
 - include_tasks: accept_nodes.yml
+  when: openshift_aws_upgrade_accept_nodes | default(True)
 
 - include_tasks: setup_scale_group_facts.yml
 

+ 16 - 6
roles/openshift_aws/tasks/wait_for_groups.yml

@@ -1,31 +1,41 @@
 ---
 # The idea here is to wait until all scale groups are at
 # their desired capacity before continuing.
-- name: fetch the scale groups
+#  This is accomplished with a custom filter_plugin and until clause
+- name: "fetch the scale groups"
   ec2_asg_facts:
     region: "{{ openshift_aws_region }}"
     tags:
-      "{{ {'kubernetes.io/cluster/' ~ openshift_aws_clusterid: openshift_aws_clusterid} }}"
+      "{{ {'kubernetes.io/cluster/' ~ openshift_aws_clusterid: openshift_aws_clusterid } }}"
   register: qasg
-  until: qasg.results | scale_groups_match_capacity | bool
+  until: qasg | json_query('results[*]') | scale_groups_match_capacity | bool
   delay: 10
   retries: 60
 
+- debug: var=openshift_aws_created_asgs
+
+# how do we gaurantee the instances are up?
 - name: fetch newly created instances
-  ec2_remote_facts:
+  ec2_instance_facts:
     region: "{{ openshift_aws_region }}"
     filters:
       "{{ {'tag:kubernetes.io/cluster/' ~ openshift_aws_clusterid: openshift_aws_clusterid,
-           'tag:version': openshift_aws_new_version} }}"
+           'tag:aws:autoscaling:groupName': item,
+           'instance-state-name': 'running'} }}"
+  with_items: "{{ openshift_aws_created_asgs if openshift_aws_created_asgs != [] else qasg | sum(attribute='results', start=[]) }}"
   register: instancesout
   until: instancesout.instances|length > 0
   delay: 5
   retries: 60
 
+- name: dump instances
+  debug:
+    msg: "{{ instancesout.results | sum(attribute='instances', start=[]) }}"
+
 - name: wait for ssh to become available
   wait_for:
     port: 22
     host: "{{ item.public_ip_address }}"
     timeout: 300
     search_regex: OpenSSH
-  with_items: "{{ instancesout.instances }}"
+  with_items: "{{ instancesout.results | sum(attribute='instances', start=[]) }}"

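For reference, the until clause above keeps polling ec2_asg_facts until scale_groups_match_capacity is satisfied. That filter's body is not shown in this diff, so the sketch below is only a guess at the check implied by its docstring (instance count equals desired capacity for every group); the sample ASG facts are made up.

def scale_groups_match_capacity(scale_group_info):
    # Presumed check: every scale group has as many instances as it desires.
    return all(len(sg.get('instances', [])) == sg['desired_capacity']
               for sg in scale_group_info)

asgs = [
    {'auto_scaling_group_name': 'default compute group 2',
     'desired_capacity': 3, 'instances': [{}, {}, {}]},
    {'auto_scaling_group_name': 'default infra group 2',
     'desired_capacity': 2, 'instances': [{}]},
]
print(scale_groups_match_capacity(asgs))  # False -> keep polling (delay 10, retries 60)
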
+ 3 - 3
roles/openshift_aws/templates/user_data.j2

@@ -7,8 +7,8 @@ write_files:
   owner: 'root:root'
   permissions: '0640'
   content: |
-    openshift_group_type: {{ launch_config_item.key }}
-{%   if launch_config_item.key != 'master' %}
+    openshift_group_type: {{ openshift_aws_node_group.group }}
+{%   if openshift_aws_node_group.group != 'master' %}
 - path: /etc/origin/node/bootstrap.kubeconfig
   owner: 'root:root'
   permissions: '0640'
@@ -19,7 +19,7 @@ runcmd:
 {%     if openshift_aws_node_run_bootstrap_startup %}
 - [ ansible-playbook, /root/openshift_bootstrap/bootstrap.yml]
 {%     endif %}
-{%     if launch_config_item.key != 'master' %}
+{%     if openshift_aws_node_group.group != 'master' %}
 - [ systemctl, restart, NetworkManager]
 - [ systemctl, enable, {% if openshift_deployment_type == 'openshift-enterprise' %}atomic-openshift{% else %}origin{% endif %}-node]
 - [ systemctl, start, {% if openshift_deployment_type == 'openshift-enterprise' %}atomic-openshift{% else %}origin{% endif %}-node]