Browse Source

Merge pull request #9711 from mgugino-upstream-stage/csr-approve-refactor

refactor csr approvals
Scott Dodson 6 years ago
parent
commit
6016dea184

+ 8 - 16
playbooks/openshift-node/private/join.yml

@@ -37,25 +37,17 @@
     debug:
       msg: "{{ l_nodes_to_join }}"
 
-  - name: Approve bootstrap nodes
-    oc_adm_csr:
-      nodes: "{{ l_nodes_to_join }}"
-      timeout: 60
-      fail_on_timeout: true
-    register: approve_out
-    ignore_errors: true
+  - name: Approve node certificates when bootstrapping
+    oc_csr_approve:
+      oc_bin: "{{ openshift_client_binary }}"
+      oc_conf: "{{ openshift.common.config_base }}/master/admin.kubeconfig"
+      node_list: "{{ l_nodes_to_join }}"
+    register: node_bootstrap_csr_approve
+    retries: 30
+    until: node_bootstrap_csr_approve is succeeded
     when:
     - l_nodes_to_join|length > 0
 
-  - when: approve_out is failed
-    block:
-    - name: Get CSRs
-      command: >
-        {{ openshift_client_binary }} describe csr --config={{ openshift.common.config_base }}/master/admin.kubeconfig
-    - name: Report approval errors
-      fail:
-        msg: Node approval failed
-
 - name: Ensure any inventory labels are applied to the nodes
   hosts: oo_nodes_to_config
   vars:

+ 306 - 0
roles/lib_openshift/library/oc_csr_approve.py

@@ -0,0 +1,306 @@
+#!/usr/bin/env python
+'''oc_csr_approve module'''
+# Copyright 2018 Red Hat, Inc. and/or its affiliates
+# and other contributors as indicated by the @author tags.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import json
+
+from ansible.module_utils.basic import AnsibleModule
+
+try:
+    from json.decoder import JSONDecodeError
+except ImportError:
+    JSONDecodeError = ValueError
+
+DOCUMENTATION = '''
+---
+module: oc_csr_approve
+
+short_description: Retrieve, approve, and verify node client csrs
+
+version_added: "2.4"
+
+description:
+    - Runs various commands to list csrs, approve csrs, and verify nodes are
+      ready.
+
+author:
+    - "Michael Gugino <mgugino@redhat.com>"
+'''
+
+EXAMPLES = '''
+# Approve any pending client/server CSRs for the listed nodes
+- name: Approve node certificates when bootstrapping
+  oc_csr_approve:
+    oc_bin: "/usr/bin/oc"
+    oc_conf: "/etc/origin/master/admin.kubeconfig"
+    node_list: ['node1.example.com', 'node2.example.com']
+'''
+
+CERT_MODE = {'client': 'client auth', 'server': 'server auth'}
+
+
+def run_command(module, command, rc_opts=None):
+    '''Run a command using AnsibleModule.run_command, or fail'''
+    if rc_opts is None:
+        rc_opts = {}
+    rtnc, stdout, err = module.run_command(command, **rc_opts)
+    if rtnc:
+        result = {'failed': True,
+                  'changed': False,
+                  'msg': str(err),
+                  'state': 'unknown'}
+        module.fail_json(**result)
+    return stdout
+
+
+def get_ready_nodes(module, oc_bin, oc_conf):
+    '''Get list of nodes currently ready via oc'''
+    # json output is necessary for consistency here.
+    command = "{} {} get nodes -ojson".format(oc_bin, oc_conf)
+    stdout = run_command(module, command)
+
+    try:
+        data = json.loads(stdout)
+    except JSONDecodeError as err:
+        result = {'failed': True,
+                  'changed': False,
+                  'msg': str(err),
+                  'state': 'unknown'}
+        module.fail_json(**result)
+
+    ready_nodes = []
+    for node in data['items']:
+        if node.get('status') and node['status'].get('conditions'):
+            for condition in node['status']['conditions']:
+                # "True" is a string here, not a boolean.
+                if condition['type'] == "Ready" and condition['status'] == 'True':
+                    ready_nodes.append(node['metadata']['name'])
+    return ready_nodes
+
+
+def get_csrs(module, oc_bin, oc_conf):
+    '''Retrieve csrs from cluster using oc get csr -ojson'''
+    command = "{} {} get csr -ojson".format(oc_bin, oc_conf)
+    stdout = run_command(module, command)
+    try:
+        data = json.loads(stdout)
+    except JSONDecodeError as err:
+        result = {'failed': True,
+                  'changed': False,
+                  'msg': str(err),
+                  'state': 'unknown'}
+        module.fail_json(**result)
+    return data['items']
+
+
+def parse_subject_cn(subject_str):
+    '''parse output of openssl req -noout -subject to retrieve CN.
+       example input:
+         'subject=/C=US/CN=test.io/L=Raleigh/O=Red Hat/ST=North Carolina/OU=OpenShift\n'
+         or
+         'subject=C = US, CN = test.io, L = City, O = Company, ST = State, OU = Dept\n'
+       example output: 'test.io'
+    '''
+    stripped_string = subject_str[len('subject='):].strip()
+    kv_strings = [x.strip() for x in stripped_string.split(',')]
+    if len(kv_strings) == 1:
+        kv_strings = [x.strip() for x in stripped_string.split('/')][1:]
+    for item in kv_strings:
+        item_parts = [x.strip() for x in item.split('=')]
+        if item_parts[0] == 'CN':
+            return item_parts[1]
+
+
+def process_csrs(module, csrs, node_list, mode):
+    '''Return a dictionary of pending csrs where the format of the dict is
+       k=csr name, v=Subject Common Name'''
+    csr_dict = {}
+    for item in csrs:
+        status = item['status'].get('conditions')
+        if status:
+            # If status is not an empty dictionary, cert is not pending.
+            continue
+        if CERT_MODE[mode] not in item['spec']['usages']:
+            continue
+        name = item['metadata']['name']
+        request_data = base64.b64decode(item['spec']['request'])
+        command = "openssl req -noout -subject"
+        # ansible's module.run_command accepts data to pipe via stdin as
+        # a 'data' kwarg.
+        rc_opts = {'data': request_data, 'binary_data': True}
+        stdout = run_command(module, command, rc_opts=rc_opts)
+        # parse common_name from subject string.
+        common_name = parse_subject_cn(stdout)
+        if common_name and common_name.startswith('system:node:'):
+            # common name is typically prepended with system:node:.
+            common_name = common_name.split('system:node:')[1]
+        # we only want to approve csrs from nodes we know about.
+        if common_name in node_list:
+            csr_dict[name] = common_name
+
+    return csr_dict
+
+
+def confirm_needed_requests_present(module, not_ready_nodes, csr_dict):
+    '''Ensure all non-Ready nodes have a csr, or fail'''
+    nodes_needed = set(not_ready_nodes)
+    for _, val in csr_dict.items():
+        nodes_needed.discard(val)
+
+    # check that we found all of our needed nodes
+    if nodes_needed:
+        missing_nodes = ', '.join(nodes_needed)
+        result = {'failed': True,
+                  'changed': False,
+                  'msg': "Could not find csr for nodes: {}".format(missing_nodes),
+                  'state': 'unknown'}
+        module.fail_json(**result)
+
+
+def approve_csrs(module, oc_bin, oc_conf, csr_pending_list, mode):
+    '''Loop through csr_pending_list and call:
+       oc adm certificate approve <item>'''
+    res_mode = "{}_approve_results".format(mode)
+    base_command = "{} {} adm certificate approve {}"
+    approve_results = []
+    for csr in csr_pending_list:
+        command = base_command.format(oc_bin, oc_conf, csr)
+        rtnc, stdout, err = module.run_command(command)
+        approve_results.append(stdout)
+        if rtnc:
+            result = {'failed': True,
+                      'changed': False,
+                      'msg': str(err),
+                      res_mode: approve_results,
+                      'state': 'unknown'}
+            module.fail_json(**result)
+    return approve_results
+
+
+def get_ready_nodes_server(module, oc_bin, oc_conf, nodes_list):
+    '''Determine which nodes have working server certificates'''
+    ready_nodes_server = []
+    base_command = "{} {} get --raw /api/v1/nodes/{}/proxy/healthz"
+    for node in nodes_list:
+        # need this to look like /api/v1/nodes/<node>/proxy/healthz
+        command = base_command.format(oc_bin, oc_conf, node)
+        rtnc, _, _ = module.run_command(command)
+        if not rtnc:
+            # if we can hit that api endpoint, the node has a valid server
+            # cert.
+            ready_nodes_server.append(node)
+    return ready_nodes_server
+
+
+def verify_server_csrs(module, result, oc_bin, oc_conf, node_list):
+    '''We approved some server csrs, now we need to validate they are working.
+       This function will attempt to retry 10 times in case of failure.'''
+    # Attempt to try node endpoints a few times.
+    attempts = 0
+    # Find not_ready_nodes for server-side again
+    nodes_server_ready = get_ready_nodes_server(module, oc_bin, oc_conf,
+                                                node_list)
+    # Create list of nodes that still aren't ready.
+    not_ready_nodes_server = set([item for item in node_list if item not in nodes_server_ready])
+    while not_ready_nodes_server:
+        nodes_server_ready = get_ready_nodes_server(module, oc_bin, oc_conf,
+                                                    not_ready_nodes_server)
+        # if we have same number of nodes_server_ready now, all of the previous
+        # not_ready_nodes are now ready.
+        if len(nodes_server_ready) == len(not_ready_nodes_server):
+            break
+        attempts += 1
+        if attempts > 9:
+            result['failed'] = True
+            result['rc'] = 1
+            missing_nodes = not_ready_nodes_server - set(nodes_server_ready)
+            msg = "Some nodes still not ready after approving server certs: {}"
+            msg = msg.format(", ".join(missing_nodes))
+            result['msg'] = msg; break
+
+
+def run_module():
+    '''Run this module'''
+    module_args = dict(
+        oc_bin=dict(type='path', required=False, default='oc'),
+        oc_conf=dict(type='path', required=False, default='/etc/origin/master/admin.kubeconfig'),
+        node_list=dict(type='list', required=True),
+    )
+    module = AnsibleModule(
+        supports_check_mode=False,
+        argument_spec=module_args
+    )
+    oc_bin = module.params['oc_bin']
+    oc_conf = '--config={}'.format(module.params['oc_conf'])
+    node_list = module.params['node_list']
+
+    result = {'changed': False, 'rc': 0}
+
+    nodes_ready = get_ready_nodes(module, oc_bin, oc_conf)
+    # don't need to check nodes that are already ready.
+    not_ready_nodes = [item for item in node_list if item not in nodes_ready]
+
+    # Get all csrs, no good way to filter on pending.
+    csrs = get_csrs(module, oc_bin, oc_conf)
+
+    # process data in csrs and build a dictionary of client requests
+    csr_dict = process_csrs(module, csrs, node_list, "client")
+
+    # This method is fail-happy and expects all non-Ready nodes have available
+    # csrs.  Handle failure for this method via ansible retry/until.
+    confirm_needed_requests_present(module, not_ready_nodes, csr_dict)
+
+    # save client_approve_results so we can report later.
+    client_approve_results = approve_csrs(module, oc_bin, oc_conf, csr_dict,
+                                          'client')
+    result['client_approve_results'] = client_approve_results
+
+    # # Server Cert Section # #
+    # Find not_ready_nodes for server-side
+    nodes_server_ready = get_ready_nodes_server(module, oc_bin, oc_conf,
+                                                node_list)
+    # Create list of nodes that definitely need a server cert approved.
+    not_ready_nodes_server = [item for item in node_list if item not in nodes_server_ready]
+
+    # Get all csrs again, no good way to filter on pending.
+    csrs = get_csrs(module, oc_bin, oc_conf)
+
+    # process data in csrs and build a dictionary of server requests
+    csr_dict = process_csrs(module, csrs, node_list, "server")
+
+    # This will fail if all server csrs are not present, but probably shouldn't
+    # at this point since we spent some time hitting the api to see if the
+    # nodes are already responding.
+    confirm_needed_requests_present(module, not_ready_nodes_server, csr_dict)
+    server_approve_results = approve_csrs(module, oc_bin, oc_conf, csr_dict,
+                                          'server')
+    result['server_approve_results'] = server_approve_results
+
+    result['changed'] = bool(client_approve_results) or bool(server_approve_results)
+
+    verify_server_csrs(module, result, oc_bin, oc_conf, node_list)
+
+    module.exit_json(**result)
+
+
+def main():
+    '''main'''
+    run_module()
+
+
+if __name__ == '__main__':
+    main()

File diff suppressed because it is too large
+ 85 - 0
roles/lib_openshift/test/test_data/oc_csr_approve_approved.json


+ 9 - 0
roles/lib_openshift/test/test_data/oc_csr_approve_none.json

@@ -0,0 +1,9 @@
+{
+    "apiVersion": "v1",
+    "items": [],
+    "kind": "List",
+    "metadata": {
+        "resourceVersion": "",
+        "selfLink": ""
+    }
+}

File diff suppressed because it is too large
+ 38 - 0
roles/lib_openshift/test/test_data/oc_csr_approve_pending.json


File diff suppressed because it is too large
+ 361 - 0
roles/lib_openshift/test/test_data/oc_csr_server_multiple_pends_one_host.json


+ 450 - 0
roles/lib_openshift/test/test_data/oc_get_nodes.json

@@ -0,0 +1,450 @@
+{
+    "apiVersion": "v1",
+    "items": [
+        {
+            "apiVersion": "v1",
+            "kind": "Node",
+            "metadata": {
+                "annotations": {
+                    "volumes.kubernetes.io/controller-managed-attach-detach": "true"
+                },
+                "creationTimestamp": "2018-08-10T23:50:59Z",
+                "labels": {
+                    "beta.kubernetes.io/arch": "amd64",
+                    "beta.kubernetes.io/os": "linux",
+                    "glusterfs": "storage-host",
+                    "kubernetes.io/hostname": "fedora1.openshift.io",
+                    "node-role.kubernetes.io/compute": "true",
+                    "node-role.kubernetes.io/infra": "true",
+                    "node-role.kubernetes.io/master": "true"
+                },
+                "name": "fedora1.openshift.io",
+                "namespace": "",
+                "resourceVersion": "1732411",
+                "selfLink": "/api/v1/nodes/fedora1.openshift.io",
+                "uid": "3b52eed5-9cf8-11e8-964a-525400650cba"
+            },
+            "spec": {
+                "externalID": "fedora1.openshift.io"
+            },
+            "status": {
+                "addresses": [
+                    {
+                        "address": "192.168.124.106",
+                        "type": "InternalIP"
+                    },
+                    {
+                        "address": "fedora1.openshift.io",
+                        "type": "Hostname"
+                    }
+                ],
+                "allocatable": {
+                    "cpu": "4",
+                    "hugepages-1Gi": "0",
+                    "hugepages-2Mi": "0",
+                    "memory": "8070076Ki",
+                    "pods": "250"
+                },
+                "capacity": {
+                    "cpu": "4",
+                    "hugepages-1Gi": "0",
+                    "hugepages-2Mi": "0",
+                    "memory": "8172476Ki",
+                    "pods": "250"
+                },
+                "conditions": [
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:50:53Z",
+                        "message": "kubelet has sufficient disk space available",
+                        "reason": "KubeletHasSufficientDisk",
+                        "status": "False",
+                        "type": "OutOfDisk"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:50:53Z",
+                        "message": "kubelet has sufficient memory available",
+                        "reason": "KubeletHasSufficientMemory",
+                        "status": "False",
+                        "type": "MemoryPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:50:53Z",
+                        "message": "kubelet has no disk pressure",
+                        "reason": "KubeletHasNoDiskPressure",
+                        "status": "False",
+                        "type": "DiskPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:50:53Z",
+                        "message": "kubelet has sufficient PID available",
+                        "reason": "KubeletHasSufficientPID",
+                        "status": "False",
+                        "type": "PIDPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-11T00:01:06Z",
+                        "message": "kubelet is posting ready status",
+                        "reason": "KubeletReady",
+                        "status": "True",
+                        "type": "Ready"
+                    }
+                ],
+                "daemonEndpoints": {
+                    "kubeletEndpoint": {
+                        "Port": 10250
+                    }
+                },
+                "images": [
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-node@sha256:d8e0b4e5912e12e84ccd2b72a90ce66ce6e5569dfcc62f9cd69f0315d59c6a91",
+                            "docker.io/openshift/origin-node:v3.10",
+                            "docker.io/openshift/origin-node:v3.10.0"
+                        ],
+                        "sizeBytes": 1281495850
+                    },
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-control-plane@sha256:8a030a68593d64703c0572454d3fd9475bcfadf5d26d2899f92418516c1c49be",
+                            "docker.io/openshift/origin-control-plane:v3.10",
+                            "docker.io/openshift/origin-control-plane:v3.10.0"
+                        ],
+                        "sizeBytes": 815862538
+                    },
+                    {
+                        "names": [
+                            "docker.io/gluster/gluster-centos@sha256:850fd2399d254f678b40bebe1602aa0c46d60facc7804b922c81c1524e05903a",
+                            "docker.io/gluster/gluster-centos:latest"
+                        ],
+                        "sizeBytes": 328338103
+                    },
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-pod@sha256:6ae0714fe9bf19f1312e2a869bc3d7b7cd01aea330c33675f1e215e3de857385",
+                            "docker.io/openshift/origin-pod:v3.10.0"
+                        ],
+                        "sizeBytes": 222597999
+                    },
+                    {
+                        "names": [
+                            "quay.io/coreos/etcd@sha256:43fbc8a457aa0cb887da63d74a48659e13947cb74b96a53ba8f47abb6172a948",
+                            "quay.io/coreos/etcd:v3.2.22"
+                        ],
+                        "sizeBytes": 37269372
+                    }
+                ],
+                "nodeInfo": {
+                    "architecture": "amd64",
+                    "bootID": "fc58c6b9-9f67-4377-8cbe-57f0c3f7a517",
+                    "containerRuntimeVersion": "docker://1.13.1",
+                    "kernelVersion": "4.13.9-300.fc27.x86_64",
+                    "kubeProxyVersion": "v1.10.0+b81c8f8",
+                    "kubeletVersion": "v1.10.0+b81c8f8",
+                    "machineID": "57f56a8c5aeb47a98ca1fd94281c64aa",
+                    "operatingSystem": "linux",
+                    "osImage": "Fedora 27 (Cloud Edition)",
+                    "systemUUID": "57F56A8C-5AEB-47A9-8CA1-FD94281C64AA"
+                }
+            }
+        },
+        {
+            "apiVersion": "v1",
+            "kind": "Node",
+            "metadata": {
+                "annotations": {
+                    "volumes.kubernetes.io/controller-managed-attach-detach": "true"
+                },
+                "creationTimestamp": "2018-08-10T23:53:32Z",
+                "labels": {
+                    "beta.kubernetes.io/arch": "amd64",
+                    "beta.kubernetes.io/os": "linux",
+                    "glusterfs": "storage-host",
+                    "kubernetes.io/hostname": "fedora2.openshift.io",
+                    "node-role.kubernetes.io/infra": "true"
+                },
+                "name": "fedora2.openshift.io",
+                "namespace": "",
+                "resourceVersion": "1732413",
+                "selfLink": "/api/v1/nodes/fedora2.openshift.io",
+                "uid": "965edafb-9cf8-11e8-964a-525400650cba"
+            },
+            "spec": {
+                "externalID": "fedora2.openshift.io"
+            },
+            "status": {
+                "addresses": [
+                    {
+                        "address": "192.168.124.48",
+                        "type": "InternalIP"
+                    },
+                    {
+                        "address": "fedora2.openshift.io",
+                        "type": "Hostname"
+                    }
+                ],
+                "allocatable": {
+                    "cpu": "4",
+                    "hugepages-1Gi": "0",
+                    "hugepages-2Mi": "0",
+                    "memory": "8070076Ki",
+                    "pods": "250"
+                },
+                "capacity": {
+                    "cpu": "4",
+                    "hugepages-1Gi": "0",
+                    "hugepages-2Mi": "0",
+                    "memory": "8172476Ki",
+                    "pods": "250"
+                },
+                "conditions": [
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:17Z",
+                        "lastTransitionTime": "2018-08-22T21:40:58Z",
+                        "message": "kubelet has sufficient disk space available",
+                        "reason": "KubeletHasSufficientDisk",
+                        "status": "False",
+                        "type": "OutOfDisk"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:17Z",
+                        "lastTransitionTime": "2018-08-22T21:40:58Z",
+                        "message": "kubelet has sufficient memory available",
+                        "reason": "KubeletHasSufficientMemory",
+                        "status": "False",
+                        "type": "MemoryPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:17Z",
+                        "lastTransitionTime": "2018-08-22T21:40:58Z",
+                        "message": "kubelet has no disk pressure",
+                        "reason": "KubeletHasNoDiskPressure",
+                        "status": "False",
+                        "type": "DiskPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:17Z",
+                        "lastTransitionTime": "2018-08-10T23:53:32Z",
+                        "message": "kubelet has sufficient PID available",
+                        "reason": "KubeletHasSufficientPID",
+                        "status": "False",
+                        "type": "PIDPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:17Z",
+                        "lastTransitionTime": "2018-08-22T21:41:08Z",
+                        "message": "kubelet is posting ready status",
+                        "reason": "KubeletReady",
+                        "status": "False",
+                        "type": "Ready"
+                    }
+                ],
+                "daemonEndpoints": {
+                    "kubeletEndpoint": {
+                        "Port": 10250
+                    }
+                },
+                "images": [
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-node@sha256:d8e0b4e5912e12e84ccd2b72a90ce66ce6e5569dfcc62f9cd69f0315d59c6a91",
+                            "docker.io/openshift/origin-node:v3.10",
+                            "docker.io/openshift/origin-node:v3.10.0"
+                        ],
+                        "sizeBytes": 1281495850
+                    },
+                    {
+                        "names": [
+                            "docker.io/heketi/heketi@sha256:d847e721966c6b6b09a50cbe3ec209d7d6cf4ad7cca204cf114028c98a39aecd",
+                            "docker.io/heketi/heketi:latest"
+                        ],
+                        "sizeBytes": 361586900
+                    },
+                    {
+                        "names": [
+                            "docker.io/gluster/gluster-centos@sha256:850fd2399d254f678b40bebe1602aa0c46d60facc7804b922c81c1524e05903a",
+                            "docker.io/gluster/gluster-centos:latest"
+                        ],
+                        "sizeBytes": 328338103
+                    },
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-pod@sha256:6ae0714fe9bf19f1312e2a869bc3d7b7cd01aea330c33675f1e215e3de857385",
+                            "docker.io/openshift/origin-pod:v3.10.0"
+                        ],
+                        "sizeBytes": 222597999
+                    }
+                ],
+                "nodeInfo": {
+                    "architecture": "amd64",
+                    "bootID": "9bced612-abc1-4129-8d92-b17e786df8dd",
+                    "containerRuntimeVersion": "docker://1.13.1",
+                    "kernelVersion": "4.13.9-300.fc27.x86_64",
+                    "kubeProxyVersion": "v1.10.0+b81c8f8",
+                    "kubeletVersion": "v1.10.0+b81c8f8",
+                    "machineID": "a883f7e82e0645578114dafea6fca8bb",
+                    "operatingSystem": "linux",
+                    "osImage": "Fedora 27 (Cloud Edition)",
+                    "systemUUID": "A883F7E8-2E06-4557-8114-DAFEA6FCA8BB"
+                }
+            }
+        },
+        {
+            "apiVersion": "v1",
+            "kind": "Node",
+            "metadata": {
+                "annotations": {
+                    "volumes.kubernetes.io/controller-managed-attach-detach": "true"
+                },
+                "creationTimestamp": "2018-08-10T23:53:32Z",
+                "labels": {
+                    "beta.kubernetes.io/arch": "amd64",
+                    "beta.kubernetes.io/os": "linux",
+                    "glusterfs": "storage-host",
+                    "kubernetes.io/hostname": "fedora3.openshift.io",
+                    "node-role.kubernetes.io/infra": "true"
+                },
+                "name": "fedora3.openshift.io",
+                "namespace": "",
+                "resourceVersion": "1732410",
+                "selfLink": "/api/v1/nodes/fedora3.openshift.io",
+                "uid": "9646e307-9cf8-11e8-964a-525400650cba"
+            },
+            "spec": {
+                "externalID": "fedora3.openshift.io"
+            },
+            "status": {
+                "addresses": [
+                    {
+                        "address": "192.168.124.171",
+                        "type": "InternalIP"
+                    },
+                    {
+                        "address": "fedora3.openshift.io",
+                        "type": "Hostname"
+                    }
+                ],
+                "allocatable": {
+                    "cpu": "4",
+                    "hugepages-1Gi": "0",
+                    "hugepages-2Mi": "0",
+                    "memory": "8070068Ki",
+                    "pods": "250"
+                },
+                "capacity": {
+                    "cpu": "4",
+                    "hugepages-1Gi": "0",
+                    "hugepages-2Mi": "0",
+                    "memory": "8172468Ki",
+                    "pods": "250"
+                },
+                "conditions": [
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:53:32Z",
+                        "message": "kubelet has sufficient disk space available",
+                        "reason": "KubeletHasSufficientDisk",
+                        "status": "False",
+                        "type": "OutOfDisk"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:53:32Z",
+                        "message": "kubelet has sufficient memory available",
+                        "reason": "KubeletHasSufficientMemory",
+                        "status": "False",
+                        "type": "MemoryPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:53:32Z",
+                        "message": "kubelet has no disk pressure",
+                        "reason": "KubeletHasNoDiskPressure",
+                        "status": "False",
+                        "type": "DiskPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:53:32Z",
+                        "message": "kubelet has sufficient PID available",
+                        "reason": "KubeletHasSufficientPID",
+                        "status": "False",
+                        "type": "PIDPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-11T00:01:06Z",
+                        "message": "kubelet is posting ready status",
+                        "reason": "KubeletReady",
+                        "status": "True",
+                        "type": "Ready"
+                    }
+                ],
+                "daemonEndpoints": {
+                    "kubeletEndpoint": {
+                        "Port": 10250
+                    }
+                },
+                "images": [
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-node@sha256:d8e0b4e5912e12e84ccd2b72a90ce66ce6e5569dfcc62f9cd69f0315d59c6a91",
+                            "docker.io/openshift/origin-node:v3.10",
+                            "docker.io/openshift/origin-node:v3.10.0"
+                        ],
+                        "sizeBytes": 1281495850
+                    },
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-deployer@sha256:72d013cdfdf3d16557b64ac0a459c2fc4e90e37422ceed1564a2f69d68607e2a",
+                            "docker.io/openshift/origin-deployer:v3.10.0"
+                        ],
+                        "sizeBytes": 815862538
+                    },
+                    {
+                        "names": [
+                            "docker.io/heketi/heketi@sha256:e6d0362d217573a3f92792e14c611d75df04eb7bc8f245e8c44c4a9c3a870ee1",
+                            "docker.io/heketi/heketi:latest"
+                        ],
+                        "sizeBytes": 384664289
+                    },
+                    {
+                        "names": [
+                            "docker.io/gluster/gluster-centos@sha256:850fd2399d254f678b40bebe1602aa0c46d60facc7804b922c81c1524e05903a",
+                            "docker.io/gluster/gluster-centos:latest"
+                        ],
+                        "sizeBytes": 328338103
+                    },
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-pod@sha256:6ae0714fe9bf19f1312e2a869bc3d7b7cd01aea330c33675f1e215e3de857385",
+                            "docker.io/openshift/origin-pod:v3.10.0"
+                        ],
+                        "sizeBytes": 222597999
+                    }
+                ],
+                "nodeInfo": {
+                    "architecture": "amd64",
+                    "bootID": "a81e3aa0-bf11-432d-b671-aa7d86344c3f",
+                    "containerRuntimeVersion": "docker://1.13.1",
+                    "kernelVersion": "4.13.9-300.fc27.x86_64",
+                    "kubeProxyVersion": "v1.10.0+b81c8f8",
+                    "kubeletVersion": "v1.10.0+b81c8f8",
+                    "machineID": "95bf4677a2ac4f8daa29a31efdb09eed",
+                    "operatingSystem": "linux",
+                    "osImage": "Fedora 27 (Cloud Edition)",
+                    "systemUUID": "95BF4677-A2AC-4F8D-AA29-A31EFDB09EED"
+                }
+            }
+        }
+    ],
+    "kind": "List",
+    "metadata": {
+        "resourceVersion": "",
+        "selfLink": ""
+    }
+}

+ 1 - 0
roles/lib_openshift/test/test_data/openssl1.txt

@@ -0,0 +1 @@
+subject=C = US, CN = fedora1.openshift.io, L = Raleigh, O = Red Hat, ST = North Carolina, OU = OpenShift

+ 162 - 0
roles/lib_openshift/test/test_oc_csr_approve.py

@@ -0,0 +1,162 @@
+import os
+import sys
+
+import pytest
+
+from ansible.module_utils.basic import AnsibleModule
+
+try:
+    # python3, mock is built in.
+    from unittest.mock import patch
+except ImportError:
+    # In python2, mock is installed via pip.
+    from mock import patch
+
+MODULE_PATH = os.path.realpath(os.path.join(__file__, os.pardir, os.pardir, 'library'))
+sys.path.insert(1, MODULE_PATH)
+
+import oc_csr_approve  # noqa
+
+# base path for text files with sample outputs.
+ASSET_PATH = os.path.realpath(os.path.join(__file__, os.pardir, 'test_data'))
+
+RUN_CMD_MOCK = 'ansible.module_utils.basic.AnsibleModule.run_command'
+
+
+class DummyModule(AnsibleModule):
+    def _load_params(self):
+        self.params = {}
+
+    def exit_json(*args, **kwargs):
+        return 0
+
+    def fail_json(*args, **kwargs):
+        raise Exception(kwargs['msg'])
+
+
+def test_parse_subject_cn():
+    subject = 'subject=/C=US/CN=fedora1.openshift.io/L=Raleigh/O=Red Hat/ST=North Carolina/OU=OpenShift\n'
+    assert oc_csr_approve.parse_subject_cn(subject) == 'fedora1.openshift.io'
+
+    subject = 'subject=C = US, CN = test.io, L = City, O = Company, ST = State, OU = Dept\n'
+    assert oc_csr_approve.parse_subject_cn(subject) == 'test.io'
+
+
+def test_get_ready_nodes():
+    output_file = os.path.join(ASSET_PATH, 'oc_get_nodes.json')
+    with open(output_file) as stdoutfile:
+        oc_get_nodes_stdout = stdoutfile.read()
+
+    module = DummyModule({})
+
+    with patch(RUN_CMD_MOCK) as call_mock:
+        call_mock.return_value = (0, oc_get_nodes_stdout, '')
+        ready_nodes = oc_csr_approve.get_ready_nodes(module, 'oc', '/dev/null')
+    print(ready_nodes)
+    assert ready_nodes == ['fedora1.openshift.io', 'fedora3.openshift.io']
+
+
+def test_get_csrs():
+    module = DummyModule({})
+    output_file = os.path.join(ASSET_PATH, 'oc_csr_approve_pending.json')
+    with open(output_file) as stdoutfile:
+        oc_get_csr_out = stdoutfile.read()
+
+    # mock oc get csr call to cluster
+    with patch(RUN_CMD_MOCK) as call_mock:
+        call_mock.return_value = (0, oc_get_csr_out, '')
+        csrs = oc_csr_approve.get_csrs(module, 'oc', '/dev/null')
+
+    assert csrs[0]['kind'] == "CertificateSigningRequest"
+
+    output_file = os.path.join(ASSET_PATH, 'openssl1.txt')
+    with open(output_file) as stdoutfile:
+        openssl_out = stdoutfile.read()
+
+    # mock openssl req call.
+    node_list = ['fedora2.mguginolocal.com']
+    with patch(RUN_CMD_MOCK) as call_mock:
+        call_mock.return_value = (0, openssl_out, '')
+        csr_dict = oc_csr_approve.process_csrs(module, csrs, node_list, "client")
+    # actually run openssl req call.
+    csr_dict = oc_csr_approve.process_csrs(module, csrs, node_list, "client")
+    assert csr_dict['node-csr-TkefytQp8Dz4Xp7uzcw605MocvI0gWuEOGNrHhOjGNQ'] == 'fedora2.mguginolocal.com'
+
+
+def test_confirm_needed_requests_present():
+    module = DummyModule({})
+    csr_dict = {'some-csr': 'fedora1.openshift.io'}
+    not_ready_nodes = ['host1']
+    with pytest.raises(Exception) as err:
+        oc_csr_approve.confirm_needed_requests_present(
+            module, not_ready_nodes, csr_dict)
+    assert 'Exception: Cound not find csr for nodes: host1' in str(err)
+
+    not_ready_nodes = ['fedora1.openshift.io']
+    # this should complete silently
+    oc_csr_approve.confirm_needed_requests_present(
+        module, not_ready_nodes, csr_dict)
+
+
+def test_approve_csrs():
+    module = DummyModule({})
+    oc_bin = 'oc'
+    oc_conf = '/dev/null'
+    csr_dict = {'csr-1': 'example.openshift.io'}
+    with patch(RUN_CMD_MOCK) as call_mock:
+        call_mock.return_value = (0, 'csr-1 ok', '')
+        client_approve_results = oc_csr_approve.approve_csrs(
+            module, oc_bin, oc_conf, csr_dict, 'client')
+    assert client_approve_results == ['csr-1 ok']
+
+
+def test_get_ready_nodes_server():
+    module = DummyModule({})
+    oc_bin = 'oc'
+    oc_conf = '/dev/null'
+    nodes_list = ['fedora1.openshift.io']
+    with patch(RUN_CMD_MOCK) as call_mock:
+        call_mock.return_value = (0, 'ok', '')
+        ready_nodes_server = oc_csr_approve.get_ready_nodes_server(
+            module, oc_bin, oc_conf, nodes_list)
+    assert ready_nodes_server == ['fedora1.openshift.io']
+
+
+def test_get_csrs_server():
+    module = DummyModule({})
+    output_file = os.path.join(ASSET_PATH, 'oc_csr_server_multiple_pends_one_host.json')
+    with open(output_file) as stdoutfile:
+        oc_get_csr_out = stdoutfile.read()
+
+    # mock oc get csr call to cluster
+    with patch(RUN_CMD_MOCK) as call_mock:
+        call_mock.return_value = (0, oc_get_csr_out, '')
+        csrs = oc_csr_approve.get_csrs(module, 'oc', '/dev/null')
+
+    assert csrs[0]['kind'] == "CertificateSigningRequest"
+
+    output_file = os.path.join(ASSET_PATH, 'openssl1.txt')
+    with open(output_file) as stdoutfile:
+        openssl_out = stdoutfile.read()
+
+    node_list = ['fedora1.openshift.io']
+
+    # mock openssl req call.
+    with patch(RUN_CMD_MOCK) as call_mock:
+        call_mock.return_value = (0, openssl_out, '')
+        csr_dict = oc_csr_approve.process_csrs(module, csrs, node_list, "server")
+
+    # actually run openssl req call.
+    node_list = ['fedora2.mguginolocal.com']
+    csr_dict = oc_csr_approve.process_csrs(module, csrs, node_list, "server")
+    assert csr_dict['csr-2cxkp'] == 'fedora2.mguginolocal.com'
+
+
+if __name__ == '__main__':
+    test_parse_subject_cn()
+    test_get_ready_nodes()
+    test_get_csrs()
+    test_confirm_needed_requests_present()
+    test_approve_csrs()
+    test_get_ready_nodes_server()
+    test_get_csrs_server()

+ 8 - 7
roles/openshift_aws/tasks/accept_nodes.yml

@@ -32,11 +32,12 @@
   debug:
     msg: "{{ mastersout.instances[0].public_ip_address }}"
 
-- name: approve nodes
-  oc_adm_csr:
-    #approve_all: True
-    nodes: "{{ instancesout.instances|map(attribute='private_dns_name') | list  }}"
-    timeout: 60
-    fail_on_timeout: "{{ openshift_aws_node_accept_fail_on_timeout | default(false) | bool }}"
-  register: nodeout
+- name: Approve node certificates when bootstrapping
+  oc_csr_approve:
+    oc_bin: "{{ hostvars[groups.masters.0]['first_master_client_binary'] }}"
+    oc_conf: "{{ hostvars[groups.masters.0].openshift.common.config_base }}/master/admin.kubeconfig"
+    node_list: "{{ instancesout.instances|map(attribute='private_dns_name') | list  }}"
+  register: aws_csr_approve
+  retries: 30
+  until: aws_csr_approve is succeeded
   delegate_to: "{{ groups.masters.0 }}"

+ 8 - 5
roles/openshift_gcp/tasks/configure_master_bootstrap.yml

@@ -29,9 +29,12 @@
     name: "openshift-bootstrap-update.timer"
     state: started
 
-- name: Bootstrap all nodes that were identified with bootstrap metadata
-  run_once: true
-  oc_adm_csr:
-    nodes: "{{ groups['all'] | map('extract', hostvars) | selectattr('gce_metadata.bootstrap', 'match', 'true') | map(attribute='gce_name') | list }}"
-    timeout: 60
+- name: Approve node certificates when bootstrapping
+  oc_csr_approve:
+    oc_bin: "{{ hostvars[groups.masters.0]['first_master_client_binary'] }}"
+    oc_conf: "{{ hostvars[groups.masters.0].openshift.common.config_base }}/master/admin.kubeconfig"
+    node_list: "{{ groups['all'] | map('extract', hostvars) | selectattr('gce_metadata.bootstrap', 'match', 'true') | map(attribute='gce_name') | list }}"
+  register: gcp_csr_approve
+  retries: 30
+  until: gcp_csr_approve is succeeded
   when: groups['all'] | map('extract', hostvars) | selectattr('gce_metadata.bootstrap', 'match', 'true') | map(attribute='gce_name') | list | length > 0

+ 8 - 25
roles/openshift_node/tasks/upgrade.yml

@@ -57,32 +57,15 @@
 - import_tasks: upgrade/restart.yml
 
 - name: Approve node certificates when bootstrapping
-  oc_adm_csr:
-    nodes: "{{ openshift.node.nodename | lower }}"
-    timeout: 180
-    fail_on_timeout: true
+  oc_csr_approve:
+    oc_bin: "{{ hostvars[groups.oo_first_master.0]['first_master_client_binary'] }}"
+    oc_conf: "{{ openshift.common.config_base }}/master/admin.kubeconfig"
+    node_list:
+    - "{{ openshift.node.nodename | lower }}"
   delegate_to: "{{ groups.oo_first_master.0 }}"
-  ignore_errors: true
-
-- name: Wait for node to be ready
-  oc_obj:
-    state: list
-    kind: node
-    name: "{{ openshift.node.nodename | lower }}"
-  register: node_output
-  delegate_to: "{{ groups.oo_first_master.0 }}"
-  until:
-  - node_output.results is defined
-  - node_output.results.returncode is defined
-  - node_output.results.returncode == 0
-  - node_output.results.results is defined
-  - node_output.results.results | length  > 0
-  - node_output.results.results[0].status is defined
-  - node_output.results.results[0].status.conditions is defined
-  - node_output.results.results[0].status.conditions | selectattr('type', 'match', '^Ready$') | map(attribute='status') | join | bool == True
-  # Give the node three minutes to come back online.
-  retries: 36
-  delay: 5
+  register: node_upgrade_oc_csr_approve
+  retries: 30
+  until: node_upgrade_oc_csr_approve is succeeded
 
 - import_tasks: journald.yml