
Refactor csr approvals: oc_csr_approve

Currently, the csr approval process for nodes is quite
fragile.

This commit creates a new custom module, oc_csr_approve,
which handles the multiple steps involved in approving
pending node certificates.

The module first attempts to approve all 'client' csrs
for the nodes provided via node_list. Missing csrs are
ignored as long as the node in question is already in a
'Ready' status as reported by oc get nodes.

Next, the module approves csrs for 'server' certificates.
As with the client process, missing node csrs are
acceptable as long as the node's api endpoint is
reachable without error, which indicates a server
certificate is already deployed.

When there is a long delay between issuing a csr and
approving it, several 'server' csrs may be outstanding
for the same node. This module approves all outstanding
csrs.
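
In pseudocode, the overall flow looks like this (a condensed
sketch, not the exact module code; the helper names match the
module added below):

    def approve_node_csrs(module, oc_bin, oc_conf, node_list):
        # Client phase: a csr may be absent only for nodes already Ready.
        ready = get_ready_nodes(module, oc_bin, oc_conf)
        not_ready = [n for n in node_list if n not in ready]
        csrs = process_csrs(module, get_csrs(module, oc_bin, oc_conf),
                            node_list, 'client')
        confirm_needed_requests_present(module, not_ready, csrs)
        approve_csrs(module, oc_bin, oc_conf, csrs, 'client')

        # Server phase: a csr may be absent only for nodes whose kubelet
        # healthz endpoint already answers, i.e. a server cert is in place.
        ready = get_ready_nodes_server(module, oc_bin, oc_conf, node_list)
        not_ready = [n for n in node_list if n not in ready]
        csrs = process_csrs(module, get_csrs(module, oc_bin, oc_conf),
                            node_list, 'server')
        confirm_needed_requests_present(module, not_ready, csrs)
        approve_csrs(module, oc_bin, oc_conf, csrs, 'server')
        # run_module() additionally re-polls the healthz endpoints
        # afterwards via verify_server_csrs().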

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1571515
Michael Gugino · 6 years ago · commit fef0430a82

+ 8 - 16
playbooks/openshift-node/private/join.yml

@@ -37,25 +37,17 @@
     debug:
       msg: "{{ l_nodes_to_join }}"
 
-  - name: Approve bootstrap nodes
-    oc_adm_csr:
-      nodes: "{{ l_nodes_to_join }}"
-      timeout: 60
-      fail_on_timeout: true
-    register: approve_out
-    ignore_errors: true
+  - name: Approve node certificates when bootstrapping
+    oc_csr_approve:
+      oc_bin: "{{ openshift_client_binary }}"
+      oc_conf: "{{ openshift.common.config_base }}/master/admin.kubeconfig"
+      node_list: "{{ l_nodes_to_join }}"
+    register: node_bootstrap_csr_approve
+    retries: 30
+    until: node_bootstrap_csr_approve is succeeded
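+    # With Ansible's default 5-second retry delay, 30 retries gives the
+    # cluster roughly 150 seconds to publish the expected csrs.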
     when:
     - l_nodes_to_join|length > 0
 
-  - when: approve_out is failed
-    block:
-    - name: Get CSRs
-      command: >
-        {{ openshift_client_binary }} describe csr --config={{ openshift.common.config_base }}/master/admin.kubeconfig
-    - name: Report approval errors
-      fail:
-        msg: Node approval failed
-
 - name: Ensure any inventory labels are applied to the nodes
   hosts: oo_nodes_to_config
   vars:

+ 306 - 0
roles/lib_openshift/library/oc_csr_approve.py

@@ -0,0 +1,306 @@
+#!/usr/bin/env python
+'''oc_csr_approve module'''
+# Copyright 2018 Red Hat, Inc. and/or its affiliates
+# and other contributors as indicated by the @author tags.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import json
+
+from ansible.module_utils.basic import AnsibleModule
+
+try:
+    from json.decoder import JSONDecodeError
+except ImportError:
+    JSONDecodeError = ValueError
+
+DOCUMENTATION = '''
+---
+module: oc_csr_approve
+
+short_description: Retrieve, approve, and verify node client and server csrs
+
+version_added: "2.4"
+
+description:
+    - Runs various commands to list csrs, approve csrs, and verify nodes are
+      ready.
+
+author:
+    - "Michael Gugino <mgugino@redhat.com>"
+'''
+
+EXAMPLES = '''
+# Approve all pending client and server csrs for a list of nodes
+- name: Approve node certificates when bootstrapping
+  oc_csr_approve:
+    oc_bin: "/usr/bin/oc"
+    oc_conf: "/etc/origin/master/admin.kubeconfig"
+    node_list: ['node1.example.com', 'node2.example.com']
+'''
+
+CERT_MODE = {'client': 'client auth', 'server': 'server auth'}
+
+
+def run_command(module, command, rc_opts=None):
+    '''Run a command using AnsibleModule.run_command, or fail'''
+    if rc_opts is None:
+        rc_opts = {}
+    rtnc, stdout, err = module.run_command(command, **rc_opts)
+    if rtnc:
+        result = {'failed': True,
+                  'changed': False,
+                  'msg': str(err),
+                  'state': 'unknown'}
+        module.fail_json(**result)
+    return stdout
+
+
+def get_ready_nodes(module, oc_bin, oc_conf):
+    '''Get list of nodes currently ready via oc'''
+    # json output is necessary for consistency here.
+    command = "{} {} get nodes -ojson".format(oc_bin, oc_conf)
+    stdout = run_command(module, command)
+
+    try:
+        data = json.loads(stdout)
+    except JSONDecodeError as err:
+        result = {'failed': True,
+                  'changed': False,
+                  'msg': str(err),
+                  'state': 'unknown'}
+        module.fail_json(**result)
+
+    ready_nodes = []
+    for node in data['items']:
+        if node.get('status') and node['status'].get('conditions'):
+            for condition in node['status']['conditions']:
+                # "True" is a string here, not a boolean.
+                if condition['type'] == "Ready" and condition['status'] == 'True':
+                    ready_nodes.append(node['metadata']['name'])
+    return ready_nodes
+
+
+def get_csrs(module, oc_bin, oc_conf):
+    '''Retrieve csrs from cluster using oc get csr -ojson'''
+    command = "{} {} get csr -ojson".format(oc_bin, oc_conf)
+    stdout = run_command(module, command)
+    try:
+        data = json.loads(stdout)
+    except JSONDecodeError as err:
+        result = {'failed': True,
+                  'changed': False,
+                  'msg': str(err),
+                  'state': 'unknown'}
+        module.fail_json(**result)
+    return data['items']
+
+
+def parse_subject_cn(subject_str):
+    '''parse output of openssl req -noout -subject to retrieve CN.
+       example input:
+         'subject=/C=US/CN=test.io/L=Raleigh/O=Red Hat/ST=North Carolina/OU=OpenShift\n'
+         or
+         'subject=C = US, CN = test.io, L = City, O = Company, ST = State, OU = Dept\n'
+       example output: 'test.io'
+    '''
+    stripped_string = subject_str[len('subject='):].strip()
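+    # openssl >= 1.1.0 prints comma-separated 'K = V' pairs; older releases
+    # print a single '/'-delimited string (the fallback branch below).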
+    kv_strings = [x.strip() for x in stripped_string.split(',')]
+    if len(kv_strings) == 1:
+        kv_strings = [x.strip() for x in stripped_string.split('/')][1:]
+    for item in kv_strings:
+        item_parts = [x.strip() for x in item.split('=')]
+        if item_parts[0] == 'CN':
+            return item_parts[1]
+
+
+def process_csrs(module, csrs, node_list, mode):
+    '''Return a dictionary of pending csrs where the format of the dict is
+       k=csr name, v=Subject Common Name'''
+    csr_dict = {}
+    for item in csrs:
+        status = item['status'].get('conditions')
+        if status:
+            # A csr with any conditions has already been approved or
+            # denied; only condition-less csrs are pending.
+            continue
+        if CERT_MODE[mode] not in item['spec']['usages']:
+            continue
+        name = item['metadata']['name']
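+        # spec.request holds the base64-encoded PEM csr; decode it so the
+        # raw request can be piped to openssl on stdin.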
+        request_data = base64.b64decode(item['spec']['request'])
+        command = "openssl req -noout -subject"
+        # ansible's module.run_command accepts data to pipe via stdin as
+        # the 'data' kwarg.
+        rc_opts = {'data': request_data, 'binary_data': True}
+        stdout = run_command(module, command, rc_opts=rc_opts)
+        # parse common_name from subject string.
+        common_name = parse_subject_cn(stdout)
+        if common_name and common_name.startswith('system:node:'):
+            # common name is typically prepended with system:node:.
+            common_name = common_name.split('system:node:')[1]
+        # we only want to approve csrs from nodes we know about.
+        if common_name in node_list:
+            csr_dict[name] = common_name
+
+    return csr_dict
+
+
+def confirm_needed_requests_present(module, not_ready_nodes, csr_dict):
+    '''Ensure all non-Ready nodes have a csr, or fail'''
+    nodes_needed = set(not_ready_nodes)
+    for _, val in csr_dict.items():
+        nodes_needed.discard(val)
+
+    # check that we found all of our needed nodes
+    if nodes_needed:
+        missing_nodes = ', '.join(nodes_needed)
+        result = {'failed': True,
+                  'changed': False,
+                  'msg': "Cound not find csr for nodes: {}".format(missing_nodes),
+                  'state': 'unknown'}
+        module.fail_json(**result)
+
+
+def approve_csrs(module, oc_bin, oc_conf, csr_pending_list, mode):
+    '''Loop through csr_pending_list and call:
+       oc adm certificate approve <item>'''
+    res_mode = "{}_approve_results".format(mode)
+    base_command = "{} {} adm certificate approve {}"
+    approve_results = []
+    for csr in csr_pending_list:
+        command = base_command.format(oc_bin, oc_conf, csr)
+        rtnc, stdout, err = module.run_command(command)
+        approve_results.append(stdout)
+        if rtnc:
+            result = {'failed': True,
+                      'changed': False,
+                      'msg': str(err),
+                      res_mode: approve_results,
+                      'state': 'unknown'}
+            module.fail_json(**result)
+    return approve_results
+
+
+def get_ready_nodes_server(module, oc_bin, oc_conf, nodes_list):
+    '''Determine which nodes have working server certificates'''
+    ready_nodes_server = []
+    base_command = "{} {} get --raw /api/v1/nodes/{}/proxy/healthz"
+    for node in nodes_list:
+        # need this to look like /api/v1/nodes/<node>/proxy/healthz
+        command = base_command.format(oc_bin, oc_conf, node)
+        rtnc, _, _ = module.run_command(command)
+        if not rtnc:
+            # if we can hit that api endpoint, the node has a valid server
+            # cert.
+            ready_nodes_server.append(node)
+    return ready_nodes_server
+
+
+def verify_server_csrs(module, result, oc_bin, oc_conf, node_list):
+    '''We approved some server csrs, now we need to validate they are working.
+       This function will attempt to retry 10 times in case of failure.'''
+    # Attempt to try node endpoints a few times.
+    attempts = 0
+    # Find not_ready_nodes for server-side again
+    nodes_server_ready = get_ready_nodes_server(module, oc_bin, oc_conf,
+                                                node_list)
+    # Create list of nodes that still aren't ready.
+    not_ready_nodes_server = set([item for item in node_list if item not in nodes_server_ready])
+    while not_ready_nodes_server:
+        nodes_server_ready = get_ready_nodes_server(module, oc_bin, oc_conf,
+                                                    not_ready_nodes_server)
+        # if we have same number of nodes_server_ready now, all of the previous
+        # not_ready_nodes are now ready.
+        if len(nodes_server_ready) == len(not_ready_nodes_server):
+            break
+        attempts += 1
+        if attempts > 9:
+            result['failed'] = True
+            result['rc'] = 1
+            missing_nodes = not_ready_nodes_server - set(nodes_server_ready)
+            msg = "Some nodes still not ready after approving server certs: {}"
+            msg = msg.format(", ".join(missing_nodes))
+            result['msg'] = msg
+            # Bail out here; without this return the while loop would never
+            # terminate once the attempt budget is exhausted.
+            return
+
+
+def run_module():
+    '''Run this module'''
+    module_args = dict(
+        oc_bin=dict(type='path', required=False, default='oc'),
+        oc_conf=dict(type='path', required=False, default='/etc/origin/master/admin.kubeconfig'),
+        node_list=dict(type='list', required=True),
+    )
+    module = AnsibleModule(
+        supports_check_mode=False,
+        argument_spec=module_args
+    )
+    oc_bin = module.params['oc_bin']
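+    # Render the kubeconfig path as the --config flag once; every oc
+    # command in this module includes it verbatim.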
+    oc_conf = '--config={}'.format(module.params['oc_conf'])
+    node_list = module.params['node_list']
+
+    result = {'changed': False, 'rc': 0}
+
+    nodes_ready = get_ready_nodes(module, oc_bin, oc_conf)
+    # don't need to check nodes that are already ready.
+    not_ready_nodes = [item for item in node_list if item not in nodes_ready]
+
+    # Get all csrs, no good way to filter on pending.
+    csrs = get_csrs(module, oc_bin, oc_conf)
+
+    # process data in csrs and build a dictionary of client requests
+    csr_dict = process_csrs(module, csrs, node_list, "client")
+
+    # This method is fail-happy and expects all non-Ready nodes have available
+    # csrs.  Handle failure for this method via ansible retry/until.
+    confirm_needed_requests_present(module, not_ready_nodes, csr_dict)
+
+    # save client_approve_results so we can report later.
+    client_approve_results = approve_csrs(module, oc_bin, oc_conf, csr_dict,
+                                          'client')
+    result['client_approve_results'] = client_approve_results
+
+    # # Server Cert Section # #
+    # Find not_ready_nodes for server-side
+    nodes_server_ready = get_ready_nodes_server(module, oc_bin, oc_conf,
+                                                node_list)
+    # Create list of nodes that definitely need a server cert approved.
+    not_ready_nodes_server = [item for item in node_list if item not in nodes_server_ready]
+
+    # Get all csrs again, no good way to filter on pending.
+    csrs = get_csrs(module, oc_bin, oc_conf)
+
+    # process data in csrs and build a dictionary of server requests
+    csr_dict = process_csrs(module, csrs, node_list, "server")
+
+    # This will fail if all server csrs are not present, but probably shouldn't
+    # at this point since we spent some time hitting the api to see if the
+    # nodes are already responding.
+    confirm_needed_requests_present(module, not_ready_nodes_server, csr_dict)
+    server_approve_results = approve_csrs(module, oc_bin, oc_conf, csr_dict,
+                                          'server')
+    result['server_approve_results'] = server_approve_results
+
+    result['changed'] = bool(client_approve_results) or bool(server_approve_results)
+
+    verify_server_csrs(module, result, oc_bin, oc_conf, node_list)
+
+    module.exit_json(**result)
+
+
+def main():
+    '''main'''
+    run_module()
+
+
+if __name__ == '__main__':
+    main()

File diff suppressed because it is too large
+ 85 - 0
roles/lib_openshift/test/test_data/oc_csr_approve_approved.json


+ 9 - 0
roles/lib_openshift/test/test_data/oc_csr_approve_none.json

@@ -0,0 +1,9 @@
+{
+    "apiVersion": "v1",
+    "items": [],
+    "kind": "List",
+    "metadata": {
+        "resourceVersion": "",
+        "selfLink": ""
+    }
+}

File diff suppressed because it is too large
+ 38 - 0
roles/lib_openshift/test/test_data/oc_csr_approve_pending.json


File diff suppressed because it is too large
+ 361 - 0
roles/lib_openshift/test/test_data/oc_csr_server_multiple_pends_one_host.json


+ 450 - 0
roles/lib_openshift/test/test_data/oc_get_nodes.json

@@ -0,0 +1,450 @@
+{
+    "apiVersion": "v1",
+    "items": [
+        {
+            "apiVersion": "v1",
+            "kind": "Node",
+            "metadata": {
+                "annotations": {
+                    "volumes.kubernetes.io/controller-managed-attach-detach": "true"
+                },
+                "creationTimestamp": "2018-08-10T23:50:59Z",
+                "labels": {
+                    "beta.kubernetes.io/arch": "amd64",
+                    "beta.kubernetes.io/os": "linux",
+                    "glusterfs": "storage-host",
+                    "kubernetes.io/hostname": "fedora1.openshift.io",
+                    "node-role.kubernetes.io/compute": "true",
+                    "node-role.kubernetes.io/infra": "true",
+                    "node-role.kubernetes.io/master": "true"
+                },
+                "name": "fedora1.openshift.io",
+                "namespace": "",
+                "resourceVersion": "1732411",
+                "selfLink": "/api/v1/nodes/fedora1.openshift.io",
+                "uid": "3b52eed5-9cf8-11e8-964a-525400650cba"
+            },
+            "spec": {
+                "externalID": "fedora1.openshift.io"
+            },
+            "status": {
+                "addresses": [
+                    {
+                        "address": "192.168.124.106",
+                        "type": "InternalIP"
+                    },
+                    {
+                        "address": "fedora1.openshift.io",
+                        "type": "Hostname"
+                    }
+                ],
+                "allocatable": {
+                    "cpu": "4",
+                    "hugepages-1Gi": "0",
+                    "hugepages-2Mi": "0",
+                    "memory": "8070076Ki",
+                    "pods": "250"
+                },
+                "capacity": {
+                    "cpu": "4",
+                    "hugepages-1Gi": "0",
+                    "hugepages-2Mi": "0",
+                    "memory": "8172476Ki",
+                    "pods": "250"
+                },
+                "conditions": [
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:50:53Z",
+                        "message": "kubelet has sufficient disk space available",
+                        "reason": "KubeletHasSufficientDisk",
+                        "status": "False",
+                        "type": "OutOfDisk"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:50:53Z",
+                        "message": "kubelet has sufficient memory available",
+                        "reason": "KubeletHasSufficientMemory",
+                        "status": "False",
+                        "type": "MemoryPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:50:53Z",
+                        "message": "kubelet has no disk pressure",
+                        "reason": "KubeletHasNoDiskPressure",
+                        "status": "False",
+                        "type": "DiskPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:50:53Z",
+                        "message": "kubelet has sufficient PID available",
+                        "reason": "KubeletHasSufficientPID",
+                        "status": "False",
+                        "type": "PIDPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-11T00:01:06Z",
+                        "message": "kubelet is posting ready status",
+                        "reason": "KubeletReady",
+                        "status": "True",
+                        "type": "Ready"
+                    }
+                ],
+                "daemonEndpoints": {
+                    "kubeletEndpoint": {
+                        "Port": 10250
+                    }
+                },
+                "images": [
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-node@sha256:d8e0b4e5912e12e84ccd2b72a90ce66ce6e5569dfcc62f9cd69f0315d59c6a91",
+                            "docker.io/openshift/origin-node:v3.10",
+                            "docker.io/openshift/origin-node:v3.10.0"
+                        ],
+                        "sizeBytes": 1281495850
+                    },
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-control-plane@sha256:8a030a68593d64703c0572454d3fd9475bcfadf5d26d2899f92418516c1c49be",
+                            "docker.io/openshift/origin-control-plane:v3.10",
+                            "docker.io/openshift/origin-control-plane:v3.10.0"
+                        ],
+                        "sizeBytes": 815862538
+                    },
+                    {
+                        "names": [
+                            "docker.io/gluster/gluster-centos@sha256:850fd2399d254f678b40bebe1602aa0c46d60facc7804b922c81c1524e05903a",
+                            "docker.io/gluster/gluster-centos:latest"
+                        ],
+                        "sizeBytes": 328338103
+                    },
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-pod@sha256:6ae0714fe9bf19f1312e2a869bc3d7b7cd01aea330c33675f1e215e3de857385",
+                            "docker.io/openshift/origin-pod:v3.10.0"
+                        ],
+                        "sizeBytes": 222597999
+                    },
+                    {
+                        "names": [
+                            "quay.io/coreos/etcd@sha256:43fbc8a457aa0cb887da63d74a48659e13947cb74b96a53ba8f47abb6172a948",
+                            "quay.io/coreos/etcd:v3.2.22"
+                        ],
+                        "sizeBytes": 37269372
+                    }
+                ],
+                "nodeInfo": {
+                    "architecture": "amd64",
+                    "bootID": "fc58c6b9-9f67-4377-8cbe-57f0c3f7a517",
+                    "containerRuntimeVersion": "docker://1.13.1",
+                    "kernelVersion": "4.13.9-300.fc27.x86_64",
+                    "kubeProxyVersion": "v1.10.0+b81c8f8",
+                    "kubeletVersion": "v1.10.0+b81c8f8",
+                    "machineID": "57f56a8c5aeb47a98ca1fd94281c64aa",
+                    "operatingSystem": "linux",
+                    "osImage": "Fedora 27 (Cloud Edition)",
+                    "systemUUID": "57F56A8C-5AEB-47A9-8CA1-FD94281C64AA"
+                }
+            }
+        },
+        {
+            "apiVersion": "v1",
+            "kind": "Node",
+            "metadata": {
+                "annotations": {
+                    "volumes.kubernetes.io/controller-managed-attach-detach": "true"
+                },
+                "creationTimestamp": "2018-08-10T23:53:32Z",
+                "labels": {
+                    "beta.kubernetes.io/arch": "amd64",
+                    "beta.kubernetes.io/os": "linux",
+                    "glusterfs": "storage-host",
+                    "kubernetes.io/hostname": "fedora2.openshift.io",
+                    "node-role.kubernetes.io/infra": "true"
+                },
+                "name": "fedora2.openshift.io",
+                "namespace": "",
+                "resourceVersion": "1732413",
+                "selfLink": "/api/v1/nodes/fedora2.openshift.io",
+                "uid": "965edafb-9cf8-11e8-964a-525400650cba"
+            },
+            "spec": {
+                "externalID": "fedora2.openshift.io"
+            },
+            "status": {
+                "addresses": [
+                    {
+                        "address": "192.168.124.48",
+                        "type": "InternalIP"
+                    },
+                    {
+                        "address": "fedora2.openshift.io",
+                        "type": "Hostname"
+                    }
+                ],
+                "allocatable": {
+                    "cpu": "4",
+                    "hugepages-1Gi": "0",
+                    "hugepages-2Mi": "0",
+                    "memory": "8070076Ki",
+                    "pods": "250"
+                },
+                "capacity": {
+                    "cpu": "4",
+                    "hugepages-1Gi": "0",
+                    "hugepages-2Mi": "0",
+                    "memory": "8172476Ki",
+                    "pods": "250"
+                },
+                "conditions": [
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:17Z",
+                        "lastTransitionTime": "2018-08-22T21:40:58Z",
+                        "message": "kubelet has sufficient disk space available",
+                        "reason": "KubeletHasSufficientDisk",
+                        "status": "False",
+                        "type": "OutOfDisk"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:17Z",
+                        "lastTransitionTime": "2018-08-22T21:40:58Z",
+                        "message": "kubelet has sufficient memory available",
+                        "reason": "KubeletHasSufficientMemory",
+                        "status": "False",
+                        "type": "MemoryPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:17Z",
+                        "lastTransitionTime": "2018-08-22T21:40:58Z",
+                        "message": "kubelet has no disk pressure",
+                        "reason": "KubeletHasNoDiskPressure",
+                        "status": "False",
+                        "type": "DiskPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:17Z",
+                        "lastTransitionTime": "2018-08-10T23:53:32Z",
+                        "message": "kubelet has sufficient PID available",
+                        "reason": "KubeletHasSufficientPID",
+                        "status": "False",
+                        "type": "PIDPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:17Z",
+                        "lastTransitionTime": "2018-08-22T21:41:08Z",
+                        "message": "kubelet is posting ready status",
+                        "reason": "KubeletReady",
+                        "status": "False",
+                        "type": "Ready"
+                    }
+                ],
+                "daemonEndpoints": {
+                    "kubeletEndpoint": {
+                        "Port": 10250
+                    }
+                },
+                "images": [
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-node@sha256:d8e0b4e5912e12e84ccd2b72a90ce66ce6e5569dfcc62f9cd69f0315d59c6a91",
+                            "docker.io/openshift/origin-node:v3.10",
+                            "docker.io/openshift/origin-node:v3.10.0"
+                        ],
+                        "sizeBytes": 1281495850
+                    },
+                    {
+                        "names": [
+                            "docker.io/heketi/heketi@sha256:d847e721966c6b6b09a50cbe3ec209d7d6cf4ad7cca204cf114028c98a39aecd",
+                            "docker.io/heketi/heketi:latest"
+                        ],
+                        "sizeBytes": 361586900
+                    },
+                    {
+                        "names": [
+                            "docker.io/gluster/gluster-centos@sha256:850fd2399d254f678b40bebe1602aa0c46d60facc7804b922c81c1524e05903a",
+                            "docker.io/gluster/gluster-centos:latest"
+                        ],
+                        "sizeBytes": 328338103
+                    },
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-pod@sha256:6ae0714fe9bf19f1312e2a869bc3d7b7cd01aea330c33675f1e215e3de857385",
+                            "docker.io/openshift/origin-pod:v3.10.0"
+                        ],
+                        "sizeBytes": 222597999
+                    }
+                ],
+                "nodeInfo": {
+                    "architecture": "amd64",
+                    "bootID": "9bced612-abc1-4129-8d92-b17e786df8dd",
+                    "containerRuntimeVersion": "docker://1.13.1",
+                    "kernelVersion": "4.13.9-300.fc27.x86_64",
+                    "kubeProxyVersion": "v1.10.0+b81c8f8",
+                    "kubeletVersion": "v1.10.0+b81c8f8",
+                    "machineID": "a883f7e82e0645578114dafea6fca8bb",
+                    "operatingSystem": "linux",
+                    "osImage": "Fedora 27 (Cloud Edition)",
+                    "systemUUID": "A883F7E8-2E06-4557-8114-DAFEA6FCA8BB"
+                }
+            }
+        },
+        {
+            "apiVersion": "v1",
+            "kind": "Node",
+            "metadata": {
+                "annotations": {
+                    "volumes.kubernetes.io/controller-managed-attach-detach": "true"
+                },
+                "creationTimestamp": "2018-08-10T23:53:32Z",
+                "labels": {
+                    "beta.kubernetes.io/arch": "amd64",
+                    "beta.kubernetes.io/os": "linux",
+                    "glusterfs": "storage-host",
+                    "kubernetes.io/hostname": "fedora3.openshift.io",
+                    "node-role.kubernetes.io/infra": "true"
+                },
+                "name": "fedora3.openshift.io",
+                "namespace": "",
+                "resourceVersion": "1732410",
+                "selfLink": "/api/v1/nodes/fedora3.openshift.io",
+                "uid": "9646e307-9cf8-11e8-964a-525400650cba"
+            },
+            "spec": {
+                "externalID": "fedora3.openshift.io"
+            },
+            "status": {
+                "addresses": [
+                    {
+                        "address": "192.168.124.171",
+                        "type": "InternalIP"
+                    },
+                    {
+                        "address": "fedora3.openshift.io",
+                        "type": "Hostname"
+                    }
+                ],
+                "allocatable": {
+                    "cpu": "4",
+                    "hugepages-1Gi": "0",
+                    "hugepages-2Mi": "0",
+                    "memory": "8070068Ki",
+                    "pods": "250"
+                },
+                "capacity": {
+                    "cpu": "4",
+                    "hugepages-1Gi": "0",
+                    "hugepages-2Mi": "0",
+                    "memory": "8172468Ki",
+                    "pods": "250"
+                },
+                "conditions": [
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:53:32Z",
+                        "message": "kubelet has sufficient disk space available",
+                        "reason": "KubeletHasSufficientDisk",
+                        "status": "False",
+                        "type": "OutOfDisk"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:53:32Z",
+                        "message": "kubelet has sufficient memory available",
+                        "reason": "KubeletHasSufficientMemory",
+                        "status": "False",
+                        "type": "MemoryPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:53:32Z",
+                        "message": "kubelet has no disk pressure",
+                        "reason": "KubeletHasNoDiskPressure",
+                        "status": "False",
+                        "type": "DiskPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-10T23:53:32Z",
+                        "message": "kubelet has sufficient PID available",
+                        "reason": "KubeletHasSufficientPID",
+                        "status": "False",
+                        "type": "PIDPressure"
+                    },
+                    {
+                        "lastHeartbeatTime": "2018-08-23T20:01:16Z",
+                        "lastTransitionTime": "2018-08-11T00:01:06Z",
+                        "message": "kubelet is posting ready status",
+                        "reason": "KubeletReady",
+                        "status": "True",
+                        "type": "Ready"
+                    }
+                ],
+                "daemonEndpoints": {
+                    "kubeletEndpoint": {
+                        "Port": 10250
+                    }
+                },
+                "images": [
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-node@sha256:d8e0b4e5912e12e84ccd2b72a90ce66ce6e5569dfcc62f9cd69f0315d59c6a91",
+                            "docker.io/openshift/origin-node:v3.10",
+                            "docker.io/openshift/origin-node:v3.10.0"
+                        ],
+                        "sizeBytes": 1281495850
+                    },
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-deployer@sha256:72d013cdfdf3d16557b64ac0a459c2fc4e90e37422ceed1564a2f69d68607e2a",
+                            "docker.io/openshift/origin-deployer:v3.10.0"
+                        ],
+                        "sizeBytes": 815862538
+                    },
+                    {
+                        "names": [
+                            "docker.io/heketi/heketi@sha256:e6d0362d217573a3f92792e14c611d75df04eb7bc8f245e8c44c4a9c3a870ee1",
+                            "docker.io/heketi/heketi:latest"
+                        ],
+                        "sizeBytes": 384664289
+                    },
+                    {
+                        "names": [
+                            "docker.io/gluster/gluster-centos@sha256:850fd2399d254f678b40bebe1602aa0c46d60facc7804b922c81c1524e05903a",
+                            "docker.io/gluster/gluster-centos:latest"
+                        ],
+                        "sizeBytes": 328338103
+                    },
+                    {
+                        "names": [
+                            "docker.io/openshift/origin-pod@sha256:6ae0714fe9bf19f1312e2a869bc3d7b7cd01aea330c33675f1e215e3de857385",
+                            "docker.io/openshift/origin-pod:v3.10.0"
+                        ],
+                        "sizeBytes": 222597999
+                    }
+                ],
+                "nodeInfo": {
+                    "architecture": "amd64",
+                    "bootID": "a81e3aa0-bf11-432d-b671-aa7d86344c3f",
+                    "containerRuntimeVersion": "docker://1.13.1",
+                    "kernelVersion": "4.13.9-300.fc27.x86_64",
+                    "kubeProxyVersion": "v1.10.0+b81c8f8",
+                    "kubeletVersion": "v1.10.0+b81c8f8",
+                    "machineID": "95bf4677a2ac4f8daa29a31efdb09eed",
+                    "operatingSystem": "linux",
+                    "osImage": "Fedora 27 (Cloud Edition)",
+                    "systemUUID": "95BF4677-A2AC-4F8D-AA29-A31EFDB09EED"
+                }
+            }
+        }
+    ],
+    "kind": "List",
+    "metadata": {
+        "resourceVersion": "",
+        "selfLink": ""
+    }
+}

+ 1 - 0
roles/lib_openshift/test/test_data/openssl1.txt

@@ -0,0 +1 @@
+subject=C = US, CN = fedora1.openshift.io, L = Raleigh, O = Red Hat, ST = North Carolina, OU = OpenShift

+ 162 - 0
roles/lib_openshift/test/test_oc_csr_approve.py

@@ -0,0 +1,162 @@
+import os
+import sys
+
+import pytest
+
+from ansible.module_utils.basic import AnsibleModule
+
+try:
+    # python3, mock is built in.
+    from unittest.mock import patch
+except ImportError:
+    # In python2, mock is installed via pip.
+    from mock import patch
+
+MODULE_PATH = os.path.realpath(os.path.join(__file__, os.pardir, os.pardir, 'library'))
+sys.path.insert(1, MODULE_PATH)
+
+import oc_csr_approve  # noqa
+
+# base path for text files with sample outputs.
+ASSET_PATH = os.path.realpath(os.path.join(__file__, os.pardir, 'test_data'))
+
+RUN_CMD_MOCK = 'ansible.module_utils.basic.AnsibleModule.run_command'
+
+
+class DummyModule(AnsibleModule):
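+    '''AnsibleModule stand-in: skips real argument parsing and raises on
+       fail_json so tests can assert on the failure message.'''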
+    def _load_params(self):
+        self.params = {}
+
+    def exit_json(*args, **kwargs):
+        return 0
+
+    def fail_json(*args, **kwargs):
+        raise Exception(kwargs['msg'])
+
+
+def test_parse_subject_cn():
+    subject = 'subject=/C=US/CN=fedora1.openshift.io/L=Raleigh/O=Red Hat/ST=North Carolina/OU=OpenShift\n'
+    assert oc_csr_approve.parse_subject_cn(subject) == 'fedora1.openshift.io'
+
+    subject = 'subject=C = US, CN = test.io, L = City, O = Company, ST = State, OU = Dept\n'
+    assert oc_csr_approve.parse_subject_cn(subject) == 'test.io'
+
+
+def test_get_ready_nodes():
+    output_file = os.path.join(ASSET_PATH, 'oc_get_nodes.json')
+    with open(output_file) as stdoutfile:
+        oc_get_nodes_stdout = stdoutfile.read()
+
+    module = DummyModule({})
+
+    with patch(RUN_CMD_MOCK) as call_mock:
+        call_mock.return_value = (0, oc_get_nodes_stdout, '')
+        ready_nodes = oc_csr_approve.get_ready_nodes(module, 'oc', '/dev/null')
+    print(ready_nodes)
+    assert ready_nodes == ['fedora1.openshift.io', 'fedora3.openshift.io']
+
+
+def test_get_csrs():
+    module = DummyModule({})
+    output_file = os.path.join(ASSET_PATH, 'oc_csr_approve_pending.json')
+    with open(output_file) as stdoutfile:
+        oc_get_csr_out = stdoutfile.read()
+
+    # mock oc get csr call to cluster
+    with patch(RUN_CMD_MOCK) as call_mock:
+        call_mock.return_value = (0, oc_get_csr_out, '')
+        csrs = oc_csr_approve.get_csrs(module, 'oc', '/dev/null')
+
+    assert csrs[0]['kind'] == "CertificateSigningRequest"
+
+    output_file = os.path.join(ASSET_PATH, 'openssl1.txt')
+    with open(output_file) as stdoutfile:
+        openssl_out = stdoutfile.read()
+
+    # mock openssl req call.
+    node_list = ['fedora2.mguginolocal.com']
+    with patch(RUN_CMD_MOCK) as call_mock:
+        call_mock.return_value = (0, openssl_out, '')
+        csr_dict = oc_csr_approve.process_csrs(module, csrs, node_list, "client")
+    # actually run openssl req call.
+    csr_dict = oc_csr_approve.process_csrs(module, csrs, node_list, "client")
+    assert csr_dict['node-csr-TkefytQp8Dz4Xp7uzcw605MocvI0gWuEOGNrHhOjGNQ'] == 'fedora2.mguginolocal.com'
+
+
+def test_confirm_needed_requests_present():
+    module = DummyModule({})
+    csr_dict = {'some-csr': 'fedora1.openshift.io'}
+    not_ready_nodes = ['host1']
+    with pytest.raises(Exception) as err:
+        oc_csr_approve.confirm_needed_requests_present(
+            module, not_ready_nodes, csr_dict)
+    assert 'Exception: Could not find csr for nodes: host1' in str(err)
+
+    not_ready_nodes = ['fedora1.openshift.io']
+    # this should complete silently
+    oc_csr_approve.confirm_needed_requests_present(
+        module, not_ready_nodes, csr_dict)
+
+
+def test_approve_csrs():
+    module = DummyModule({})
+    oc_bin = 'oc'
+    oc_conf = '/dev/null'
+    csr_dict = {'csr-1': 'example.openshift.io'}
+    with patch(RUN_CMD_MOCK) as call_mock:
+        call_mock.return_value = (0, 'csr-1 ok', '')
+        client_approve_results = oc_csr_approve.approve_csrs(
+            module, oc_bin, oc_conf, csr_dict, 'client')
+    assert client_approve_results == ['csr-1 ok']
+
+
+def test_get_ready_nodes_server():
+    module = DummyModule({})
+    oc_bin = 'oc'
+    oc_conf = '/dev/null'
+    nodes_list = ['fedora1.openshift.io']
+    with patch(RUN_CMD_MOCK) as call_mock:
+        call_mock.return_value = (0, 'ok', '')
+        ready_nodes_server = oc_csr_approve.get_ready_nodes_server(
+            module, oc_bin, oc_conf, nodes_list)
+    assert ready_nodes_server == ['fedora1.openshift.io']
+
+
+def test_get_csrs_server():
+    module = DummyModule({})
+    output_file = os.path.join(ASSET_PATH, 'oc_csr_server_multiple_pends_one_host.json')
+    with open(output_file) as stdoutfile:
+        oc_get_csr_out = stdoutfile.read()
+
+    # mock oc get csr call to cluster
+    with patch(RUN_CMD_MOCK) as call_mock:
+        call_mock.return_value = (0, oc_get_csr_out, '')
+        csrs = oc_csr_approve.get_csrs(module, 'oc', '/dev/null')
+
+    assert csrs[0]['kind'] == "CertificateSigningRequest"
+
+    output_file = os.path.join(ASSET_PATH, 'openssl1.txt')
+    with open(output_file) as stdoutfile:
+        openssl_out = stdoutfile.read()
+
+    node_list = ['fedora1.openshift.io']
+
+    # mock openssl req call.
+    with patch(RUN_CMD_MOCK) as call_mock:
+        call_mock.return_value = (0, openssl_out, '')
+        csr_dict = oc_csr_approve.process_csrs(module, csrs, node_list, "server")
+
+    # actually run openssl req call.
+    node_list = ['fedora2.mguginolocal.com']
+    csr_dict = oc_csr_approve.process_csrs(module, csrs, node_list, "server")
+    assert csr_dict['csr-2cxkp'] == 'fedora2.mguginolocal.com'
+
+
+if __name__ == '__main__':
+    test_parse_subject_cn()
+    test_get_ready_nodes()
+    test_get_csrs()
+    test_confirm_needed_requests_present()
+    test_approve_csrs()
+    test_get_ready_nodes_server()
+    test_get_csrs_server()

+ 8 - 7
roles/openshift_aws/tasks/accept_nodes.yml

@@ -32,11 +32,12 @@
   debug:
     msg: "{{ mastersout.instances[0].public_ip_address }}"
 
-- name: approve nodes
-  oc_adm_csr:
-    #approve_all: True
-    nodes: "{{ instancesout.instances|map(attribute='private_dns_name') | list  }}"
-    timeout: 60
-    fail_on_timeout: "{{ openshift_aws_node_accept_fail_on_timeout | default(false) | bool }}"
-  register: nodeout
+- name: Approve node certificates when bootstrapping
+  oc_csr_approve:
+    oc_bin: "{{ hostvars[groups.masters.0]['first_master_client_binary'] }}"
+    oc_conf: "{{ hostvars[groups.masters.0].openshift.common.config_base }}/master/admin.kubeconfig"
+    node_list: "{{ instancesout.instances|map(attribute='private_dns_name') | list  }}"
+  register: aws_csr_approve
+  retries: 30
+  until: aws_csr_approve is succeeded
   delegate_to: "{{ groups.masters.0 }}"

+ 8 - 5
roles/openshift_gcp/tasks/configure_master_bootstrap.yml

@@ -29,9 +29,12 @@
     name: "openshift-bootstrap-update.timer"
     state: started
 
-- name: Bootstrap all nodes that were identified with bootstrap metadata
-  run_once: true
-  oc_adm_csr:
-    nodes: "{{ groups['all'] | map('extract', hostvars) | selectattr('gce_metadata.bootstrap', 'match', 'true') | map(attribute='gce_name') | list }}"
-    timeout: 60
+- name: Approve node certificates when bootstrapping
+  oc_csr_approve:
+    oc_bin: "{{ hostvars[groups.masters.0]['first_master_client_binary'] }}"
+    oc_conf: "{{ hostvars[groups.masters.0].openshift.common.config_base }}/master/admin.kubeconfig"
+    node_list: "{{ groups['all'] | map('extract', hostvars) | selectattr('gce_metadata.bootstrap', 'match', 'true') | map(attribute='gce_name') | list }}"
+  register: gcp_csr_approve
+  retries: 30
+  until: gcp_csr_approve is succeeded
   when: groups['all'] | map('extract', hostvars) | selectattr('gce_metadata.bootstrap', 'match', 'true') | map(attribute='gce_name') | list | length > 0

+ 8 - 25
roles/openshift_node/tasks/upgrade.yml

@@ -57,32 +57,15 @@
 - import_tasks: upgrade/restart.yml
 
 - name: Approve node certificates when bootstrapping
-  oc_adm_csr:
-    nodes: "{{ openshift.node.nodename | lower }}"
-    timeout: 180
-    fail_on_timeout: true
+  oc_csr_approve:
+    oc_bin: "{{ hostvars[groups.oo_first_master.0]['first_master_client_binary'] }}"
+    oc_conf: "{{ openshift.common.config_base }}/master/admin.kubeconfig"
+    node_list:
+    - "{{ openshift.node.nodename | lower }}"
   delegate_to: "{{ groups.oo_first_master.0 }}"
-  ignore_errors: true
-
-- name: Wait for node to be ready
-  oc_obj:
-    state: list
-    kind: node
-    name: "{{ openshift.node.nodename | lower }}"
-  register: node_output
-  delegate_to: "{{ groups.oo_first_master.0 }}"
-  until:
-  - node_output.results is defined
-  - node_output.results.returncode is defined
-  - node_output.results.returncode == 0
-  - node_output.results.results is defined
-  - node_output.results.results | length  > 0
-  - node_output.results.results[0].status is defined
-  - node_output.results.results[0].status.conditions is defined
-  - node_output.results.results[0].status.conditions | selectattr('type', 'match', '^Ready$') | map(attribute='status') | join | bool == True
-  # Give the node three minutes to come back online.
-  retries: 36
-  delay: 5
+  register: node_upgrade_oc_csr_approve
+  retries: 30
+  until: node_upgrade_oc_csr_approve is succeeded
 
 - import_tasks: journald.yml