Selaa lähdekoodia

Add CL role and playbook for Node Problem Detector

Joel Smith 7 vuotta sitten
vanhempi
commit
d3b29259b9

+ 9 - 0
playbooks/openshift-node-problem-detector/config.yml

@@ -0,0 +1,9 @@
+---  # Public install entry point: run the shared init, then the private install plays.
+- import_playbook: ../init/main.yml  # the l_* vars below are passed to the init playbook
+  vars:
+    l_sanity_check_hosts: "{{ groups['oo_masters_to_config'] }}"
+    l_openshift_version_check_hosts: 'all:!all'
+    l_openshift_version_set_hosts: 'oo_masters_to_config:!oo_first_master'
+    l_init_fact_hosts: oo_masters_to_config
+
+- import_playbook: private/config.yml  # checkpoint start, role install, checkpoint end

+ 30 - 0
playbooks/openshift-node-problem-detector/private/config.yml

@@ -0,0 +1,30 @@
+---
+- name: Node Problem Detector Install Checkpoint Start
+  gather_facts: false
+  hosts: all
+  tasks:
+  - name: Set Node Problem Detector install 'In Progress'
+    set_stats:
+      data:
+        installer_phase_node_problem_detector:
+          start: "{{ lookup('pipe', 'date +%Y%m%d%H%M%SZ') }}"  # output of the `date` command
+          status: 'In Progress'
+    run_once: true  # record the phase once rather than once per host
+
+- name: OpenShift Node Problem Detector
+  hosts: oo_first_master  # the roles below are applied to this group only
+  roles:
+  - openshift_facts
+  - openshift_node_problem_detector  # its tasks/main.yaml dispatches on the *_state variable
+
+- name: Node Problem Detector Install Checkpoint End  # named to pair with "Install Checkpoint Start"
+  hosts: all
+  gather_facts: false
+  tasks:
+  - name: Set Node Problem Detector install 'Complete'
+    run_once: true  # record the phase once rather than once per host
+    set_stats:
+      data:
+        installer_phase_node_problem_detector:
+          status: "Complete"
+          end: "{{ lookup('pipe', 'date +%Y%m%d%H%M%SZ') }}"  # output of the `date` command

+ 1 - 0
playbooks/openshift-node-problem-detector/private/roles

@@ -0,0 +1 @@
+../../../roles

+ 10 - 0
playbooks/openshift-node-problem-detector/private/uninstall.yml

@@ -0,0 +1,10 @@
+---  # Private uninstall play: runs only the role's uninstall task file.
+- name: Uninstall Node Problem Detector
+  vars:
+    openshift_node_problem_detector_state: absent  # overrides the role default of "present"
+  hosts: oo_first_master
+  tasks:
+  - name: Run the Node Problem Detector Uninstall Role Tasks
+    include_role:
+      tasks_from: uninstall
+      name: openshift_node_problem_detector

+ 9 - 0
playbooks/openshift-node-problem-detector/private/upgrade.yml

@@ -0,0 +1,9 @@
+---  # Private upgrade play: runs only the role's upgrade task file.
+- name: Upgrade Node Problem Detector
+  hosts: oo_first_master
+  roles:
+  - openshift_facts
+  tasks:
+  - import_role:
+      tasks_from: upgrade.yaml
+      name: openshift_node_problem_detector

+ 9 - 0
playbooks/openshift-node-problem-detector/uninstall.yml

@@ -0,0 +1,9 @@
+---  # Public uninstall entry point: run the shared init, then the private uninstall play.
+- import_playbook: ../init/main.yml  # the l_* vars below are passed to the init playbook
+  vars:
+    l_sanity_check_hosts: "{{ groups['oo_masters_to_config'] }}"
+    l_openshift_version_check_hosts: 'all:!all'
+    l_openshift_version_set_hosts: 'oo_masters_to_config:!oo_first_master'
+    l_init_fact_hosts: oo_masters_to_config
+
+- import_playbook: private/uninstall.yml  # removes the objects created by the role

+ 9 - 0
playbooks/openshift-node-problem-detector/upgrade.yml

@@ -0,0 +1,9 @@
+---  # Public upgrade entry point: run the shared init, then the private upgrade play.
+- import_playbook: ../init/main.yml  # the l_* vars below are passed to the init playbook
+  vars:
+    l_sanity_check_hosts: "{{ groups['oo_masters_to_config'] }}"
+    l_openshift_version_check_hosts: 'all:!all'
+    l_openshift_version_set_hosts: 'oo_masters_to_config:!oo_first_master'
+    l_init_fact_hosts: oo_masters_to_config
+
+- import_playbook: private/upgrade.yml  # runs the role's upgrade task file

+ 31 - 0
roles/openshift_node_problem_detector/README.md

@@ -0,0 +1,31 @@
+OpenShift Node Problem Detector
+===============================
+
+Install the Node Problem Detector
+
+Role Variables
+--------------
+Check defaults/main.yaml
+
+
+Example Playbook
+----------------
+
+#!/usr/bin/ansible-playbook
+
+Notes
+-----
+
+This is currently experimental software.  This role allows users to install the Node Problem Detector and creates a service account with enough permissions to run it.
+
+https://github.com/openshift/node-problem-detector
+
+License
+-------
+
+Apache License, Version 2.0
+
+Author Information
+------------------
+
+OpenShift

+ 31 - 0
roles/openshift_node_problem_detector/defaults/main.yaml

@@ -0,0 +1,31 @@
+---
+# Common settings: desired state, target namespace and temporary-file handling
+openshift_node_problem_detector_state: present
+openshift_node_problem_detector_namespace: openshift-infra
+openshift_node_problem_detector_tmp_location: /tmp
+openshift_node_problem_detector_delete_tempfiles: true
+
+# Image prefix and version, looked up by openshift_deployment_type
+openshift_node_problem_detector_image_dict:
+  origin:
+    prefix: "docker.io/openshift/"
+    version: "{{ openshift_image_tag }}"
+  openshift-enterprise:
+    prefix: "registry.access.redhat.com/openshift3/ose-"
+    version: "{{ openshift_image_tag }}"
+
+openshift_node_problem_detector_image_prefix: "{{ openshift_node_problem_detector_image_dict[openshift_deployment_type]['prefix'] }}"
+openshift_node_problem_detector_image_version: "{{ openshift_node_problem_detector_image_dict[openshift_deployment_type]['version'] }}"
+
+
+# DaemonSet name and the template it is rendered from
+openshift_node_problem_detector_daemonset_name: node-problem-detector
+openshift_node_problem_detector_daemonset_template_file: templates/node-problem-detector-daemonset.yaml.j2
+
+# Service account and the cluster role bound to it (NOTE(review): the role's tasks bind but never create this cluster role; confirm it exists)
+openshift_node_problem_detector_service_account: node-problem-detector
+openshift_node_problem_detector_cluster_role_name: node-problem-detector
+
+# ConfigMap name and the file its data is read from
+openshift_node_problem_detector_configmap_name: node-problem-detector
+openshift_node_problem_detector_configmap_filename: files/node-problem-detector-configmap.yaml

+ 81 - 0
roles/openshift_node_problem_detector/files/node-problem-detector-configmap.yaml

@@ -0,0 +1,81 @@
+apiVersion: v1  # tasks/install.yaml loads this file with include_vars and uses only `data`
+kind: ConfigMap
+metadata:
+  name: node-problem-detector  # same as the default openshift_node_problem_detector_configmap_name
+data:  # the daemonset mounts this configmap at /etc/npd and passes both JSON files to --system-log-monitors
+  docker-monitor.json: |  # journald monitor, source "docker"; one temporary rule (CorruptDockerImage)
+    {
+        "plugin": "journald",
+        "pluginConfig": {
+                "source": "docker"
+        },
+        "logPath": "/host/log/journal",
+        "lookback": "5m",
+        "bufferSize": 10,
+        "source": "docker-monitor",
+        "conditions": [],
+        "rules": [
+                {
+                        "type": "temporary",
+                        "reason": "CorruptDockerImage",
+                        "pattern": "Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) /var/lib/docker/image/(.+): directory not empty.*"
+                }
+        ]
+    }
+  kernel-monitor.json: |  # journald monitor, source "kernel"; temporary rules plus permanent rules that set the KernelDeadlock condition
+    {
+        "plugin": "journald",
+        "pluginConfig": {
+                "source": "kernel"
+        },
+        "logPath": "/host/log/journal",
+        "lookback": "5m",
+        "bufferSize": 10,
+        "source": "kernel-monitor",
+        "conditions": [
+                {
+                        "type": "KernelDeadlock",
+                        "reason": "KernelHasNoDeadlock",
+                        "message": "kernel has no deadlock"
+                }
+        ],
+        "rules": [
+                {
+                        "type": "temporary",
+                        "reason": "OOMKilling",
+                        "pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB"
+                },
+                {
+                        "type": "temporary",
+                        "reason": "TaskHung",
+                        "pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\."
+                },
+                {
+                        "type": "temporary",
+                        "reason": "UnregisterNetDevice",
+                        "pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
+                },
+                {
+                        "type": "temporary",
+                        "reason": "KernelOops",
+                        "pattern": "BUG: unable to handle kernel NULL pointer dereference at .*"
+                },
+                {
+                        "type": "temporary",
+                        "reason": "KernelOops",
+                        "pattern": "divide error: 0000 \\[#\\d+\\] SMP"
+                },
+                {
+                        "type": "permanent",
+                        "condition": "KernelDeadlock",
+                        "reason": "AUFSUmountHung",
+                        "pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
+                },
+                {
+                        "type": "permanent",
+                        "condition": "KernelDeadlock",
+                        "reason": "DockerHung",
+                        "pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
+                }
+        ]
+    }

+ 5 - 0
roles/openshift_node_problem_detector/meta/main.yaml

@@ -0,0 +1,5 @@
+---  # Role metadata: declares the roles this role depends on.
+dependencies:
+- role: lib_openshift  # presumably supplies the oc_* modules used in tasks/ (confirm)
+- role: openshift_facts
+- role: lib_utils

+ 49 - 0
roles/openshift_node_problem_detector/tasks/install.yaml

@@ -0,0 +1,49 @@
+---
+- name: create Node Problem Detector service account
+  oc_serviceaccount:
+    state: present
+    namespace: "{{ openshift_node_problem_detector_namespace }}"
+    name: "{{ openshift_node_problem_detector_service_account }}"
+
+- name: bind node-problem-detector cluster role to the node-problem-detector service account
+  oc_adm_policy_user:
+    resource_kind: cluster-role
+    resource_name: "{{ openshift_node_problem_detector_cluster_role_name }}"
+    namespace: "{{ openshift_node_problem_detector_namespace }}"
+    user: "system:serviceaccount:{{ openshift_node_problem_detector_namespace }}:{{ openshift_node_problem_detector_service_account }}"
+    state: present
+
+- name: Grant privileged SCC to node problem detector service account
+  oc_adm_policy_user:
+    state: present
+    user: "system:serviceaccount:{{ openshift_node_problem_detector_namespace }}:{{ openshift_node_problem_detector_service_account }}"
+    namespace: "{{ openshift_node_problem_detector_namespace }}"
+    resource_kind: scc
+    resource_name: privileged  # the daemonset container runs with securityContext.privileged: true
+
+- name: read node problem detector configmap
+  include_vars:
+    file: "{{ openshift_node_problem_detector_configmap_filename }}"
+    name: openshift_node_problem_detector_configmap  # the loaded content is stored under this variable
+
+- name: create node problem detector configuration configmap
+  oc_configmap:
+    name: "{{ openshift_node_problem_detector_configmap_name }}"
+    namespace: "{{ openshift_node_problem_detector_namespace }}"
+    from_literal: "{{ openshift_node_problem_detector_configmap.data }}"  # only the `data` section of the file is used
+    state: present
+
+- name: create node problem detector
+  template:
+    dest: "{{ openshift_node_problem_detector_tmp_location }}/npd-ds.yaml"
+    src: "{{ openshift_node_problem_detector_daemonset_template_file }}"
+
+- name: create node problem detector daemonset
+  oc_obj:
+    state: present
+    kind: daemonset
+    namespace: "{{ openshift_node_problem_detector_namespace }}"
+    name: "{{ openshift_node_problem_detector_daemonset_name }}"
+    delete_after: "{{ openshift_node_problem_detector_delete_tempfiles }}"
+    files:  # the manifest rendered by the template task above
+    - "{{ openshift_node_problem_detector_tmp_location }}/npd-ds.yaml"

+ 6 - 0
roles/openshift_node_problem_detector/tasks/main.yaml

@@ -0,0 +1,6 @@
+---  # Role entry point: dispatch on openshift_node_problem_detector_state.
+- when: openshift_node_problem_detector_state == "present"
+  include_tasks: install.yaml
+
+- when: openshift_node_problem_detector_state == "absent"
+  include_tasks: uninstall.yaml

+ 35 - 0
roles/openshift_node_problem_detector/tasks/uninstall.yaml

@@ -0,0 +1,35 @@
+---  # Removes the daemonset, both policy bindings, the service account and the configmap.
+- name: Ensure the node problem detector is absent
+  oc_obj:
+    kind: daemonset
+    name: "{{ openshift_node_problem_detector_daemonset_name }}"
+    namespace: "{{ openshift_node_problem_detector_namespace }}"
+    state: absent
+
+- name: Remove privileged SCC from node problem detector service account
+  oc_adm_policy_user:
+    resource_kind: scc
+    resource_name: privileged
+    user: "system:serviceaccount:{{ openshift_node_problem_detector_namespace }}:{{ openshift_node_problem_detector_service_account }}"
+    namespace: "{{ openshift_node_problem_detector_namespace }}"
+    state: absent
+
+- name: remove binding of node-problem-detector cluster role to the node-problem-detector service account
+  oc_adm_policy_user:
+    resource_kind: cluster-role
+    resource_name: "{{ openshift_node_problem_detector_cluster_role_name }}"
+    user: "system:serviceaccount:{{ openshift_node_problem_detector_namespace }}:{{ openshift_node_problem_detector_service_account }}"
+    namespace: "{{ openshift_node_problem_detector_namespace }}"
+    state: absent
+
+- name: remove node problem detector service account
+  oc_serviceaccount:
+    state: absent
+    namespace: "{{ openshift_node_problem_detector_namespace }}"
+    name: "{{ openshift_node_problem_detector_service_account }}"
+
+- name: remove node problem detector configuration configmap
+  oc_configmap:
+    name: "{{ openshift_node_problem_detector_configmap_name }}"
+    namespace: "{{ openshift_node_problem_detector_namespace }}"
+    state: absent

+ 13 - 0
roles/openshift_node_problem_detector/tasks/upgrade.yaml

@@ -0,0 +1,13 @@
+---  # Re-applies the install tasks, but only when the daemonset already exists.
+- name: Check if node problem detector daemonset exists
+  oc_obj:
+    state: list
+    kind: daemonset
+    name: "{{ openshift_node_problem_detector_daemonset_name }}"
+    namespace: "{{ openshift_node_problem_detector_namespace }}"  # the namespace install.yaml creates the daemonset in
+  register: npd_daemonset
+
+- name: Upgrade node problem detector daemonset
+  include_tasks: install.yaml  # the role ships install.yaml; there is no install_daemonset.yaml
+  when:
+  - npd_daemonset.results.results[0] != {}  # skip when the lookup returned an empty result

+ 57 - 0
roles/openshift_node_problem_detector/templates/node-problem-detector-daemonset.yaml.j2

@@ -0,0 +1,57 @@
+apiVersion: extensions/v1beta1  # DaemonSet manifest rendered by the template task in tasks/install.yaml
+kind: DaemonSet
+metadata:
+  labels:
+    app: {{ openshift_node_problem_detector_daemonset_name }}
+  name: {{ openshift_node_problem_detector_daemonset_name }}
+spec:
+  template:
+    metadata:
+      labels:
+        name: {{ openshift_node_problem_detector_daemonset_name }}
+    spec:
+      containers:
+      - command:
+        - node-problem-detector
+        - --logtostderr
+        - --system-log-monitors=/etc/npd/kernel-monitor.json,/etc/npd/docker-monitor.json
+        env:
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: spec.nodeName
+        image: {{ openshift_node_problem_detector_image_prefix }}node-problem-detector:{{ openshift_node_problem_detector_image_version }}
+        imagePullPolicy: Always
+        name: {{ openshift_node_problem_detector_daemonset_name }}
+        resources: {}
+        securityContext:
+          privileged: true  # tasks/install.yaml grants the service account the privileged SCC
+        terminationMessagePath: /dev/termination-log
+        terminationMessagePolicy: File
+        volumeMounts:
+        - mountPath: /host/log  # host /var/log/; the monitor configs read /host/log/journal
+          name: log
+          readOnly: true
+        - mountPath: /etc/localtime
+          name: localtime
+          readOnly: true
+        - mountPath: /etc/npd  # monitor JSON files from the configmap volume below
+          name: config
+      restartPolicy: Always
+      securityContext: {}
+      serviceAccount: {{ openshift_node_problem_detector_service_account }}
+      serviceAccountName: {{ openshift_node_problem_detector_service_account }}
+      terminationGracePeriodSeconds: 30
+      volumes:
+      - hostPath:
+          path: /var/log/
+        name: log
+      - hostPath:
+          path: /etc/localtime
+        name: localtime
+      - configMap:
+          name: {{ openshift_node_problem_detector_configmap_name }}
+        name: config
+  updateStrategy:
+    type: RollingUpdate