
Merge pull request #4509 from zgalor/prometheus-role

Create ansible role for deploying prometheus on openshift
Scott Dodson · commit bb59f5617b · 7 years ago

+ 14 - 1
filter_plugins/oo_filters.py

@@ -1024,6 +1024,18 @@ def oo_contains_rule(source, apiGroups, resources, verbs):
     return False
 
 
+def oo_selector_to_string_list(user_dict):
+    """Convert a dict of selectors to a key=value list of strings
+
+Given input of {'region': 'infra', 'zone': 'primary'} returns a list
+of items as ['region=infra', 'zone=primary']
+    """
+    selectors = []
+    for key in user_dict:
+        selectors.append("{}={}".format(key, user_dict[key]))
+    return selectors
+
+
 class FilterModule(object):
     """ Custom ansible filter mapping """
 
@@ -1065,5 +1077,6 @@ class FilterModule(object):
             "oo_openshift_loadbalancer_backends": oo_openshift_loadbalancer_backends,
             "to_padded_yaml": to_padded_yaml,
             "oo_random_word": oo_random_word,
-            "oo_contains_rule": oo_contains_rule
+            "oo_contains_rule": oo_contains_rule,
+            "oo_selector_to_string_list": oo_selector_to_string_list
         }
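
A quick way to exercise the new filter from a playbook (a hypothetical debug task; expected output shown as a comment):

```
- name: Demonstrate oo_selector_to_string_list
  debug:
    msg: "{{ {'region': 'infra', 'zone': 'primary'} | oo_selector_to_string_list }}"
  # prints ["region=infra", "zone=primary"] (order may vary)
```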

+ 4 - 0
playbooks/byo/openshift-cluster/openshift-prometheus.yml

@@ -0,0 +1,4 @@
+---
+- include: initialize_groups.yml
+
+- include: ../../common/openshift-cluster/openshift_prometheus.yml

+ 3 - 0
playbooks/common/openshift-cluster/openshift_hosted.yml

@@ -49,6 +49,9 @@
   - role: cockpit-ui
     when: ( openshift.common.version_gte_3_3_or_1_3  | bool ) and ( openshift_hosted_manage_registry | default(true) | bool ) and not (openshift.docker.hosted_registry_insecure | default(false) | bool)
 
+  - role: openshift_prometheus
+    when: openshift_hosted_prometheus_deploy | default(false) | bool
+
 - name: Update master-config for publicLoggingURL
   hosts: oo_masters_to_config:!oo_first_master
   tags:
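
For reference, the role is opt-in: the `when:` guard above means nothing is deployed unless the inventory or group vars set the flag, e.g. in YAML group vars (a sketch):

```
openshift_hosted_prometheus_deploy: true
```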

+ 9 - 0
playbooks/common/openshift-cluster/openshift_prometheus.yml

@@ -0,0 +1,9 @@
+---
+- include: std_include.yml
+
+- name: OpenShift Prometheus
+  hosts: oo_first_master
+  roles:
+  - openshift_prometheus
+  vars:
+    openshift_prometheus_state: present

+ 95 - 0
roles/openshift_prometheus/README.md

@@ -0,0 +1,95 @@
+OpenShift Prometheus
+====================
+
+OpenShift Prometheus Installation
+
+Requirements
+------------
+
+
+Role Variables
+--------------
+
+For default values, see [`defaults/main.yaml`](defaults/main.yaml).
+
+- `openshift_prometheus_state`: `present` to install/update, `absent` to uninstall.
+
+- `openshift_prometheus_namespace`: project (i.e. namespace) where the components will be
+  deployed.
+
+- `openshift_prometheus_replicas`: The number of replicas for the Prometheus deployment.
+
+- `openshift_prometheus_node_selector`: Selector for the nodes Prometheus will be deployed on, e.g. `{"region": "infra"}`.
+
+- `openshift_prometheus_image_<COMPONENT>`: Specify the image for the given component.
+
+## Storage related variables
+Each Prometheus component (prometheus, alertmanager, alert-buffer, oauth-proxy) can have its PV claim configured by setting the corresponding role variables:
+```
+openshift_prometheus_<COMPONENT>_storage_type: <VALUE>
+openshift_prometheus_<COMPONENT>_pvc_(name|size|access_modes|pv_selector): <VALUE>
+```
+e.g.
+```
+openshift_prometheus_storage_type: pvc
+openshift_prometheus_alertmanager_pvc_name: alertmanager
+openshift_prometheus_alertbuffer_pvc_size: 10G
+openshift_prometheus_pvc_access_modes: [ReadWriteOnce]
+```
+
+## Additional alert rules file variable
+An external file with alert rules can be added by setting the following variable to its path:
+```
+openshift_prometheus_additional_rules_file: <PATH> 
+```
+
+The file content must be in Prometheus alert rules format.
+The following example defines a rule that fires an alert when one of the cluster nodes is down:
+
+```
+groups:
+- name: example-rules
+  interval: 30s # defaults to global interval
+  rules:
+  - alert: Node Down
+    expr: up{job="kubernetes-nodes"} == 0
+    annotations:
+      miqTarget: "ContainerNode"
+      severity: "HIGH"
+      message: "{{ $labels.instance }} is down"
+```
+
+
+## Additional variables to control resource limits
+Each Prometheus component (prometheus, alertmanager, alert-buffer, oauth-proxy) can have CPU and memory limits and requests set via the corresponding role variables:
+```
+openshift_prometheus_<COMPONENT>_(limits|requests)_(memory|cpu): <VALUE>
+```
+e.g.
+```
+openshift_prometheus_alertmanager_limits_memory: 1Gi
+openshift_prometheus_oauth_proxy_requests_cpu: 100m
+```
+
+Dependencies
+------------
+
+openshift_facts
+
+
+Example Playbook
+----------------
+
+```
+- name: Configure openshift-prometheus
+  hosts: oo_first_master
+  roles:
+  - role: openshift_prometheus
+```
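+
+To uninstall, run the same play with `openshift_prometheus_state: absent` (a sketch):
+
+```
+- name: Uninstall openshift-prometheus
+  hosts: oo_first_master
+  roles:
+  - role: openshift_prometheus
+  vars:
+    openshift_prometheus_state: absent
+```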
+
+License
+-------
+
+Apache License, Version 2.0
+

+ 74 - 0
roles/openshift_prometheus/defaults/main.yaml

@@ -0,0 +1,74 @@
+---
+# defaults file for openshift_prometheus
+openshift_prometheus_state: present
+
+openshift_prometheus_namespace: prometheus
+
+openshift_prometheus_replicas: 1
+openshift_prometheus_node_selector: {"region":"infra"}
+
+# images
+openshift_prometheus_image_proxy: "openshift/oauth-proxy:v1.0.0"
+openshift_prometheus_image_prometheus: "openshift/prometheus:v2.0.0-dev"
+openshift_prometheus_image_alertmanager: "openshift/prometheus-alertmanager:dev"
+openshift_prometheus_image_alertbuffer: "ilackarms/message-buffer"
+
+# additional prometheus rules file
+openshift_prometheus_additional_rules_file: null
+
+# All the required exports
+openshift_prometheus_pv_exports:
+  - prometheus
+  - prometheus-alertmanager
+  - prometheus-alertbuffer
+# PV template files and their created object names
+openshift_prometheus_pv_data:
+  - pv_name: prometheus
+    pv_template: prom-pv-server.yml
+    pv_label: Prometheus Server PV
+  - pv_name: prometheus-alertmanager
+    pv_template: prom-pv-alertmanager.yml
+    pv_label: Prometheus Alertmanager PV
+  - pv_name: prometheus-alertbuffer
+    pv_template: prom-pv-alertbuffer.yml
+    pv_label: Prometheus Alert Buffer PV
+
+# Hostname/IP of the NFS server. Currently defaults to first master
+openshift_prometheus_nfs_server: "{{ groups.nfs.0 }}"
+
+# storage
+openshift_prometheus_storage_type: pvc
+openshift_prometheus_pvc_name: prometheus
+openshift_prometheus_pvc_size: 10G
+openshift_prometheus_pvc_access_modes: [ReadWriteOnce]
+openshift_prometheus_pvc_pv_selector: {}
+
+openshift_prometheus_alertmanager_storage_type: pvc
+openshift_prometheus_alertmanager_pvc_name: prometheus-alertmanager
+openshift_prometheus_alertmanager_pvc_size: 10G
+openshift_prometheus_alertmanager_pvc_access_modes: [ReadWriteOnce]
+openshift_prometheus_alertmanager_pvc_pv_selector: {}
+
+openshift_prometheus_alertbuffer_storage_type: pvc
+openshift_prometheus_alertbuffer_pvc_name: prometheus-alertbuffer
+openshift_prometheus_alertbuffer_pvc_size: 10G
+openshift_prometheus_alertbuffer_pvc_access_modes: [ReadWriteOnce]
+openshift_prometheus_alertbuffer_pvc_pv_selector: {}
+
+# container resources
+openshift_prometheus_cpu_limit: null
+openshift_prometheus_memory_limit: null
+openshift_prometheus_cpu_requests: null
+openshift_prometheus_memory_requests: null
+openshift_prometheus_alertmanager_cpu_limit: null
+openshift_prometheus_alertmanager_memory_limit: null
+openshift_prometheus_alertmanager_cpu_requests: null
+openshift_prometheus_alertmanager_memory_requests: null
+openshift_prometheus_alertbuffer_cpu_limit: null
+openshift_prometheus_alertbuffer_memory_limit: null
+openshift_prometheus_alertbuffer_cpu_requests: null
+openshift_prometheus_alertbuffer_memory_requests: null
+openshift_prometheus_oauth_proxy_cpu_limit: null
+openshift_prometheus_oauth_proxy_memory_limit: null
+openshift_prometheus_oauth_proxy_cpu_requests: null
+openshift_prometheus_oauth_proxy_memory_requests: null
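
Any of these defaults can be overridden from the inventory or group vars; a minimal sketch (the values here are illustrative only):

```
openshift_prometheus_namespace: monitoring
openshift_prometheus_node_selector: {"region": "infra"}
openshift_prometheus_image_prometheus: "openshift/prometheus:v2.0.0-dev"
```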

+ 3 - 0
roles/openshift_prometheus/files/openshift_prometheus.exports

@@ -0,0 +1,3 @@
+/exports/prometheus *(rw,no_root_squash,no_wdelay)
+/exports/prometheus-alertmanager *(rw,no_root_squash,no_wdelay)
+/exports/prometheus-alertbuffer *(rw,no_root_squash,no_wdelay)

+ 19 - 0
roles/openshift_prometheus/meta/main.yaml

@@ -0,0 +1,19 @@
+---
+galaxy_info:
+  author: OpenShift Development <dev@lists.openshift.redhat.com>
+  description: Deploy OpenShift prometheus integration for the cluster
+  company: Red Hat, Inc.
+  license: Apache License 2.0
+  min_ansible_version: 2.2
+  platforms:
+  - name: EL
+    versions:
+    - 7
+  - name: Fedora
+    versions:
+    - all
+  categories:
+  - openshift
+dependencies:
+- { role: lib_openshift }
+- { role: openshift_facts }

+ 36 - 0
roles/openshift_prometheus/tasks/create_pvs.yaml

@@ -0,0 +1,36 @@
+---
+# Check for existence and then conditionally:
+# - evaluate templates
+# - PVs
+#
+# These tasks idempotently create required Prometheus PV objects. Do not
+# call this file directly. This file is intended to be run as an
+# include that has a 'with_items' attached to it. Hence the use below
+# of variables like "{{ item.pv_label }}"
+
+- name: "Check if the {{ item.pv_label }} template has been created already"
+  oc_obj:
+    namespace: "{{ openshift_prometheus_namespace }}"
+    state: list
+    kind: pv
+    name: "{{ item.pv_name }}"
+  register: prom_pv_check
+
+# Skip all of this if the PV already exists
+- block:
+    - name: "Ensure the {{ item.pv_label }} template is evaluated"
+      template:
+        src: "{{ item.pv_template }}.j2"
+        dest: "{{ tempdir }}/templates/{{ item.pv_template }}"
+
+    - name: "Ensure {{ item.pv_label }} is created"
+      oc_obj:
+        namespace: "{{ openshift_prometheus_namespace }}"
+        kind: pv
+        name: "{{ item.pv_name }}"
+        state: present
+        delete_after: True
+        files:
+          - "{{ tempdir }}/templates/{{ item.pv_template }}"
+  when:
+    - not prom_pv_check.results.results.0

+ 241 - 0
roles/openshift_prometheus/tasks/install_prometheus.yaml

@@ -0,0 +1,241 @@
+---
+
+# namespace
+- name: Add prometheus project
+  oc_project:
+    state: "{{ state }}"
+    name: "{{ openshift_prometheus_namespace }}"
+    node_selector: "{{ openshift_prometheus_node_selector | oo_selector_to_string_list() }}"
+    description: Prometheus
+
+# secrets
+- name: Set alert and prometheus secrets
+  oc_secret:
+    state: "{{ state }}"
+    name: "{{ item }}-proxy"
+    namespace: "{{ openshift_prometheus_namespace }}"
+    contents:
+      - path: session_secret
+        data: "{{ 43 | oo_random_word }}="
+  with_items:
+    - prometheus
+    - alerts
+
+# serviceaccount
+- name: create prometheus serviceaccount
+  oc_serviceaccount:
+    state: "{{ state }}"
+    name: prometheus
+    namespace: "{{ openshift_prometheus_namespace }}"
+    #    TODO add annotations when supported
+    #    annotations:
+    #      serviceaccounts.openshift.io/oauth-redirectreference.prom: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}'
+    #      serviceaccounts.openshift.io/oauth-redirectreference.alerts: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}'
+
+    secrets:
+      - prometheus-secrets
+  changed_when: no
+
+# TODO remove this when annotations are supported by oc_serviceaccount
+- name: annotate serviceaccount
+  command: >
+    {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
+    serviceaccount prometheus
+    serviceaccounts.openshift.io/oauth-redirectreference.prom='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}'
+    serviceaccounts.openshift.io/oauth-redirectreference.alerts='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}'
+
+
+# create clusterrolebinding for prometheus serviceaccount
+- name: Set cluster-reader permissions for prometheus
+  oc_adm_policy_user:
+    state: "{{ state }}"
+    namespace: "{{ openshift_prometheus_namespace }}"
+    resource_kind: cluster-role
+    resource_name: cluster-reader
+    user: "system:serviceaccount:{{ openshift_prometheus_namespace }}:prometheus"
+
+
+######################################################################
+# NFS
+# In the case that we are not running on a cloud provider, volumes must be statically provisioned
+
+- include: nfs.yaml
+  when: not (openshift_cloudprovider_kind is defined and (openshift_cloudprovider_kind == 'aws' or openshift_cloudprovider_kind == 'gce'))
+
+
+# create prometheus and alerts services
+# TODO join into 1 task with loop
+- name: Create prometheus service
+  oc_service:
+    state: "{{ state }}"
+    name: "{{ item.name }}"
+    namespace: "{{ openshift_prometheus_namespace }}"
+    selector:
+      app: prometheus
+    labels:
+      name: "{{ item.name }}"
+      #    TODO add annotations when supported
+      #    annotations:
+      #      service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls"
+    ports:
+      - port: 443
+        targetPort: 8443
+  with_items:
+    - name: prometheus
+
+- name: Create alerts service
+  oc_service:
+    state: "{{ state }}"
+    name: "{{ item.name }}"
+    namespace: "{{ openshift_prometheus_namespace }}"
+    selector:
+      app: prometheus
+    labels:
+      name: "{{ item.name }}"
+      #    TODO add annotations when supported
+      #    annotations:
+      #      service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls"
+    ports:
+      - port: 443
+        targetPort: 9443
+  with_items:
+    - name: alerts
+
+
+# Annotate services with secret name
+# TODO remove this when annotations are supported by oc_service
+- name: annotate prometheus service
+  command: >
+    {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
+    service prometheus 'service.alpha.openshift.io/serving-cert-secret-name=prometheus-tls'
+
+- name: annotate alerts service
+  command: >
+    {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
+    service alerts 'service.alpha.openshift.io/serving-cert-secret-name=prometheus-alerts-tls'
+
+# create prometheus and alerts routes
+- name: create prometheus and alerts routes
+  oc_route:
+    state: "{{ state }}"
+    name: "{{ item.name }}"
+    namespace: "{{ openshift_prometheus_namespace }}"
+    service_name: "{{ item.name }}"
+    tls_termination: reencrypt
+  with_items:
+    - name: prometheus
+    - name: alerts
+
+# Storage
+- name: create prometheus pvc
+  oc_pvc:
+    namespace: "{{ openshift_prometheus_namespace }}"
+    name: "{{ openshift_prometheus_pvc_name }}"
+    access_modes: "{{ openshift_prometheus_pvc_access_modes }}"
+    volume_capacity: "{{ openshift_prometheus_pvc_size }}"
+    selector: "{{ openshift_prometheus_pvc_pv_selector }}"
+
+- name: create alertmanager pvc
+  oc_pvc:
+    namespace: "{{ openshift_prometheus_namespace }}"
+    name: "{{ openshift_prometheus_alertmanager_pvc_name }}"
+    access_modes: "{{ openshift_prometheus_alertmanager_pvc_access_modes }}"
+    volume_capacity: "{{ openshift_prometheus_alertmanager_pvc_size }}"
+    selector: "{{ openshift_prometheus_alertmanager_pvc_pv_selector }}"
+
+- name: create alertbuffer pvc
+  oc_pvc:
+    namespace: "{{ openshift_prometheus_namespace }}"
+    name: "{{ openshift_prometheus_alertbuffer_pvc_name }}"
+    access_modes: "{{ openshift_prometheus_alertbuffer_pvc_access_modes }}"
+    volume_capacity: "{{ openshift_prometheus_alertbuffer_pvc_size }}"
+    selector: "{{ openshift_prometheus_alertbuffer_pvc_pv_selector }}"
+
+# create prometheus deployment
+- name: Set prometheus deployment template
+  template:
+    src: prometheus_deployment.j2
+    dest: "{{ tempdir }}/templates/prometheus.yaml"
+  vars:
+    namespace: "{{ openshift_prometheus_namespace }}"
+    prom_replicas: "{{ openshift_prometheus_replicas }}"
+
+- name: Set prometheus deployment
+  oc_obj:
+    state: "{{ state }}"
+    name: "prometheus"
+    namespace: "{{ openshift_prometheus_namespace }}"
+    kind: deployment
+    files:
+      - "{{ tempdir }}/templates/prometheus.yaml"
+    delete_after: true
+
+# prometheus configmap
+# Copy the additional rules file if it is defined
+- name: Copy additional rules file to host
+  copy:
+    src: "{{ openshift_prometheus_additional_rules_file }}"
+    dest: "{{ tempdir }}/prometheus.additional.rules"
+  when:
+    - openshift_prometheus_additional_rules_file is defined
+    - openshift_prometheus_additional_rules_file is not none
+    - openshift_prometheus_additional_rules_file | trim | length > 0
+
+- stat:
+    path: "{{ tempdir }}/prometheus.additional.rules"
+  register: additional_rules_stat
+
+# The kubernetes version impacts the prometheus scraping endpoint,
+# so gather it before constructing the configmap
+- name: get oc version
+  oc_version:
+  register: oc_version
+
+- set_fact:
+    kubernetes_version: "{{ oc_version.results.kubernetes_short | float }}"
+
+- template:
+    src: prometheus.yml.j2
+    dest: "{{ tempdir }}/prometheus.yml"
+  changed_when: no
+
+- template:
+    src: prometheus.rules.j2
+    dest: "{{ tempdir }}/prometheus.rules"
+  changed_when: no
+
+# In prometheus configmap create "additional.rules" section if file exists
+- name: Set prometheus configmap
+  oc_configmap:
+    state: "{{ state }}"
+    name: "prometheus"
+    namespace: "{{ openshift_prometheus_namespace }}"
+    from_file:
+      prometheus.rules: "{{ tempdir }}/prometheus.rules"
+      prometheus.additional.rules: "{{ tempdir }}/prometheus.additional.rules"
+      prometheus.yml: "{{ tempdir }}/prometheus.yml"
+  when: additional_rules_stat.stat.exists
+
+- name: Set prometheus configmap
+  oc_configmap:
+    state: "{{ state }}"
+    name: "prometheus"
+    namespace: "{{ openshift_prometheus_namespace }}"
+    from_file:
+      prometheus.rules: "{{ tempdir }}/prometheus.rules"
+      prometheus.yml: "{{ tempdir }}/prometheus.yml"
+  when: not additional_rules_stat.stat.exists
+
+# alertmanager configmap
+- template:
+    src: alertmanager.yml.j2
+    dest: "{{ tempdir }}/alertmanager.yml"
+  changed_when: no
+
+- name: Set alertmanager configmap
+  oc_configmap:
+    state: "{{ state }}"
+    name: "prometheus-alerts"
+    namespace: "{{ openshift_prometheus_namespace }}"
+    from_file:
+      alertmanager.yml: "{{ tempdir }}/alertmanager.yml"

+ 26 - 0
roles/openshift_prometheus/tasks/main.yaml

@@ -0,0 +1,26 @@
+---
+
+- name: Create temp directory for doing work in on target
+  command: mktemp -td openshift-prometheus-ansible-XXXXXX
+  register: mktemp
+  changed_when: False
+
+- set_fact:
+    tempdir: "{{ mktemp.stdout }}"
+
+- name: Create templates subdirectory
+  file:
+    state: directory
+    path: "{{ tempdir }}/templates"
+    mode: 0755
+  changed_when: False
+
+- include: install_prometheus.yaml
+  vars:
+    state: "{{ openshift_prometheus_state }}"
+
+- name: Delete temp directory
+  file:
+    name: "{{ tempdir }}"
+    state: absent
+  changed_when: False

+ 44 - 0
roles/openshift_prometheus/tasks/nfs.yaml

@@ -0,0 +1,44 @@
+---
+# Tasks to statically provision NFS volumes
+# Include if not using dynamic volume provisioning
+- name: Ensure the /exports/ directory exists
+  file:
+    path: /exports/
+    state: directory
+    mode: 0755
+    owner: root
+    group: root
+
+- name: Ensure the Prometheus export directories exist
+  file:
+    path: "/exports/{{ item }}"
+    state: directory
+    mode: 0777
+    owner: nfsnobody
+    group: nfsnobody
+  with_items: "{{ openshift_prometheus_pv_exports }}"
+
+- name: Ensure the NFS exports for Prometheus PVs exist
+  copy:
+    src: openshift_prometheus.exports
+    dest: /etc/exports.d/openshift_prometheus.exports
+  register: nfs_exports_updated
+
+- name: Ensure the NFS export table is refreshed if exports were added
+  command: exportfs -ar
+  when:
+    - nfs_exports_updated.changed
+
+
+######################################################################
+# Create the required Prometheus PVs. Check out these online docs if you
+# need a refresher on includes looping with items:
+# * http://docs.ansible.com/ansible/playbooks_loops.html#loops-and-includes-in-2-0
+# * http://stackoverflow.com/a/35128533
+#
+# TODO: Handle the case where a PV template is updated in
+# openshift-ansible and the change needs to be landed on the managed
+# cluster.
+
+- include: create_pvs.yaml
+  with_items: "{{ openshift_prometheus_pv_data }}"

+ 20 - 0
roles/openshift_prometheus/templates/alertmanager.yml.j2

@@ -0,0 +1,20 @@
+global:
+
+# The root route on which each incoming alert enters.
+route:
+  # default route if none match
+  receiver: alert-buffer-wh
+
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  # TODO:
+  group_by: []
+
+  # All the above attributes are inherited by all child routes and can be
+  # overwritten on each.
+
+receivers:
+- name: alert-buffer-wh
+  webhook_configs:
+  - url: http://localhost:9099/topics/alerts

+ 15 - 0
roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2

@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: prometheus-alertbuffer
+  labels:
+    storage: prometheus-alertbuffer
+spec:
+  capacity:
+    storage: 15Gi
+  accessModes:
+    - ReadWriteOnce
+  nfs:
+    path: /exports/prometheus-alertbuffer
+    server: {{ openshift_prometheus_nfs_server }}
+  persistentVolumeReclaimPolicy: Retain

+ 15 - 0
roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2

@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: prometheus-alertmanager
+  labels:
+    storage: prometheus-alertmanager
+spec:
+  capacity:
+    storage: 15Gi
+  accessModes:
+    - ReadWriteOnce
+  nfs:
+    path: /exports/prometheus-alertmanager
+    server: {{ openshift_prometheus_nfs_server }}
+  persistentVolumeReclaimPolicy: Retain

+ 15 - 0
roles/openshift_prometheus/templates/prom-pv-server.yml.j2

@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: prometheus
+  labels:
+    storage: prometheus
+spec:
+  capacity:
+    storage: 15Gi
+  accessModes:
+    - ReadWriteOnce
+  nfs:
+    path: /exports/prometheus
+    server: {{ openshift_prometheus_nfs_server }}
+  persistentVolumeReclaimPolicy: Retain

+ 4 - 0
roles/openshift_prometheus/templates/prometheus.rules.j2

@@ -0,0 +1,4 @@
+groups:
+- name: example-rules
+  interval: 30s # defaults to global interval
+  rules:

+ 174 - 0
roles/openshift_prometheus/templates/prometheus.yml.j2

@@ -0,0 +1,174 @@
+rule_files:
+  - 'prometheus.rules'
+{% if openshift_prometheus_additional_rules_file is defined and openshift_prometheus_additional_rules_file is not none %}
+  - 'prometheus.additional.rules'
+{% endif %}
+
+
+
+# A scrape configuration for running Prometheus on a Kubernetes cluster.
+# This uses separate scrape configs for cluster components (i.e. API server, node)
+# and services to allow each to use different authentication configs.
+#
+# Kubernetes labels will be added as Prometheus labels on metrics via the
+# `labelmap` relabeling action.
+
+# Scrape config for API servers.
+#
+# Kubernetes exposes API servers as endpoints to the default/kubernetes
+# service so this uses `endpoints` role and uses relabelling to only keep
+# the endpoints associated with the default/kubernetes service using the
+# default named port `https`. This works for single API server deployments as
+# well as HA API server deployments.
+scrape_configs:
+- job_name: 'kubernetes-apiservers'
+
+  kubernetes_sd_configs:
+  - role: endpoints
+
+  scheme: https
+  tls_config:
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+  # Keep only the default/kubernetes service endpoints for the https port. This
+  # adds a target for each API server for which Kubernetes has an endpoint in
+  # the default/kubernetes service.
+  relabel_configs:
+  - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+    action: keep
+    regex: default;kubernetes;https
+
+# Scrape config for nodes.
+#
+# Each node exposes a /metrics endpoint that contains operational metrics for
+# the Kubelet and other components.
+- job_name: 'kubernetes-nodes'
+
+  scheme: https
+  tls_config:
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+  kubernetes_sd_configs:
+  - role: node
+
+  relabel_configs:
+  - action: labelmap
+    regex: __meta_kubernetes_node_label_(.+)
+
+# Scrape config for controllers.
+#
+# Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for
+# the controllers.
+#
+# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via
+#       endpoints.
+- job_name: 'kubernetes-controllers'
+
+  scheme: https
+  tls_config:
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+  kubernetes_sd_configs:
+  - role: endpoints
+
+  # Keep only the default/kubernetes service endpoints for the https port, and then
+  # set the port to 8444. This is the default configuration for the controllers on OpenShift
+  # masters.
+  relabel_configs:
+  - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+    action: keep
+    regex: default;kubernetes;https
+  - source_labels: [__address__]
+    action: replace
+    target_label: __address__
+    regex: (.+)(?::\d+)
+    replacement: $1:8444
+
+# Scrape config for cAdvisor.
+#
+# Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that
+# reports container metrics for each running pod. Scrape those by default.
+- job_name: 'kubernetes-cadvisor'
+
+  scheme: https
+  tls_config:
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+{% if kubernetes_version | float() >= 1.7 | float() %}
+  metrics_path: /metrics/cadvisor
+{% else %}
+  metrics_path: /metrics
+{% endif %}
+
+  kubernetes_sd_configs:
+  - role: node
+
+  relabel_configs:
+  - action: labelmap
+    regex: __meta_kubernetes_node_label_(.+)
+
+# Scrape config for service endpoints.
+#
+# The relabeling allows the actual service scrape endpoint to be configured
+# via the following annotations:
+#
+# * `prometheus.io/scrape`: Only scrape services that have a value of `true`
+# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
+# to set this to `https` & most likely set the `tls_config` of the scrape config.
+# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
+# * `prometheus.io/port`: If the metrics are exposed on a different port to the
+# service then set this appropriately.
+- job_name: 'kubernetes-service-endpoints'
+
+  tls_config:
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    # TODO: this should be per target
+    insecure_skip_verify: true
+
+  kubernetes_sd_configs:
+  - role: endpoints
+
+  relabel_configs:
+  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
+    action: keep
+    regex: true
+  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
+    action: replace
+    target_label: __scheme__
+    regex: (https?)
+  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+    action: replace
+    target_label: __metrics_path__
+    regex: (.+)
+  - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+    action: replace
+    target_label: __address__
+    regex: (.+)(?::\d+);(\d+)
+    replacement: $1:$2
+  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username]
+    action: replace
+    target_label: __basic_auth_username__
+    regex: (.+)
+  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password]
+    action: replace
+    target_label: __basic_auth_password__
+    regex: (.+)
+  - action: labelmap
+    regex: __meta_kubernetes_service_label_(.+)
+  - source_labels: [__meta_kubernetes_namespace]
+    action: replace
+    target_label: kubernetes_namespace
+  - source_labels: [__meta_kubernetes_service_name]
+    action: replace
+    target_label: kubernetes_name
+
+alerting:
+  alertmanagers:
+  - scheme: http
+    static_configs:
+    - targets:
+      - "localhost:9093"

+ 240 - 0
roles/openshift_prometheus/templates/prometheus_deployment.j2

@@ -0,0 +1,240 @@
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: prometheus
+  namespace: {{ namespace }}
+  labels:
+    app: prometheus
+spec:
+  replicas: {{ prom_replicas|default(1) }}
+  selector:
+    matchLabels:
+      app: prometheus
+  template:
+    metadata:
+      name: prometheus
+      labels:
+        app: prometheus
+    spec:
+      serviceAccountName: prometheus
+{% if openshift_prometheus_node_selector is iterable and openshift_prometheus_node_selector | length > 0 %}
+      nodeSelector:
+{% for key, value in openshift_prometheus_node_selector.items() %}
+        {{key}}: "{{value}}"
+{% endfor %}
+{% endif %}
+      containers:
+      # Deploy Prometheus behind an oauth proxy
+      - name: prom-proxy
+        image: "{{ openshift_prometheus_image_proxy }}"
+        imagePullPolicy: IfNotPresent
+        resources:
+          requests:
+{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %}
+            memory: "{{openshift_prometheus_oauth_proxy_memory_requests}}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %}
+            cpu: "{{openshift_prometheus_oauth_proxy_cpu_requests}}"
+{% endif %}
+          limits:
+{% if openshift_prometheus_oauth_proxy_memory_limit is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %}
+            memory: "{{openshift_prometheus_oauth_proxy_memory_limit}}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %}
+            cpu: "{{openshift_prometheus_oauth_proxy_cpu_limit}}"
+{% endif %}
+        ports:
+        - containerPort: 8443
+          name: web
+        args:
+        - -provider=openshift
+        - -https-address=:8443
+        - -http-address=
+        - -email-domain=*
+        - -upstream=http://localhost:9090
+        - -client-id=system:serviceaccount:{{ namespace }}:prometheus
+        - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
+        - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
+        - -tls-cert=/etc/tls/private/tls.crt
+        - -tls-key=/etc/tls/private/tls.key
+        - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token
+        - -cookie-secret-file=/etc/proxy/secrets/session_secret
+        - -skip-auth-regex=^/metrics
+        volumeMounts:
+        - mountPath: /etc/tls/private
+          name: prometheus-tls
+        - mountPath: /etc/proxy/secrets
+          name: prometheus-secrets
+        - mountPath: /prometheus
+          name: prometheus-data
+
+      - name: prometheus
+        args:
+        - --storage.tsdb.retention=6h
+        - --config.file=/etc/prometheus/prometheus.yml
+        - --web.listen-address=localhost:9090
+        image: "{{ openshift_prometheus_image_prometheus }}"
+        imagePullPolicy: IfNotPresent
+        resources:
+          requests:
+{% if openshift_prometheus_memory_requests is defined and openshift_prometheus_memory_requests is not none %}
+            memory: "{{openshift_prometheus_memory_requests}}"
+{% endif %}
+{% if openshift_prometheus_cpu_requests is defined and openshift_prometheus_cpu_requests is not none %}
+            cpu: "{{openshift_prometheus_cpu_requests}}"
+{% endif %}
+          limits:
+{% if openshift_prometheus_memory_limit is defined and openshift_prometheus_memory_limit is not none %}
+            memory: "{{ openshift_prometheus_memory_limit }}"
+{% endif %}
+{% if openshift_prometheus_cpu_limit is defined and openshift_prometheus_cpu_limit is not none %}
+            cpu: "{{openshift_prometheus_cpu_limit}}"
+{% endif %}
+
+        volumeMounts:
+        - mountPath: /etc/prometheus
+          name: prometheus-config
+        - mountPath: /prometheus
+          name: prometheus-data
+
+      # Deploy alertmanager behind prometheus-alert-buffer behind an oauth proxy
+      - name: alerts-proxy
+        image: "{{ openshift_prometheus_image_proxy }}"
+        imagePullPolicy: IfNotPresent
+        resources:
+          requests:
+{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %}
+            memory: "{{openshift_prometheus_oauth_proxy_memory_requests}}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %}
+            cpu: "{{openshift_prometheus_oauth_proxy_cpu_requests}}"
+{% endif %}
+          limits:
+{% if openshift_prometheus_oauth_proxy_memory_limit is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %}
+            memory: "{{openshift_prometheus_oauth_proxy_memory_limit}}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %}
+            cpu: "{{openshift_prometheus_oauth_proxy_cpu_limit}}"
+{% endif %}
+        ports:
+        - containerPort: 9443
+          name: web
+        args:
+        - -provider=openshift
+        - -https-address=:9443
+        - -http-address=
+        - -email-domain=*
+        - -upstream=http://localhost:9099
+        - -client-id=system:serviceaccount:{{ namespace }}:prometheus
+        - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
+        - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
+        - -tls-cert=/etc/tls/private/tls.crt
+        - -tls-key=/etc/tls/private/tls.key
+        - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token
+        - -cookie-secret-file=/etc/proxy/secrets/session_secret
+        volumeMounts:
+        - mountPath: /etc/tls/private
+          name: alerts-tls
+        - mountPath: /etc/proxy/secrets
+          name: alerts-secrets
+
+      - name: alert-buffer
+        args:
+        - --storage-path=/alert-buffer/messages.db
+        image: "{{ openshift_prometheus_image_alertbuffer }}"
+        imagePullPolicy: IfNotPresent
+        resources:
+          requests:
+{% if openshift_prometheus_alertbuffer_memory_requests is defined and openshift_prometheus_alertbuffer_memory_requests is not none %}
+            memory: "{{openshift_prometheus_alertbuffer_memory_requests}}"
+{% endif %}
+{% if openshift_prometheus_alertbuffer_cpu_requests is defined and openshift_prometheus_alertbuffer_cpu_requests is not none %}
+            cpu: "{{openshift_prometheus_alertbuffer_cpu_requests}}"
+{% endif %}
+          limits:
+{% if openshift_prometheus_alertbuffer_memory_limit is defined and openshift_prometheus_alertbuffer_memory_limit is not none %}
+            memory: "{{openshift_prometheus_alertbuffer_memory_limit}}"
+{% endif %}
+{% if openshift_prometheus_alertbuffer_cpu_limit is defined and openshift_prometheus_alertbuffer_cpu_limit is not none %}
+            cpu: "{{openshift_prometheus_alertbuffer_cpu_limit}}"
+{% endif %}
+        volumeMounts:
+        - mountPath: /alert-buffer
+          name: alert-buffer-data
+        ports:
+        - containerPort: 9099
+          name: alert-buf
+
+      - name: alertmanager
+        args:
+        - -config.file=/etc/alertmanager/alertmanager.yml
+        image: "{{ openshift_prometheus_image_alertmanager }}"
+        imagePullPolicy: IfNotPresent
+        resources:
+          requests:
+{% if openshift_prometheus_alertmanager_memory_requests is defined and openshift_prometheus_alertmanager_memory_requests is not none %}
+            memory: "{{openshift_prometheus_alertmanager_memory_requests}}"
+{% endif %}
+{% if openshift_prometheus_alertmanager_cpu_requests is defined and openshift_prometheus_alertmanager_cpu_requests is not none %}
+            cpu: "{{openshift_prometheus_alertmanager_cpu_requests}}"
+{% endif %}
+          limits:
+{% if openshift_prometheus_alertmanager_memory_limit is defined and openshift_prometheus_alertmanager_memory_limit is not none %}
+            memory: "{{openshift_prometheus_alertmanager_memory_limit}}"
+{% endif %}
+{% if openshift_prometheus_alertmanager_cpu_limit is defined and openshift_prometheus_alertmanager_cpu_limit is not none %}
+            cpu: "{{openshift_prometheus_alertmanager_cpu_limit}}"
+{% endif %}
+        ports:
+        - containerPort: 9093
+          name: web
+        volumeMounts:
+        - mountPath: /etc/alertmanager
+          name: alertmanager-config
+        - mountPath: /alertmanager
+          name: alertmanager-data
+
+      restartPolicy: Always
+      volumes:
+      - name: prometheus-config
+        configMap:
+          defaultMode: 420
+          name: prometheus
+      - name: prometheus-secrets
+        secret:
+          secretName: prometheus-proxy
+      - name: prometheus-tls
+        secret:
+          secretName: prometheus-tls
+      - name: prometheus-data
+{% if openshift_prometheus_storage_type == 'pvc' %}
+        persistentVolumeClaim:
+          claimName: {{ openshift_prometheus_pvc_name }}
+{% else %}
+        emptyDir: {}
+{% endif %}
+      - name: alertmanager-config
+        configMap:
+          defaultMode: 420
+          name: prometheus-alerts
+      - name: alerts-secrets
+        secret:
+          secretName: alerts-proxy
+      - name: alerts-tls
+        secret:
+          secretName: prometheus-alerts-tls
+      - name: alertmanager-data
+{% if openshift_prometheus_alertmanager_storage_type == 'pvc' %}
+        persistentVolumeClaim:
+          claimName: {{ openshift_prometheus_alertmanager_pvc_name }}
+{% else %}
+        emptyDir: {}
+{% endif %}
+      - name: alert-buffer-data
+{% if openshift_prometheus_alertbuffer_storage_type == 'pvc' %}
+        persistentVolumeClaim:
+          claimName: {{ openshift_prometheus_alertbuffer_pvc_name }}
+{% else %}
+        emptyDir: {}
+{% endif %}

+ 2 - 0
roles/openshift_prometheus/tests/inventory

@@ -0,0 +1,2 @@
+localhost
+

+ 5 - 0
roles/openshift_prometheus/tests/test.yaml

@@ -0,0 +1,5 @@
+---
+- hosts: localhost
+  remote_user: root
+  roles:
+    - openshift_prometheus