Browse Source

Merge pull request #6811 from mjudeikis/prometheus-new-template

Automatic merge from submit-queue.

Prometheus new template rebase

Updating Prometheus for new templates/example.

1. New scraping rules, fixes
2. exposed alert manager
3. clean ansible
4. add custom annotations for routes (for example, when the AVI router is in use we need to be able to add custom annotations)
5. Externalise some of the configs

Still work in progress...
FYI: @zgalor
OpenShift Merge Robot 7 years ago
parent
commit
2ec70a36f5

+ 8 - 0
playbooks/openshift-prometheus/private/uninstall.yml

@@ -0,0 +1,8 @@
+---
+# Runs the `uninstall` task file of the openshift_prometheus role on the
+# first host of the `masters` group.
+- name: Uninstall Prometheus
+  hosts: "masters[0]"
+  tasks:
+    - name: Run the Prometheus Uninstall Role Tasks
+      include_role:
+        tasks_from: uninstall
+        name: openshift_prometheus

+ 2 - 0
playbooks/openshift-prometheus/uninstall.yml

@@ -0,0 +1,2 @@
+---
+# Thin wrapper: imports the private uninstall playbook.
+- import_playbook: "private/uninstall.yml"

+ 15 - 0
roles/openshift_prometheus/defaults/main.yaml

@@ -7,9 +7,24 @@ openshift_prometheus_namespace: openshift-metrics
 # defaults hosts for routes
 # defaults hosts for routes
 openshift_prometheus_hostname: prometheus-{{openshift_prometheus_namespace}}.{{openshift_master_default_subdomain}}
 openshift_prometheus_hostname: prometheus-{{openshift_prometheus_namespace}}.{{openshift_master_default_subdomain}}
 openshift_prometheus_alerts_hostname: alerts-{{openshift_prometheus_namespace}}.{{openshift_master_default_subdomain}}
 openshift_prometheus_alerts_hostname: alerts-{{openshift_prometheus_namespace}}.{{openshift_master_default_subdomain}}
+openshift_prometheus_alertmanager_hostname: alertmanager-{{openshift_prometheus_namespace}}.{{openshift_master_default_subdomain}}
+
 
 
 openshift_prometheus_node_selector: {"region":"infra"}
 openshift_prometheus_node_selector: {"region":"infra"}
 
 
+# Service settings. All three services are exposed on
+# openshift_prometheus_service_port; each forwards to the target port of its
+# own oauth-proxy container.
+# -- prometheus
+openshift_prometheus_service_name: prometheus
+openshift_prometheus_service_port: 443
+openshift_prometheus_service_targetport: 8443
+# -- alert buffer
+openshift_prometheus_alerts_service_name: alerts
+openshift_prometheus_alerts_service_targetport: 9443
+# -- alertmanager
+openshift_prometheus_alertmanager_service_name: alertmanager
+openshift_prometheus_alertmanager_service_targetport: 10443
+# Extra serviceaccount annotations supplied by the user. tasks/facts.yaml
+# appends them to the built-in list below.
+openshift_prometheus_serviceaccount_annotations: []
+# Built-in OAuth redirect references, one per route.
+l_openshift_prometheus_serviceaccount_annotations:
+  - serviceaccounts.openshift.io/oauth-redirectreference.prom='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}'
+  - serviceaccounts.openshift.io/oauth-redirectreference.alerts='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}'
+  - serviceaccounts.openshift.io/oauth-redirectreference.alertmanager='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alertmanager"}}'
+
 # additional prometheus rules file
 # additional prometheus rules file
 openshift_prometheus_additional_rules_file: null
 openshift_prometheus_additional_rules_file: null
 
 

+ 10 - 0
roles/openshift_prometheus/tasks/facts.yaml

@@ -0,0 +1,10 @@
+---
+# The kubernetes version impacts the prometheus scraping endpoint
+# so gathering it before constructing the configmap
+- name: get oc version
+  oc_version:
+  register: oc_version
+
+# Merge the built-in OAuth redirect annotations with any user-supplied ones.
+# `unique` keeps the result stable when this file is included more than once
+# for the same host: the fact set here shadows the default, so without it the
+# built-in entries would be appended again on every run.
+- name: set prometheus facts
+  set_fact:
+    kubernetes_version: "{{ oc_version.results.kubernetes_short | float }}"
+    openshift_prometheus_serviceaccount_annotations: "{{ (l_openshift_prometheus_serviceaccount_annotations + (openshift_prometheus_serviceaccount_annotations | list)) | unique }}"

+ 53 - 66
roles/openshift_prometheus/tasks/install_prometheus.yaml

@@ -1,4 +1,6 @@
 ---
 ---
+# set facts
+- include_tasks: facts.yaml
 
 
 # namespace
 # namespace
 - name: Add prometheus project
 - name: Add prometheus project
@@ -9,7 +11,7 @@
     description: Prometheus
     description: Prometheus
 
 
 # secrets
 # secrets
-- name: Set alert and prometheus secrets
+- name: Set alert, alertmanager and prometheus secrets
   oc_secret:
   oc_secret:
     state: present
     state: present
     name: "{{ item }}-proxy"
     name: "{{ item }}-proxy"
@@ -20,30 +22,24 @@
   with_items:
   with_items:
     - prometheus
     - prometheus
     - alerts
     - alerts
+    - alertmanager
 
 
 # serviceaccount
 # serviceaccount
 - name: create prometheus serviceaccount
 - name: create prometheus serviceaccount
   oc_serviceaccount:
   oc_serviceaccount:
     state: present
     state: present
-    name: prometheus
+    name: "{{ openshift_prometheus_service_name }}"
     namespace: "{{ openshift_prometheus_namespace }}"
     namespace: "{{ openshift_prometheus_namespace }}"
-    #    TODO add annotations when supproted
-    #    annotations:
-    #      serviceaccounts.openshift.io/oauth-redirectreference.prom: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}'
-    #      serviceaccounts.openshift.io/oauth-redirectreference.alerts: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}'
-
-    secrets:
-      - prometheus-secrets
   changed_when: no
   changed_when: no
 
 
+
 # TODO remove this when annotations are supported by oc_serviceaccount
 # TODO remove this when annotations are supported by oc_serviceaccount
 - name: annotate serviceaccount
 - name: annotate serviceaccount
   command: >
   command: >
     {{ openshift_client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
     {{ openshift_client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
-    serviceaccount prometheus
-    serviceaccounts.openshift.io/oauth-redirectreference.prom='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}'
-    serviceaccounts.openshift.io/oauth-redirectreference.alerts='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}'
-
+    serviceaccount {{ openshift_prometheus_service_name }} {{ item }}
+  with_items:
+    "{{ openshift_prometheus_serviceaccount_annotations }}"
 
 
 # create clusterrolebinding for prometheus serviceaccount
 # create clusterrolebinding for prometheus serviceaccount
 - name: Set cluster-reader permissions for prometheus
 - name: Set cluster-reader permissions for prometheus
@@ -52,63 +48,61 @@
     namespace: "{{ openshift_prometheus_namespace }}"
     namespace: "{{ openshift_prometheus_namespace }}"
     resource_kind: cluster-role
     resource_kind: cluster-role
     resource_name: cluster-reader
     resource_name: cluster-reader
-    user: "system:serviceaccount:{{ openshift_prometheus_namespace }}:prometheus"
+    user: "system:serviceaccount:{{ openshift_prometheus_namespace }}:{{ openshift_prometheus_service_name }}"
+
 
 
-# create prometheus and alerts services
-# TODO join into 1 task with loop
-- name: Create prometheus service
+- name: create services for prometheus
   oc_service:
   oc_service:
-    state: present
-    name: "{{ item.name }}"
+    name: "{{ openshift_prometheus_service_name }}"
     namespace: "{{ openshift_prometheus_namespace }}"
     namespace: "{{ openshift_prometheus_namespace }}"
-    selector:
-      app: prometheus
     labels:
     labels:
-      name: "{{ item.name }}"
-      #    TODO add annotations when supported
-      #    annotations:
-      #      service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls"
+      name: prometheus
+    annotations:
+      # These must be the `prometheus.io/*` keys: the endpoints scrape job only
+      # keeps services annotated with prometheus.io/scrape and reads the scheme
+      # from prometheus.io/scheme.
+      prometheus.io/scrape: 'true'
+      prometheus.io/scheme: https
+      service.alpha.openshift.io/serving-cert-secret-name: prometheus-tls
     ports:
     ports:
-      - port: 443
-        targetPort: 8443
-  with_items:
-    - name: prometheus
+      - name: prometheus
+        port: "{{ openshift_prometheus_service_port }}"
+        targetPort: "{{ openshift_prometheus_service_targetport }}"
+        protocol: TCP
+    selector:
+      app: prometheus
 
 
-- name: Create alerts service
+- name: create services for alert buffer
   oc_service:
   oc_service:
-    state: present
-    name: "{{ item.name }}"
+    name: "{{ openshift_prometheus_alerts_service_name }}"
     namespace: "{{ openshift_prometheus_namespace }}"
     namespace: "{{ openshift_prometheus_namespace }}"
+    labels:
+      name: prometheus
+    annotations:
+      service.alpha.openshift.io/serving-cert-secret-name: alerts-tls
+    ports:
+      - name: prometheus
+        port: "{{ openshift_prometheus_service_port }}"
+        targetPort: "{{ openshift_prometheus_alerts_service_targetport }}"
+        protocol: TCP
     selector:
     selector:
       app: prometheus
       app: prometheus
+
+- name: create services for alertmanager
+  oc_service:
+    name: "{{ openshift_prometheus_alertmanager_service_name }}"
+    namespace: "{{ openshift_prometheus_namespace }}"
     labels:
     labels:
-      name: "{{ item.name }}"
-      #    TODO add annotations when supported
-      #    annotations:
-      #      service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls"
+      name: prometheus
+    annotations:
+      service.alpha.openshift.io/serving-cert-secret-name: alertmanager-tls
     ports:
     ports:
-      - port: 443
-        targetPort: 9443
-  with_items:
-    - name: alerts
-
-
-# Annotate services with secret name
-# TODO remove this when annotations are supported by oc_service
-- name: annotate prometheus service
-  command: >
-    {{ openshift_client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
-    service prometheus
-    prometheus.io/scrape='true'
-    prometheus.io/scheme=https
-    service.alpha.openshift.io/serving-cert-secret-name=prometheus-tls
-
-- name: annotate alerts service
-  command: >
-    {{ openshift_client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
-    service alerts 'service.alpha.openshift.io/serving-cert-secret-name=prometheus-alerts-tls'
+      - name: prometheus
+        port: "{{ openshift_prometheus_service_port }}"
+        targetPort: "{{ openshift_prometheus_alertmanager_service_targetport }}"
+        protocol: TCP
+    selector:
+      app: prometheus
 
 
 # create prometheus and alerts routes
 # create prometheus and alerts routes
+# TODO: oc_route module should support insecureEdgeTerminationPolicy: Redirect
 - name: create prometheus and alerts routes
 - name: create prometheus and alerts routes
   oc_route:
   oc_route:
     state: present
     state: present
@@ -122,6 +116,8 @@
       host: "{{ openshift_prometheus_hostname }}"
       host: "{{ openshift_prometheus_hostname }}"
     - name: alerts
     - name: alerts
       host: "{{ openshift_prometheus_alerts_hostname }}"
       host: "{{ openshift_prometheus_alerts_hostname }}"
+    - name: alertmanager
+      host: "{{ openshift_prometheus_alertmanager_hostname }}"
 
 
 # Storage
 # Storage
 - name: create prometheus pvc
 - name: create prometheus pvc
@@ -169,15 +165,6 @@
     path: "{{ tempdir }}/prometheus.additional.rules"
     path: "{{ tempdir }}/prometheus.additional.rules"
   register: additional_rules_stat
   register: additional_rules_stat
 
 
-# The kubernetes version impacts the prometheus scraping endpoint
-# so gathering it before constructing the configmap
-- name: get oc version
-  oc_version:
-  register: oc_version
-
-- set_fact:
-    kubernetes_version: "{{ oc_version.results.kubernetes_short | float }}"
-
 - template:
 - template:
     src: prometheus.yml.j2
     src: prometheus.yml.j2
     dest: "{{ tempdir }}/prometheus.yml"
     dest: "{{ tempdir }}/prometheus.yml"
@@ -219,7 +206,7 @@
 - name: Set alertmanager configmap
 - name: Set alertmanager configmap
   oc_configmap:
   oc_configmap:
     state: present
     state: present
-    name: "prometheus-alerts"
+    name: "alertmanager"
     namespace: "{{ openshift_prometheus_namespace }}"
     namespace: "{{ openshift_prometheus_namespace }}"
     from_file:
     from_file:
       alertmanager.yml: "{{ tempdir }}/alertmanager.yml"
       alertmanager.yml: "{{ tempdir }}/alertmanager.yml"

+ 3 - 1
roles/openshift_prometheus/tasks/main.yaml

@@ -16,9 +16,11 @@
 - name: Create templates subdirectory
 - name: Create templates subdirectory
   file:
   file:
     state: directory
     state: directory
-    path: "{{ tempdir }}/templates"
+    path: "{{ tempdir }}/{{ item }}"
     mode: 0755
     mode: 0755
   changed_when: False
   changed_when: False
+  with_items:
+    - templates
 
 
 - include_tasks: install_prometheus.yaml
 - include_tasks: install_prometheus.yaml
   when: openshift_prometheus_state == 'present'
   when: openshift_prometheus_state == 'present'

roles/openshift_prometheus/tasks/uninstall_prometheus.yaml → roles/openshift_prometheus/tasks/uninstall.yaml


+ 71 - 21
roles/openshift_prometheus/templates/prometheus.j2

@@ -19,7 +19,7 @@ spec:
       labels:
       labels:
         app: prometheus
         app: prometheus
     spec:
     spec:
-      serviceAccountName: prometheus
+      serviceAccountName: "{{ openshift_prometheus_service_name }}"
 {% if openshift_prometheus_node_selector is iterable and openshift_prometheus_node_selector | length > 0 %}
 {% if openshift_prometheus_node_selector is iterable and openshift_prometheus_node_selector | length > 0 %}
       nodeSelector:
       nodeSelector:
 {% for key, value in openshift_prometheus_node_selector.items() %}
 {% for key, value in openshift_prometheus_node_selector.items() %}
@@ -47,15 +47,15 @@ spec:
             cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
             cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
 {% endif %}
 {% endif %}
         ports:
         ports:
-        - containerPort: 8443
+        - containerPort: {{ openshift_prometheus_service_targetport }}
           name: web
           name: web
         args:
         args:
         - -provider=openshift
         - -provider=openshift
-        - -https-address=:8443
+        - -https-address=:{{ openshift_prometheus_service_targetport }}
         - -http-address=
         - -http-address=
         - -email-domain=*
         - -email-domain=*
         - -upstream=http://localhost:9090
         - -upstream=http://localhost:9090
-        - -client-id=system:serviceaccount:{{ namespace }}:prometheus
+        - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
         - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
         - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
         - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
         - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
         - -tls-cert=/etc/tls/private/tls.crt
         - -tls-cert=/etc/tls/private/tls.crt
@@ -67,9 +67,9 @@ spec:
         - -skip-auth-regex=^/metrics
         - -skip-auth-regex=^/metrics
         volumeMounts:
         volumeMounts:
         - mountPath: /etc/tls/private
         - mountPath: /etc/tls/private
-          name: prometheus-tls
+          name: prometheus-tls-secret
         - mountPath: /etc/proxy/secrets
         - mountPath: /etc/proxy/secrets
-          name: prometheus-secrets
+          name: prometheus-proxy-secret
         - mountPath: /prometheus
         - mountPath: /prometheus
           name: prometheus-data
           name: prometheus-data
 
 
@@ -104,7 +104,7 @@ spec:
         - mountPath: /prometheus
         - mountPath: /prometheus
           name: prometheus-data
           name: prometheus-data
 
 
-      # Deploy alertmanager behind prometheus-alert-buffer behind an oauth proxy
+      # Deploy alert-buffer behind oauth alerts-proxy
       - name: alerts-proxy
       - name: alerts-proxy
         image: "{{ l_openshift_prometheus_proxy_image_prefix }}oauth-proxy:{{ l_openshift_prometheus_proxy_image_version }}"
         image: "{{ l_openshift_prometheus_proxy_image_prefix }}oauth-proxy:{{ l_openshift_prometheus_proxy_image_version }}"
         imagePullPolicy: IfNotPresent
         imagePullPolicy: IfNotPresent
@@ -124,15 +124,15 @@ spec:
             cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
             cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
 {% endif %}
 {% endif %}
         ports:
         ports:
-        - containerPort: 9443
+        - containerPort: {{ openshift_prometheus_alerts_service_targetport }}
           name: web
           name: web
         args:
         args:
         - -provider=openshift
         - -provider=openshift
-        - -https-address=:9443
+        - -https-address=:{{ openshift_prometheus_alerts_service_targetport }}
         - -http-address=
         - -http-address=
         - -email-domain=*
         - -email-domain=*
         - -upstream=http://localhost:9099
         - -upstream=http://localhost:9099
-        - -client-id=system:serviceaccount:{{ namespace }}:prometheus
+        - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
         - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
         - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
         - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
         - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
         - -tls-cert=/etc/tls/private/tls.crt
         - -tls-cert=/etc/tls/private/tls.crt
@@ -143,9 +143,9 @@ spec:
         - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
         - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
         volumeMounts:
         volumeMounts:
         - mountPath: /etc/tls/private
         - mountPath: /etc/tls/private
-          name: alerts-tls
+          name: alerts-tls-secret
         - mountPath: /etc/proxy/secrets
         - mountPath: /etc/proxy/secrets
-          name: alerts-secrets
+          name: alerts-proxy-secret
 
 
       - name: alert-buffer
       - name: alert-buffer
         args:
         args:
@@ -169,11 +169,54 @@ spec:
 {% endif %}
 {% endif %}
         volumeMounts:
         volumeMounts:
         - mountPath: /alert-buffer
         - mountPath: /alert-buffer
-          name: alert-buffer-data
+          name: alerts-data
         ports:
         ports:
         - containerPort: 9099
         - containerPort: 9099
           name: alert-buf
           name: alert-buf
 
 
+      # Deploy alertmanager behind oauth alertmanager-proxy
+      - name: alertmanager-proxy
+        image: "{{ l_openshift_prometheus_proxy_image_prefix }}oauth-proxy:{{ l_openshift_prometheus_proxy_image_version }}"
+        imagePullPolicy: IfNotPresent
+        # requests/limits must be nested under `resources`; they are not valid
+        # fields directly on a container.
+        resources:
+          requests:
+{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %}
+            memory: "{{ openshift_prometheus_oauth_proxy_memory_requests }}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %}
+            cpu: "{{ openshift_prometheus_oauth_proxy_cpu_requests }}"
+{% endif %}
+          limits:
+{% if openshift_prometheus_oauth_proxy_memory_limit is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %}
+            memory: "{{ openshift_prometheus_oauth_proxy_memory_limit }}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %}
+            cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
+{% endif %}
+        ports:
+        - containerPort: {{ openshift_prometheus_alertmanager_service_targetport }}
+          name: web
+        args:
+        - -provider=openshift
+        - -https-address=:{{ openshift_prometheus_alertmanager_service_targetport }}
+        - -http-address=
+        - -email-domain=*
+        # Proxies to the port given here (9093 on localhost).
+        - -upstream=http://localhost:9093
+        - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
+        - -openshift-ca=/etc/pki/tls/cert.pem
+        - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
+        - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
+        - -tls-cert=/etc/tls/private/tls.crt
+        - -tls-key=/etc/tls/private/tls.key
+        - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token
+        - -cookie-secret-file=/etc/proxy/secrets/session_secret
+        - -skip-auth-regex=^/metrics
+        volumeMounts:
+        - mountPath: /etc/tls/private
+          name: alertmanager-tls-secret
+        - mountPath: /etc/proxy/secrets
+          name: alertmanager-proxy-secret
+
       - name: alertmanager
       - name: alertmanager
         args:
         args:
         - -config.file=/etc/alertmanager/alertmanager.yml
         - -config.file=/etc/alertmanager/alertmanager.yml
@@ -205,14 +248,15 @@ spec:
 
 
       restartPolicy: Always
       restartPolicy: Always
       volumes:
       volumes:
+      
       - name: prometheus-config
       - name: prometheus-config
         configMap:
         configMap:
           defaultMode: 420
           defaultMode: 420
           name: prometheus
           name: prometheus
-      - name: prometheus-secrets
+      - name: prometheus-proxy-secret
         secret:
         secret:
           secretName: prometheus-proxy
           secretName: prometheus-proxy
-      - name: prometheus-tls
+      - name: prometheus-tls-secret
         secret:
         secret:
           secretName: prometheus-tls
           secretName: prometheus-tls
       - name: prometheus-data
       - name: prometheus-data
@@ -225,13 +269,19 @@ spec:
       - name: alertmanager-config
       - name: alertmanager-config
         configMap:
         configMap:
           defaultMode: 420
           defaultMode: 420
-          name: prometheus-alerts
-      - name: alerts-secrets
+          name: alertmanager
+      - name: alertmanager-proxy-secret
         secret:
         secret:
-          secretName: alerts-proxy
-      - name: alerts-tls
+          secretName: alertmanager-proxy  
+      - name: alertmanager-tls-secret
+        secret:
+          secretName: alertmanager-tls 
+      - name: alerts-tls-secret
         secret:
         secret:
-          secretName: prometheus-alerts-tls
+          secretName: alerts-tls
+      - name: alerts-proxy-secret
+        secret:
+          secretName: alerts-proxy
       - name: alertmanager-data
       - name: alertmanager-data
 {% if openshift_prometheus_alertmanager_storage_type == 'pvc' %}
 {% if openshift_prometheus_alertmanager_storage_type == 'pvc' %}
         persistentVolumeClaim:
         persistentVolumeClaim:
@@ -239,7 +289,7 @@ spec:
 {% else %}
 {% else %}
         emptydir: {}
         emptydir: {}
 {% endif %}
 {% endif %}
-      - name: alert-buffer-data
+      - name: alerts-data
 {% if openshift_prometheus_alertbuffer_storage_type == 'pvc' %}
 {% if openshift_prometheus_alertbuffer_storage_type == 'pvc' %}
         persistentVolumeClaim:
         persistentVolumeClaim:
           claimName: {{ openshift_prometheus_alertbuffer_pvc_name }}
           claimName: {{ openshift_prometheus_alertbuffer_pvc_name }}

+ 121 - 54
roles/openshift_prometheus/templates/prometheus.yml.j2

@@ -1,10 +1,5 @@
 rule_files:
 rule_files:
-  - 'prometheus.rules'
-{% if openshift_prometheus_additional_rules_file is defined and openshift_prometheus_additional_rules_file is not none %}
-  - 'prometheus.additional.rules'
-{% endif %}
-
-
+  - '*.rules'
 
 
 # A scrape configuration for running Prometheus on a Kubernetes cluster.
 # A scrape configuration for running Prometheus on a Kubernetes cluster.
 # This uses separate scrape configs for cluster components (i.e. API server, node)
 # This uses separate scrape configs for cluster components (i.e. API server, node)
@@ -39,31 +34,11 @@ scrape_configs:
     action: keep
     action: keep
     regex: default;kubernetes;https
     regex: default;kubernetes;https
 
 
-# Scrape config for nodes.
-#
-# Each node exposes a /metrics endpoint that contains operational metrics for
-# the Kubelet and other components.
-- job_name: 'kubernetes-nodes'
-
-  scheme: https
-  tls_config:
-    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
-  kubernetes_sd_configs:
-  - role: node
-
-  relabel_configs:
-  - action: labelmap
-    regex: __meta_kubernetes_node_label_(.+)
-
 # Scrape config for controllers.
 # Scrape config for controllers.
 #
 #
 # Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for
 # Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for
 # the controllers.
 # the controllers.
 #
 #
-# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via
-#       endpoints.
 - job_name: 'kubernetes-controllers'
 - job_name: 'kubernetes-controllers'
 
 
   scheme: https
   scheme: https
@@ -87,6 +62,27 @@ scrape_configs:
     regex: (.+)(?::\d+)
     regex: (.+)(?::\d+)
     replacement: $1:8444
     replacement: $1:8444
 
 
+# Scrape config for nodes.
+#
+# Every node serves a /metrics endpoint with operational metrics for the
+# Kubelet and other components.
+- job_name: 'kubernetes-nodes'
+  scheme: https
+  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+  tls_config:
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+  kubernetes_sd_configs:
+  - role: node
+  # Copy the node labels onto the scraped targets.
+  relabel_configs:
+  - action: labelmap
+    regex: __meta_kubernetes_node_label_(.+)
+  # Drop a very high cardinality metric that is incorrect in 3.7. It will be
+  # fixed in 3.9.
+  metric_relabel_configs:
+  - action: drop
+    source_labels: [__name__]
+    regex: 'openshift_sdn_pod_(setup|teardown)_latency(.*)'
+
 # Scrape config for cAdvisor.
 # Scrape config for cAdvisor.
 #
 #
 # Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that
 # Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that
@@ -107,6 +103,14 @@ scrape_configs:
   kubernetes_sd_configs:
   kubernetes_sd_configs:
   - role: node
   - role: node
 
 
+  # Exclude a set of high cardinality metrics that can contribute to significant
+  # memory use in large clusters. These can be selectively enabled as necessary
+  # for medium or small clusters.
+  metric_relabel_configs:
+  - source_labels: [__name__]
+    action: drop
+    regex: 'container_(cpu_user_seconds_total|cpu_cfs_periods_total|memory_usage_bytes|memory_swap|memory_working_set_bytes|memory_cache|last_seen|fs_(read_seconds_total|write_seconds_total|sector_(.*)|io_(.*)|reads_merged_total|writes_merged_total)|tasks_state|memory_failcnt|memory_failures_total|spec_memory_swap_limit_bytes|fs_(.*)_bytes_total|spec_(.*))'
+
   relabel_configs:
   relabel_configs:
   - action: labelmap
   - action: labelmap
     regex: __meta_kubernetes_node_label_(.+)
     regex: __meta_kubernetes_node_label_(.+)
@@ -133,38 +137,101 @@ scrape_configs:
   - role: endpoints
   - role: endpoints
 
 
   relabel_configs:
   relabel_configs:
-  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
-    action: keep
-    regex: true
-  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
-    action: replace
-    target_label: __scheme__
-    regex: (https?)
-  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+    # only scrape infrastructure components
+    - source_labels: [__meta_kubernetes_namespace]
+      action: keep
+      regex: 'default|logging|metrics|kube-.+|openshift|openshift-.+'
+    # drop infrastructure components managed by other scrape targets
+    - source_labels: [__meta_kubernetes_service_name]
+      action: drop
+      regex: 'prometheus-node-exporter'
+    # only those that have requested scraping
+    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
+      action: keep
+      regex: true
+    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
+      action: replace
+      target_label: __scheme__
+      regex: (https?)
+    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+      action: replace
+      target_label: __metrics_path__
+      regex: (.+)
+    - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+      action: replace
+      target_label: __address__
+      regex: (.+)(?::\d+);(\d+)
+      replacement: $1:$2
+    - action: labelmap
+      regex: __meta_kubernetes_service_label_(.+)
+    - source_labels: [__meta_kubernetes_namespace]
+      action: replace
+      target_label: kubernetes_namespace
+    - source_labels: [__meta_kubernetes_service_name]
+      action: replace
+      target_label: kubernetes_name
+
+# Scrape config for node-exporter, which is expected to be running on port 9100.
+- job_name: 'kubernetes-nodes-exporter'
+
+  tls_config:
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+
+  kubernetes_sd_configs:
+  - role: node
+
+  metric_relabel_configs:
+  - source_labels: [__name__]
+    action: drop
+    regex: 'node_cpu|node_(disk|scrape_collector)_.+'
+  # preserve a subset of the network, netstat, vmstat, and filesystem series
+  - source_labels: [__name__]
     action: replace
     action: replace
-    target_label: __metrics_path__
-    regex: (.+)
-  - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+    regex: '(node_(netstat_Ip_.+|vmstat_(nr|thp)_.+|filesystem_(free|size|device_error)|network_(transmit|receive)_(drop|errs)))'
+    target_label: __name__
+    replacement: renamed_$1
+  - source_labels: [__name__]
+    action: drop
+    regex: 'node_(netstat|vmstat|filesystem|network)_.+'
+  - source_labels: [__name__]
     action: replace
     action: replace
+    regex: 'renamed_(.+)'
+    target_label: __name__
+    replacement: $1
+  # drop any partial expensive series
+  - source_labels: [__name__, device]
+    action: drop
+    regex: 'node_network_.+;veth.+'
+  - source_labels: [__name__, mountpoint]
+    action: drop
+    regex: 'node_filesystem_(free|size|device_error);([^/].*|/.+)'
+
+  relabel_configs:
+  - source_labels: [__address__]
+    regex: '(.*):10250'
+    replacement: '${1}:9100'
     target_label: __address__
     target_label: __address__
-    regex: (.+)(?::\d+);(\d+)
-    replacement: $1:$2
-  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username]
-    action: replace
-    target_label: __basic_auth_username__
-    regex: (.+)
-  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password]
-    action: replace
-    target_label: __basic_auth_password__
-    regex: (.+)
+  - source_labels: [__meta_kubernetes_node_label_kubernetes_io_hostname]
+    target_label: __instance__
   - action: labelmap
   - action: labelmap
-    regex: __meta_kubernetes_service_label_(.+)
-  - source_labels: [__meta_kubernetes_namespace]
-    action: replace
-    target_label: kubernetes_namespace
-  - source_labels: [__meta_kubernetes_service_name]
-    action: replace
-    target_label: kubernetes_name
+    regex: __meta_kubernetes_node_label_(.+)
+
+# Scrape config for the template service broker
+- job_name: 'openshift-template-service-broker'
+  scheme: https
+  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+  tls_config:
+    server_name: apiserver.openshift-template-service-broker.svc
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt
+
+  kubernetes_sd_configs:
+  - role: endpoints
+
+  # Keep only the target whose namespace;service;port-name triple matches.
+  relabel_configs:
+  - action: keep
+    source_labels:
+    - __meta_kubernetes_namespace
+    - __meta_kubernetes_service_name
+    - __meta_kubernetes_endpoint_port_name
+    regex: openshift-template-service-broker;apiserver;https
+
 
 
 alerting:
 alerting:
   alertmanagers:
   alertmanagers: