Bläddra i källkod

Merge pull request #1021 from mwoodson/master_checks

added pv checks; added trigger dependencies
Matt Woodson 9 år sedan
förälder
incheckning
ab93a71580

+ 67 - 26
roles/os_zabbix/vars/template_openshift_master.yml

@@ -68,6 +68,36 @@ g_template_openshift_master:
     applications:
     - Openshift Master
 
+  - key: openshift.master.pv.total.count
+    description: Total number of Persistent Volumes in the Openshift Cluster
+    type: int
+    applications:
+    - Openshift Master
+
+  - key: openshift.master.pv.available.count
+    description: Total number of Available Persistent Volumes in the Openshift Cluster
+    type: int
+    applications:
+    - Openshift Master
+
+  - key: openshift.master.pv.released.count
+    description: Total number of Released Persistent Volumes in the Openshift Cluster
+    type: int
+    applications:
+    - Openshift Master
+
+  - key: openshift.master.pv.bound.count
+    description: Total number of Bound Persistent Volumes in the Openshift Cluster
+    type: int
+    applications:
+    - Openshift Master
+
+  - key: openshift.master.pv.failed.count
+    description: Total number of Failed Persistent Volumes in the Openshift Cluster
+    type: int
+    applications:
+    - Openshift Master
+
   - key: openshift.master.etcd.create.success
     description: Show number of successful create actions
     type: int
@@ -201,26 +231,6 @@ g_template_openshift_master:
     - Openshift Master Metrics
 
   ztriggers:
-  - name: 'Application creation has failed on {HOST.NAME}'
-    expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
-    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
-    priority: avg
-
-  - name: 'Openshift Master API health check is failing on {HOST.NAME}'
-    expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1'
-    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
-    priority: high
-
-  - name: 'Openshift Master API PING check is failing on {HOST.NAME}'
-    expression: '{Template Openshift Master:openshift.master.api.ping.max(#3)}<1'
-    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
-    priority: high
-
-  - name: 'Openshift Master metric PING check is failing on {HOST.NAME}'
-    expression: '{Template Openshift Master:openshift.master.metric.ping.max(#3)}<1'
-    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
-    priority: avg
-
   - name: 'Openshift Master process not running on {HOST.NAME}'
     expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1'
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
@@ -231,6 +241,16 @@ g_template_openshift_master:
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
     priority: high
 
+  - name: 'Low number of etcd watchers on {HOST.NAME}'
+    expression: '{Template Openshift Master:openshift.master.etcd.watchers.last(#1)}<10 and {Template Openshift Master:openshift.master.etcd.watchers.last(#2)}<10'
+    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
+    priority: avg
+
+  - name: 'Etcd ping failed on {HOST.NAME}'
+    expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
+    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
+    priority: high
+
   - name: 'Number of users for Openshift Master on {HOST.NAME}'
     expression: '{Template Openshift Master:openshift.master.user.count.last()}=0'
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
@@ -241,19 +261,40 @@ g_template_openshift_master:
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
     priority: info
 
-  - name: 'Low number of etcd watchers on {HOST.NAME}'
-    expression: '{Template Openshift Master:openshift.master.etcd.watchers.last(#1)}<10 and {Template Openshift Master:openshift.master.etcd.watchers.last(#2)}<10'
-    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
+  # Triggers that depend on other triggers go below this line; the triggers they depend on must be defined above so they are created first
+  - name: 'Application creation has failed on {HOST.NAME}'
+    expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
+    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
+    dependencies:
+    - 'Openshift Master process not running on {HOST.NAME}'
     priority: avg
 
-  - name: 'Etcd ping failed on {HOST.NAME}'
-    expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
-    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
+  - name: 'Openshift Master API health check is failing on {HOST.NAME}'
+    expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1'
+    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+    dependencies:
+    - 'Openshift Master process not running on {HOST.NAME}'
+    priority: high
+
+  - name: 'Openshift Master API PING check is failing on {HOST.NAME}'
+    expression: '{Template Openshift Master:openshift.master.api.ping.max(#3)}<1'
+    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+    dependencies:
+    - 'Openshift Master process not running on {HOST.NAME}'
     priority: high
 
+  - name: 'Openshift Master metric PING check is failing on {HOST.NAME}'
+    expression: '{Template Openshift Master:openshift.master.metric.ping.max(#3)}<1'
+    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+    dependencies:
+    - 'Openshift Master process not running on {HOST.NAME}'
+    priority: avg
+
   - name: 'Docker Registry check failed on {HOST.NAME}'
     expression: '{Template Openshift Master:openshift.master.registry.healthz.max(#2)}<1'
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+    dependencies:
+    - 'Openshift Master process not running on {HOST.NAME}'
     priority: high
 
   zgraphs:

+ 0 - 12
roles/os_zabbix/vars/template_os_linux.yml

@@ -304,15 +304,3 @@ g_template_os_linux:
     description: 'CPU is less than 10% idle'
     dependencies:
     - 'CPU idle less than 5% on {HOST.NAME}'
-
-  zgraphprototypes:
-  - name: Network Interface Usage
-    width: 1000
-    height: 400
-    graph_items:
-    - item_name: "Bytes per second IN on network interface {#OSO_NET_INTERFACE}"
-      item_type: prototype
-      color: red
-    - item_name: "Bytes per second OUT on network interface {#OSO_NET_INTERFACE}"
-      item_type: prototype
-      color: blue