瀏覽代碼

updated triggers and items to have better descriptions and multipliers

Matt Woodson 9 年之前
父節點
當前提交
2f0bbb5781

+ 6 - 6
roles/os_zabbix/vars/template_docker.yml

@@ -52,35 +52,35 @@ g_template_docker:
     - Docker Storage
     value_type: float
   ztriggers:
-  - description: 'docker.ping failed on {HOST.NAME}'
+  - name: 'docker.ping failed on {HOST.NAME}'
     expression: '{Template Docker:docker.ping.max(#3)}<1'
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_ping.asciidoc'
     priority: high
 
-  - description: 'Docker storage is using LOOPBACK on {HOST.NAME}'
+  - name: 'Docker storage is using LOOPBACK on {HOST.NAME}'
     expression: '{Template Docker:docker.storage.is_loopback.last()}<>0'
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_loopback.asciidoc'
     priority: high
 
-  - description: 'Critically low docker storage data space on {HOST.NAME}'
+  - name: 'Critically low docker storage data space on {HOST.NAME}'
     expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.data.space.available.max(#3)}<5' # < 5% or < 5GB
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
     priority: high
 
-  - description: 'Critically low docker storage metadata space on {HOST.NAME}'
+  - name: 'Critically low docker storage metadata space on {HOST.NAME}'
     expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.005' # < 5% or < 5MB
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
     priority: high
 
   # Put triggers that depend on other triggers here (deps must be created first)
-  - description: 'Low docker storage data space on {HOST.NAME}'
+  - name: 'Low docker storage data space on {HOST.NAME}'
     expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.data.space.available.max(#3)}<10' # < 10% or < 10GB
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
     dependencies:
     - 'Critically low docker storage data space on {HOST.NAME}'
     priority: average
 
-  - description: 'Low docker storage metadata space on {HOST.NAME}'
+  - name: 'Low docker storage metadata space on {HOST.NAME}'
     expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.01' # < 10% or < 10MB
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
     dependencies:

+ 1 - 1
roles/os_zabbix/vars/template_heartbeat.yml

@@ -7,7 +7,7 @@ g_template_heartbeat:
     - Heartbeat
     key: heartbeat.ping
   ztriggers:
-  - description: 'Heartbeat.ping has failed on {HOST.NAME}'
+  - name: 'Heartbeat.ping has failed on {HOST.NAME}'
     expression: '{Template Heartbeat:heartbeat.ping.nodata(20m)}=1'
     priority: avg
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_node_heartbeat.asciidoc'

+ 1 - 1
roles/os_zabbix/vars/template_openshift_master.yml

@@ -7,7 +7,7 @@ g_template_openshift_master:
     - Openshift Master
     key: create_app
   ztriggers:
-  - description: 'Application creation has failed on {HOST.NAME}'
+  - name: 'Application creation has failed on {HOST.NAME}'
     expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
     priority: avg

+ 73 - 49
roles/os_zabbix/vars/template_os_linux.yml

@@ -52,112 +52,135 @@ g_template_os_linux:
     - Kernel
     value_type: float
 
-  - key: mem.freemem
+  - key: kernel.all.cpu.nice
     applications:
-    - Memory
+    - Kernel
     value_type: int
 
-  - key: kernel.all.cpu.nice
+  - key: kernel.all.load.1_minute
     applications:
     - Kernel
-    value_type: int
+    value_type: float
 
-  - key: mem.util.bufmem
+  - key: kernel.uname.version
     applications:
-    - Memory
-    value_type: int
+    - Kernel
+    value_type: string
 
-  - key: swap.used
+  - key: kernel.all.uptime
     applications:
-    - Memory
+    - Kernel
     value_type: int
 
-  - key: kernel.all.load.1_minute
+  - key: kernel.all.cpu.user
     applications:
     - Kernel
-    value_type: float
+    value_type: int
 
-  - key: kernel.uname.version
+  - key: kernel.uname.machine
     applications:
     - Kernel
     value_type: string
 
-  - key: swap.length
+  - key: hinv.ncpu
     applications:
-    - Memory
+    - Kernel
     value_type: int
 
-  - key: mem.physmem
+  - key: kernel.all.cpu.steal
     applications:
-    - Memory
+    - Kernel
     value_type: int
 
-  - key: kernel.all.uptime
+  - key: kernel.all.pswitch
     applications:
     - Kernel
     value_type: int
 
-  - key: swap.free
+  - key: kernel.uname.release
     applications:
-    - Memory
-    value_type: int
+    - Kernel
+    value_type: string
 
-  - key: mem.util.available
+  - key: proc.nprocs
     applications:
-    - Memory
+    - Kernel
     value_type: int
 
-  - key: mem.util.used
+  # Memory items
+  - key: mem.freemem
     applications:
     - Memory
     value_type: int
-    description: used memory
+    description: "PCP: free system memory metric from /proc/meminfo"
     multiplier: 1024
     units: B
 
-  - key: kernel.all.cpu.user
+  - key: mem.util.bufmem
     applications:
-    - Kernel
+    - Memory
     value_type: int
+    description: "PCP: Memory allocated for buffer_heads.; I/O buffers metric from /proc/meminfo"
+    multiplier: 1024
+    units: B
 
-  - key: kernel.uname.machine
+  - key: swap.used
     applications:
-    - Kernel
-    value_type: string
+    - Memory
+    value_type: int
+    description: "PCP: swap used metric from /proc/meminfo"
+    multiplier: 1024
+    units: B
 
-  - key: hinv.ncpu
+  - key: swap.length
     applications:
-    - Kernel
+    - Memory
     value_type: int
+    description: "PCP: total swap available metric from /proc/meminfo"
+    multiplier: 1024
+    units: B
 
-  - key: mem.util.cached
+  - key: mem.physmem
     applications:
     - Memory
     value_type: int
-    description: cached memory
+    description: "PCP: The value of this metric corresponds to the \"MemTotal\" field reported by /proc/meminfo. Note that this does not necessarily correspond to actual installed physical memory - there may be areas of the physical address space mapped as ROM in various peripheral devices and the bios may be mirroring certain ROMs in RAM."
     multiplier: 1024
     units: B
 
-  - key: kernel.all.cpu.steal
+  - key: swap.free
     applications:
-    - Kernel
+    - Memory
     value_type: int
+    description: "PCP: swap free metric from /proc/meminfo"
+    multiplier: 1024
+    units: B
 
-  - key: kernel.all.pswitch
+  - key: mem.util.available
     applications:
-    - Kernel
+    - Memory
     value_type: int
+    description: "PCP: The amount of memory that is available for a new workload, without pushing the system into swap. Estimated from MemFree, Active(file), Inactive(file), and SReclaimable, as well as the \"low\" watermarks from /proc/zoneinfo.; available memory from /proc/meminfo"
+    multiplier: 1024
+    units: B
 
-  - key: kernel.uname.release
+  - key: mem.util.used
     applications:
-    - Kernel
-    value_type: string
+    - Memory
+    value_type: int
+    description: "PCP: Used memory is the difference between mem.physmem and mem.freemem; used memory metric from /proc/meminfo"
+    multiplier: 1024
+    units: B
 
-  - key: proc.nprocs
+  - key: mem.util.cached
     applications:
-    - Kernel
+    - Memory
     value_type: int
+    description: "PCP: Memory used by the page cache, including buffered file data.  This is in-memory cache for files read from the disk (the pagecache) but doesn't include SwapCached.; page cache metric from /proc/meminfo"
+    multiplier: 1024
+    units: B
 
+  # Disk items
   - key: filesys.full.xvda2
     applications:
     - Disk
@@ -169,32 +192,33 @@ g_template_os_linux:
     value_type: float
 
   ztriggers:
-  - description: 'Filesystem: / has less than 10% free on {HOST.NAME}'
+  - name: 'Filesystem: / has less than 10% free on {HOST.NAME}'
     expression: '{Template OS Linux:filesys.full.xvda2.last()}>90'
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
     priority: warn
 
-  - description: 'Filesystem: / has less than 5% free on {HOST.NAME}'
+  - name: 'Filesystem: / has less than 5% free on {HOST.NAME}'
     expression: '{Template OS Linux:filesys.full.xvda2.last()}>95'
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
     priority: high
 
-  - description: 'Filesystem: /var has less than 10% free on {HOST.NAME}'
+  - name: 'Filesystem: /var has less than 10% free on {HOST.NAME}'
     expression: '{Template OS Linux:filesys.full.xvda3.last()}>90'
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
     priority: warn
 
-  - description: 'Filesystem: /var has less than 5% free on {HOST.NAME}'
+  - name: 'Filesystem: /var has less than 5% free on {HOST.NAME}'
     expression: '{Template OS Linux:filesys.full.xvda3.last()}>95'
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
     priority: high
 
-  - description: 'Too many TOTAL processes on {HOST.NAME}'
+  - name: 'Too many TOTAL processes on {HOST.NAME}'
     expression: '{Template OS Linux:proc.nprocs.last()}>5000'
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc'
     priority: warn
 
-  - description: 'Lack of available memory on {HOST.NAME}'
-    expression: '{Template OS Linux:mem.freemem.last()}<3000'
+  - name: 'Lack of available memory on {HOST.NAME}'
+    expression: '{Template OS Linux:mem.freemem.last()}<30720000'
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_memory.asciidoc'
     priority: warn
+    description: 'Alert on less than 30MegaBytes.  This is 30 Million Bytes.  30000 KB x 1024'