# template_os_linux.yml
#
# Variables describing the "Template OS Linux" monitoring template: the items
# it collects, its discovery rule, the item/trigger prototypes generated per
# discovered filesystem, and its static triggers.
# NOTE(review): the macro and expression syntax used in this file
# ({HOST.NAME}, {#OSO_FILESYS}, {Template OS Linux:<key>.last()}) looks like
# Zabbix -- confirm against the tooling that consumes these variables.
---
g_template_os_linux:
  name: Template OS Linux
  # Items collected by this template. Each entry gives the metric key, the
  # application group(s) it is filed under, and the stored value type.
  # CPU metrics additionally declare '%' as their display unit.
  zitems:
  # Kernel Items
  - key: kernel.uname.sysname
    applications:
    - Kernel
    value_type: string
  - key: kernel.all.cpu.wait.total
    applications:
    - Kernel
    value_type: float
    units: '%'
  - key: kernel.all.cpu.irq.hard
    applications:
    - Kernel
    value_type: float
    units: '%'
  - key: kernel.all.cpu.idle
    applications:
    - Kernel
    value_type: float
    units: '%'
  - key: kernel.uname.distro
    applications:
    - Kernel
    value_type: string
  - key: kernel.uname.nodename
    applications:
    - Kernel
    value_type: string
  - key: kernel.all.cpu.irq.soft
    applications:
    - Kernel
    value_type: float
    units: '%'
  - key: kernel.all.load.15_minute
    applications:
    - Kernel
    value_type: float
  - key: kernel.all.cpu.sys
    applications:
    - Kernel
    value_type: float
    units: '%'
  - key: kernel.all.load.5_minute
    applications:
    - Kernel
    value_type: float
  - key: kernel.all.cpu.nice
    applications:
    - Kernel
    value_type: float
    units: '%'
  - key: kernel.all.load.1_minute
    applications:
    - Kernel
    value_type: float
  - key: kernel.uname.version
    applications:
    - Kernel
    value_type: string
  - key: kernel.all.uptime
    applications:
    - Kernel
    value_type: int
  - key: kernel.all.cpu.user
    applications:
    - Kernel
    value_type: float
    units: '%'
  - key: kernel.uname.machine
    applications:
    - Kernel
    value_type: string
  - key: hinv.ncpu
    applications:
    - Kernel
    value_type: int
  - key: kernel.all.cpu.steal
    applications:
    - Kernel
    value_type: float
    units: '%'
  - key: kernel.all.pswitch
    applications:
    - Kernel
    value_type: int
  - key: kernel.uname.release
    applications:
    - Kernel
    value_type: string
  - key: proc.nprocs
    applications:
    - Kernel
    value_type: int
  # Memory Items
  # Every memory/swap item is stored as an int with units B and a multiplier
  # of 1024.
  # NOTE(review): presumably the raw values arrive in KB and the multiplier
  # converts them to bytes -- confirm against the metric source.
  - key: mem.freemem
    applications:
    - Memory
    value_type: int
    description: "PCP: free system memory metric from /proc/meminfo"
    multiplier: 1024
    units: B
  - key: mem.util.bufmem
    applications:
    - Memory
    value_type: int
    description: "PCP: Memory allocated for buffer_heads.; I/O buffers metric from /proc/meminfo"
    multiplier: 1024
    units: B
  - key: swap.used
    applications:
    - Memory
    value_type: int
    description: "PCP: swap used metric from /proc/meminfo"
    multiplier: 1024
    units: B
  - key: swap.length
    applications:
    - Memory
    value_type: int
    description: "PCP: total swap available metric from /proc/meminfo"
    multiplier: 1024
    units: B
  - key: mem.physmem
    applications:
    - Memory
    value_type: int
    description: "PCP: The value of this metric corresponds to the \"MemTotal\" field reported by /proc/meminfo. Note that this does not necessarily correspond to actual installed physical memory - there may be areas of the physical address space mapped as ROM in various peripheral devices and the bios may be mirroring certain ROMs in RAM."
    multiplier: 1024
    units: B
  - key: swap.free
    applications:
    - Memory
    value_type: int
    description: "PCP: swap free metric from /proc/meminfo"
    multiplier: 1024
    units: B
  - key: mem.util.available
    applications:
    - Memory
    value_type: int
    description: "PCP: The amount of memory that is available for a new workload, without pushing the system into swap. Estimated from MemFree, Active(file), Inactive(file), and SReclaimable, as well as the \"low\" watermarks from /proc/zoneinfo.; available memory from /proc/meminfo"
    multiplier: 1024
    units: B
  - key: mem.util.used
    applications:
    - Memory
    value_type: int
    description: "PCP: Used memory is the difference between mem.physmem and mem.freemem; used memory metric from /proc/meminfo"
    multiplier: 1024
    units: B
  - key: mem.util.cached
    applications:
    - Memory
    value_type: int
    description: "PCP: Memory used by the page cache, including buffered file data. This is in-memory cache for files read from the disk (the pagecache) but doesn't include SwapCached.; page cache metric from /proc/meminfo"
    multiplier: 1024
    units: B
  # Discovery rules. The single rule below registers filesystems dynamically;
  # its key (disc.filesys) is what item prototypes reference through
  # discoveryrule_key.
  zdiscoveryrules:
  - name: disc.filesys
    key: disc.filesys
    # NOTE(review): the unit of lifetime is not stated in this file
    # (presumably days) -- confirm.
    lifetime: 1
    description: "Dynamically register the filesystems"
  # Item prototypes attached to the disc.filesys discovery rule. The
  # {#OSO_FILESYS} macro in name/key is a per-filesystem placeholder.
  zitemprototypes:
  # Percent of space used on the filesystem.
  - discoveryrule_key: disc.filesys
    name: "disc.filesys.full.{#OSO_FILESYS}"
    key: "disc.filesys.full[{#OSO_FILESYS}]"
    value_type: float
    description: "PCP filesys.full option. This is the percent full returned from pcp filesys.full"
    applications:
    - Disk
  # Percent of inodes used on the filesystem.
  - discoveryrule_key: disc.filesys
    name: "Percentage of used inodes on {#OSO_FILESYS}"
    key: "disc.filesys.inodes.pused[{#OSO_FILESYS}]"
    value_type: float
    description: "PCP derived value of percentage of used inodes on a filesystem."
    applications:
    - Disk
  # Trigger prototypes evaluated per discovered filesystem. Both metrics are
  # "percent used", so "less than N% free" is expressed as "> (100 - N)".
  ztriggerprototypes:
  # Disk space: warn above 85% used, high above 90% used.
  - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}'
    expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
    priority: warn
  - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}'
    expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
    priority: high
  # Inodes: warn above 90% used, high above 95% used.
  - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}'
    expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
    priority: warn
  - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}'
    expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>95'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
    priority: high
  # Static (non-discovered) triggers for this template.
  ztriggers:
  - name: 'Too many TOTAL processes on {HOST.NAME}'
    expression: '{Template OS Linux:proc.nprocs.last()}>5000'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc'
    priority: warn
  # Threshold 30720000 = 30000 x 1024, as stated in the description below.
  # NOTE(review): the trigger name says "available memory" but the expression
  # reads mem.freemem rather than mem.util.available -- confirm this is the
  # intended metric.
  - name: 'Lack of available memory on {HOST.NAME}'
    expression: '{Template OS Linux:mem.freemem.last()}<30720000'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_memory.asciidoc'
    priority: warn
    description: 'Alert on less than 30MegaBytes. This is 30 Million Bytes. 30000 KB x 1024'
  # CPU Utilization #
  # Both CPU triggers require the two most recent samples (last() and
  # last(#2)) to be under the threshold before firing.
  - name: 'CPU idle less than 5% on {HOST.NAME}'
    expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<5 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<5'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
    priority: high
    description: 'CPU is less than 5% idle'
  - name: 'CPU idle less than 10% on {HOST.NAME}'
    expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<10 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<10'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
    priority: warn
    description: 'CPU is less than 10% idle'
    # Declares a dependency on the 5% trigger, referenced by its exact name.
    dependencies:
    - 'CPU idle less than 5% on {HOST.NAME}'