template_os_linux.yml 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. ---
  2. g_template_os_linux:
  3. name: Template OS Linux
  # Items collected by the template. Every entry in this part of the list is
  # assigned to the Kernel application; percentage metrics carry units '%'.
  zitems:
  - key: kernel.uname.sysname
    applications:
    - Kernel
    value_type: string

  - key: kernel.all.cpu.wait.total
    applications:
    - Kernel
    value_type: float
    units: '%'

  - key: kernel.all.cpu.irq.hard
    applications:
    - Kernel
    value_type: float
    units: '%'

  - key: kernel.all.cpu.idle
    applications:
    - Kernel
    value_type: float
    units: '%'

  - key: kernel.uname.distro
    applications:
    - Kernel
    value_type: string

  - key: kernel.uname.nodename
    applications:
    - Kernel
    value_type: string

  - key: kernel.all.cpu.irq.soft
    applications:
    - Kernel
    value_type: float
    units: '%'

  - key: kernel.all.load.15_minute
    applications:
    - Kernel
    value_type: float

  - key: kernel.all.cpu.sys
    applications:
    - Kernel
    value_type: float
    units: '%'

  - key: kernel.all.load.5_minute
    applications:
    - Kernel
    value_type: float

  - key: kernel.all.cpu.nice
    applications:
    - Kernel
    value_type: float
    units: '%'

  - key: kernel.all.load.1_minute
    applications:
    - Kernel
    value_type: float

  - key: kernel.uname.version
    applications:
    - Kernel
    value_type: string

  - key: kernel.all.uptime
    applications:
    - Kernel
    value_type: int

  - key: kernel.all.cpu.user
    applications:
    - Kernel
    value_type: float
    units: '%'

  - key: kernel.uname.machine
    applications:
    - Kernel
    value_type: string

  - key: hinv.ncpu
    applications:
    - Kernel
    value_type: int

  - key: kernel.all.cpu.steal
    applications:
    - Kernel
    value_type: float
    units: '%'

  - key: kernel.all.pswitch
    applications:
    - Kernel
    value_type: int

  - key: kernel.uname.release
    applications:
    - Kernel
    value_type: string

  - key: proc.nprocs
    applications:
    - Kernel
    value_type: int
  # Memory Items
  # Each entry below is assigned to the Memory application, applies
  # multiplier 1024 and reports units B.
  # NOTE(review): the raw values are presumably in KB - confirm against the
  # metric definitions.
  - key: mem.freemem
    applications:
    - Memory
    value_type: int
    description: "PCP: free system memory metric from /proc/meminfo"
    multiplier: 1024
    units: B

  - key: mem.util.bufmem
    applications:
    - Memory
    value_type: int
    description: "PCP: Memory allocated for buffer_heads.; I/O buffers metric from /proc/meminfo"
    multiplier: 1024
    units: B

  - key: swap.used
    applications:
    - Memory
    value_type: int
    description: "PCP: swap used metric from /proc/meminfo"
    multiplier: 1024
    units: B

  - key: swap.length
    applications:
    - Memory
    value_type: int
    description: "PCP: total swap available metric from /proc/meminfo"
    multiplier: 1024
    units: B

  - key: mem.physmem
    applications:
    - Memory
    value_type: int
    description: "PCP: The value of this metric corresponds to the \"MemTotal\" field reported by /proc/meminfo. Note that this does not necessarily correspond to actual installed physical memory - there may be areas of the physical address space mapped as ROM in various peripheral devices and the bios may be mirroring certain ROMs in RAM."
    multiplier: 1024
    units: B

  - key: swap.free
    applications:
    - Memory
    value_type: int
    description: "PCP: swap free metric from /proc/meminfo"
    multiplier: 1024
    units: B

  - key: mem.util.available
    applications:
    - Memory
    value_type: int
    description: "PCP: The amount of memory that is available for a new workload, without pushing the system into swap. Estimated from MemFree, Active(file), Inactive(file), and SReclaimable, as well as the \"low\" watermarks from /proc/zoneinfo.; available memory from /proc/meminfo"
    multiplier: 1024
    units: B

  - key: mem.util.used
    applications:
    - Memory
    value_type: int
    description: "PCP: Used memory is the difference between mem.physmem and mem.freemem; used memory metric from /proc/meminfo"
    multiplier: 1024
    units: B

  - key: mem.util.cached
    applications:
    - Memory
    value_type: int
    description: "PCP: Memory used by the page cache, including buffered file data. This is in-memory cache for files read from the disk (the pagecache) but doesn't include SwapCached.; page cache metric from /proc/meminfo"
    multiplier: 1024
    units: B
  # Discovery rules: one for filesystems, one for disks. Each rule uses the
  # same string for its name and its key.
  # NOTE(review): the unit of `lifetime` is not visible here - confirm.
  zdiscoveryrules:
  - name: disc.filesys
    key: disc.filesys
    lifetime: 1
    description: "Dynamically register the filesystems"

  - name: disc.disk
    key: disc.disk
    lifetime: 1
    description: "Dynamically register disks on a node"
  # Item prototypes. Each entry names its discovery rule via
  # `discoveryrule_key` and is assigned to the Disk application. The
  # {#OSO_FILESYS} / {#OSO_DISK} macros are presumably filled in per
  # discovered filesystem / disk - confirm against the discovery sender.
  zitemprototypes:
  # Filesystem prototypes (rule: disc.filesys)
  - discoveryrule_key: disc.filesys
    name: "disc.filesys.full.{#OSO_FILESYS}"
    key: "disc.filesys.full[{#OSO_FILESYS}]"
    value_type: float
    description: "PCP filesys.full option. This is the percent full returned from pcp filesys.full"
    applications:
    - Disk

  - discoveryrule_key: disc.filesys
    name: "Percentage of used inodes on {#OSO_FILESYS}"
    key: "disc.filesys.inodes.pused[{#OSO_FILESYS}]"
    value_type: float
    description: "PCP derived value of percentage of used inodes on a filesystem."
    applications:
    - Disk

  # Disk prototypes (rule: disc.disk)
  - discoveryrule_key: disc.disk
    name: "TPS (IOPS) for disk {#OSO_DISK}"
    key: "disc.disk.tps[{#OSO_DISK}]"
    value_type: int
    description: "PCP disk.dev.totals metric measured over a period of time. This shows how many disk transactions per second the disk is using"
    applications:
    - Disk

  - discoveryrule_key: disc.disk
    name: "Percent Utilized for disk {#OSO_DISK}"
    key: "disc.disk.putil[{#OSO_DISK}]"
    value_type: float
    description: "PCP disk.dev.avactive metric measured over a period of time. This is the '%util' in the iostat command"
    applications:
    - Disk
  # Trigger prototypes on the per-filesystem item prototypes. Each metric has
  # a `warn` and a `high` threshold on the last collected value:
  #   disc.filesys.full          warn > 85, high > 90
  #   disc.filesys.inodes.pused  warn > 90, high > 95
  ztriggerprototypes:
  - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}'
    expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
    priority: warn

  - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}'
    expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
    priority: high

  - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}'
    expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
    priority: warn

  - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}'
    expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>95'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
    priority: high
  # Static triggers on the template's own items. Each entry carries a `url`
  # pointing at an ops-sop document for the alert.
  ztriggers:
  - name: 'Too many TOTAL processes on {HOST.NAME}'
    expression: '{Template OS Linux:proc.nprocs.last()}>5000'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc'
    priority: warn

  # Threshold 30720000 = 30000 x 1024, as stated in the description below.
  - name: 'Lack of available memory on {HOST.NAME}'
    expression: '{Template OS Linux:mem.freemem.last()}<30720000'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_memory.asciidoc'
    priority: warn
    description: 'Alert on less than 30MegaBytes. This is 30 Million Bytes. 30000 KB x 1024'

  # CPU Utilization
  # The expression takes max(#5) of the idle metric and compares it to 5.
  - name: 'CPU idle less than 5% on {HOST.NAME}'
    expression: '{Template OS Linux:kernel.all.cpu.idle.max(#5)}<5'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
    priority: average
    description: 'CPU is less than 5% idle'
  # 10% idle trigger; it lists the 5% idle trigger under `dependencies`.
  # NOTE(review): the visible text stops at this entry - confirm that the
  # dependencies list has no further members.
  - name: 'CPU idle less than 10% on {HOST.NAME}'
    expression: '{Template OS Linux:kernel.all.cpu.idle.max(#5)}<10'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
    priority: average
    description: 'CPU is less than 10% idle'
    dependencies:
    - 'CPU idle less than 5% on {HOST.NAME}'