template_openshift_master.yml 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366
  1. ---
  2. g_template_openshift_master:
  3. name: Template Openshift Master
  4. zitems:
  5. - name: create_app
  6. applications:
  7. - Openshift Master
  8. key: create_app
  9. - key: openshift.master.registry.healthz
  10. description: "Shows the health status of the cluster's docker registry"
  11. type: int
  12. applications:
  13. - Openshift Master
  14. - key: openshift.master.process.count
  15. description: Shows number of master processes running
  16. type: int
  17. applications:
  18. - Openshift Master
  19. - key: openshift.master.api.ping
  20. description: "Verify that the Openshift API is up"
  21. type: int
  22. applications:
  23. - Openshift Master
  24. - key: openshift.master.api.healthz
  25. description: "Checks the healthz check of the master's api: https://master_host/healthz"
  26. type: int
  27. data_type: bool
  28. applications:
  29. - Openshift Master
  30. - key: openshift.master.user.count
  31. description: Shows number of users in a cluster
  32. type: int
  33. applications:
  34. - Openshift Master
  35. - key: openshift.master.pod.running.count
  36. description: Shows number of pods running
  37. type: int
  38. applications:
  39. - Openshift Master
  40. - key: openshift.master.pod.user.running.count
  41. description: Shows number of user pods running (non infrastructure pods)
  42. type: int
  43. applications:
  44. - Openshift Master
  45. - key: openshift.master.pod.total.count
  46. description: Shows total number of pods (running and non running)
  47. type: int
  48. applications:
  49. - Openshift Master
  50. - key: openshift.master.node.count
  51. description: Shows the total number of nodes found in the Openshift Cluster
  52. type: int
  53. applications:
  54. - Openshift Master
  55. - key: openshift.project.count
  56. description: Shows number of projects on a cluster
  57. type: int
  58. applications:
  59. - Openshift Master
  60. - key: openshift.master.pv.total.count
  61. description: Total number of Persistent Volumes in the Openshift Cluster
  62. type: int
  63. applications:
  64. - Openshift Master
  65. - key: openshift.master.pv.available.count
  66. description: Total number of Available Persistent Volumes in the Openshift Cluster
  67. type: int
  68. applications:
  69. - Openshift Master
  70. - key: openshift.master.pv.released.count
  71. description: Total number of Released Persistent Volumes in the Openshift Cluster
  72. type: int
  73. applications:
  74. - Openshift Master
  75. - key: openshift.master.pv.bound.count
  76. description: Total number of Bound Persistent Volumes in the Openshift Cluster
  77. type: int
  78. applications:
  79. - Openshift Master
  80. - key: openshift.master.pv.failed.count
  81. description: Total number of Failed Persistent Volumes in the Openshift Cluster
  82. type: int
  83. applications:
  84. - Openshift Master
  85. - key: openshift.master.skydns.port.open
  86. description: State of the SkyDNS port open and listening
  87. type: int
  88. applications:
  89. - Openshift Master
  90. - key: openshift.master.skydns.query
  91. description: SkyDNS can be queried or not
  92. type: int
  93. applications:
  94. - Openshift Master
  95. - key: openshift.master.etcd.create.success
  96. description: Show number of successful create actions
  97. type: int
  98. applications:
  99. - Openshift Etcd
  100. - key: openshift.master.etcd.create.fail
  101. description: Show number of failed create actions
  102. type: int
  103. applications:
  104. - Openshift Etcd
  105. - key: openshift.master.etcd.delete.success
  106. description: Show number of successful delete actions
  107. type: int
  108. applications:
  109. - Openshift Etcd
  110. - key: openshift.master.etcd.delete.fail
  111. description: Show number of failed delete actions
  112. type: int
  113. applications:
  114. - Openshift Etcd
  115. - key: openshift.master.etcd.get.success
  116. description: Show number of successful get actions
  117. type: int
  118. applications:
  119. - Openshift Etcd
  120. - key: openshift.master.etcd.get.fail
  121. description: Show number of failed get actions
  122. type: int
  123. applications:
  124. - Openshift Etcd
  125. - key: openshift.master.etcd.set.success
  126. description: Show number of successful set actions
  127. type: int
  128. applications:
  129. - Openshift Etcd
  130. - key: openshift.master.etcd.set.fail
  131. description: Show number of failed set actions
  132. type: int
  133. applications:
  134. - Openshift Etcd
  135. - key: openshift.master.etcd.update.success
  136. description: Show number of successful update actions
  137. type: int
  138. applications:
  139. - Openshift Etcd
  140. - key: openshift.master.etcd.update.fail
  141. description: Show number of failed update actions
  142. type: int
  143. applications:
  144. - Openshift Etcd
  145. - key: openshift.master.etcd.watchers
  146. description: Show number of etcd watchers
  147. type: int
  148. applications:
  149. - Openshift Etcd
  150. - key: openshift.master.etcd.ping
  151. description: etcd ping
  152. type: int
  153. applications:
  154. - Openshift Etcd
  155. - key: openshift.master.metric.ping
  156. description: "This check verifies that the https://master/metrics check is alive and communicating properly."
  157. type: int
  158. applications:
  159. - Openshift Master Metrics
  160. - key: openshift.master.apiserver.latency.summary.pods.quantile.list.5
  161. description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed."
  162. type: int
  163. applications:
  164. - Openshift Master Metrics
  165. - key: openshift.master.apiserver.latency.summary.pods.quantile.list.9
  166. description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed."
  167. type: int
  168. applications:
  169. - Openshift Master Metrics
  170. - key: openshift.master.apiserver.latency.summary.pods.quantile.list.99
  171. description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed."
  172. type: int
  173. applications:
  174. - Openshift Master Metrics
  175. - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5
  176. description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed."
  177. type: int
  178. applications:
  179. - Openshift Master Metrics
  180. - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9
  181. description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed."
  182. type: int
  183. applications:
  184. - Openshift Master Metrics
  185. - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99
  186. description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed."
  187. type: int
  188. applications:
  189. - Openshift Master Metrics
  190. - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.5
  191. description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the end to end scheduling operations have taken to completed."
  192. type: int
  193. applications:
  194. - Openshift Master Metrics
  195. - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.9
  196. description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the end to end scheduling operations have taken to completed."
  197. type: int
  198. applications:
  199. - Openshift Master Metrics
  200. - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.99
  201. description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the end to end scheduling operations have taken to completed."
  202. type: int
  203. applications:
  204. - Openshift Master Metrics
  # Trigger definitions for the template. Each expression has the form
  #   {<template name>:<item key>.<function>(<args>)} <operator> <value>
  # so the text after the last dot inside the braces must be a trigger
  # function (max, min, last, sum, ...), never part of the item key.
  ztriggers:
    # ----- Triggers without dependencies -----
    - name: 'Openshift Master process not running on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: high

    - name: 'Too many Openshift Master processes running on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: high

    - name: 'Low number of etcd watchers on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.etcd.watchers.last(#1)}<10 and {Template Openshift Master:openshift.master.etcd.watchers.last(#2)}<10'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
      priority: avg

    - name: 'Etcd ping failed on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
      priority: high

    - name: 'Number of users for Openshift Master on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.user.count.last()}=0'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: info

    - name: 'There are no projects running on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.project.count.last()}=0'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: info

    # Put triggers that depend on other triggers here (deps must be created first)
    - name: 'Application creation has failed on {HOST.NAME}'
      expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      priority: avg

    - name: 'Application creation has failed multiple times in the last hour on {HOST.NAME}'
      expression: '{Template Openshift Master:create_app.sum(1h)}>3'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      description: The application create loop has failed 4 or more times in the last hour
      priority: avg

    - name: 'Openshift Master API health check is failing on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      priority: high

    - name: 'Openshift Master API PING check is failing on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.api.ping.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      priority: high

    - name: 'Openshift Master metric PING check is failing on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.metric.ping.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      priority: avg

    - name: 'Docker Registry check failed on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.registry.healthz.max(#2)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      priority: high

    # The item key is `openshift.master.skydns.port.open` in full; without an
    # explicit function the trailing `open` would be read as the function
    # name. `.max(#3)` matches the other health/ping triggers in this list.
    - name: 'SkyDNS port not listening on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.skydns.port.open.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      dependencies:
        - 'Openshift Master process not running on {HOST.NAME}'
      priority: high

    # Same reasoning: the key is `openshift.master.skydns.query`, so the
    # function has to be appended after it.
    - name: 'SkyDNS query failed on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.skydns.query.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      dependencies:
        - 'Openshift Master API health check is failing on {HOST.NAME}'
      priority: high
  # Graph definitions. Every graph is 900x200 and draws three items, the
  # .5, .9 and .99 quantile keys of one metric family, in red, blue and
  # orange respectively.
  zgraphs:
    # Pod LIST latency quantiles.
    - name: 'Openshift Master API Server Latency Pods LIST Quantiles'
      width: 900
      height: 200
      graph_items:
        - item_name: 'openshift.master.apiserver.latency.summary.pods.quantile.list.5'
          color: 'red'
        - item_name: 'openshift.master.apiserver.latency.summary.pods.quantile.list.9'
          color: 'blue'
        - item_name: 'openshift.master.apiserver.latency.summary.pods.quantile.list.99'
          color: 'orange'

    # Pod WATCHLIST latency quantiles.
    - name: 'Openshift Master API Server Latency Pods WATCHLIST Quantiles'
      width: 900
      height: 200
      graph_items:
        - item_name: 'openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5'
          color: 'red'
        - item_name: 'openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9'
          color: 'blue'
        - item_name: 'openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99'
          color: 'orange'

    # Scheduler end-to-end scheduling latency quantiles.
    - name: 'Openshift Master Scheduler End to End Latency Quantiles'
      width: 900
      height: 200
      graph_items:
        - item_name: 'openshift.master.scheduler.e2e.scheduling.latency.quantile.5'
          color: 'red'
        - item_name: 'openshift.master.scheduler.e2e.scheduling.latency.quantile.9'
          color: 'blue'
        - item_name: 'openshift.master.scheduler.e2e.scheduling.latency.quantile.99'
          color: 'orange'