# template_openshift_master.yml (17 KB)
---
g_template_openshift_master:
  name: Template Openshift Master
  # Monitoring items of the template. Every entry in this run lists the
  # "Openshift Master" application; entries are addressed by their `key`.
  zitems:
  # --- application create / build checks ---
  - key: openshift.master.app.create
    name: openshift.master.app.create
    applications:
    - Openshift Master
  - key: openshift.master.app.build.create
    value_type: int
    description: 'check the app create with a build process'
    applications:
    - Openshift Master
  - key: openshift.master.app.create.time
    value_type: float
    description: 'check the time it takes app create with a build process'
    applications:
    - Openshift Master
  - key: openshift.master.app.build.time
    value_type: float
    description: 'check the time it takes app build'
    applications:
    - Openshift Master
  # --- master process and API reachability ---
  - key: openshift.master.process.count
    value_type: int
    description: 'Shows number of master processes running'
    applications:
    - Openshift Master
  - key: openshift.master.api.ping
    value_type: int
    description: 'Verify that the Openshift API is up (uses the cluster API URL)'
    applications:
    - Openshift Master
  - key: openshift.master.local.api.ping
    value_type: int
    description: 'Verify that the Openshift API is up on the host (uses the API URL as the https://127.0.0.1)'
    applications:
    - Openshift Master
  # The two healthz items are the only ones that also set `data_type: bool`.
  - key: openshift.master.api.healthz
    value_type: int
    data_type: bool
    description: "Checks the healthz check of the master's api: https://<cluster_api_url>/healthz"
    applications:
    - Openshift Master
  - key: openshift.master.local.api.healthz
    value_type: int
    data_type: bool
    description: "Checks the healthz check of the master's api: https://127.0.0.1/healthz"
    applications:
    - Openshift Master
  # --- cluster object counts ---
  - key: openshift.master.user.count
    value_type: int
    description: 'Shows number of users in a cluster'
    applications:
    - Openshift Master
  - key: openshift.master.pod.running.count
    value_type: int
    description: 'Shows number of pods running'
    applications:
    - Openshift Master
  - key: openshift.master.pod.user.running.count
    value_type: int
    description: 'Shows number of user pods running (non infrastructure pods)'
    applications:
    - Openshift Master
  - key: openshift.master.pod.total.count
    value_type: int
    description: 'Shows total number of pods (running and non running)'
    applications:
    - Openshift Master
  - key: openshift.master.node.count
    value_type: int
    description: 'Shows the total number of nodes found in the Openshift Cluster'
    applications:
    - Openshift Master
  - key: openshift.project.count
    value_type: int
    description: 'Shows number of projects on a cluster'
    applications:
    - Openshift Master
  # --- persistent volumes ---
  - key: openshift.master.pv.space.total
    value_type: int
    description: 'Shows the total space of pv'
    applications:
    - Openshift Master
  - key: openshift.master.pv.space.available
    value_type: int
    description: 'Shows the available space of pv'
    applications:
    - Openshift Master
  - key: openshift.master.pv.total.count
    value_type: int
    description: 'Total number of Persistent Volumes in the Openshift Cluster'
    applications:
    - Openshift Master
  - key: openshift.master.pv.available.count
    value_type: int
    description: 'Total number of Available Persistent Volumes in the Openshift Cluster'
    applications:
    - Openshift Master
  - key: openshift.master.pv.released.count
    value_type: int
    description: 'Total number of Released Persistent Volumes in the Openshift Cluster'
    applications:
    - Openshift Master
  - key: openshift.master.pv.bound.count
    value_type: int
    description: 'Total number of Bound Persistent Volumes in the Openshift Cluster'
    applications:
    - Openshift Master
  - key: openshift.master.pv.failed.count
    value_type: int
    description: 'Total number of Failed Persistent Volumes in the Openshift Cluster'
    applications:
    - Openshift Master
  # --- SkyDNS ---
  - key: openshift.master.skydns.port.open
    value_type: int
    description: 'State of the SkyDNS port open and listening'
    applications:
    - Openshift Master
  - key: openshift.master.skydns.query
    value_type: int
    description: 'SkyDNS can be queried or not'
    applications:
    - Openshift Master
  # --- etcd items: all of these list the "Openshift Etcd" application ---
  # Success/fail counter pairs for each etcd action type.
  - key: openshift.master.etcd.create.success
    value_type: int
    description: 'Show number of successful create actions'
    applications:
    - Openshift Etcd
  - key: openshift.master.etcd.create.fail
    value_type: int
    description: 'Show number of failed create actions'
    applications:
    - Openshift Etcd
  - key: openshift.master.etcd.delete.success
    value_type: int
    description: 'Show number of successful delete actions'
    applications:
    - Openshift Etcd
  - key: openshift.master.etcd.delete.fail
    value_type: int
    description: 'Show number of failed delete actions'
    applications:
    - Openshift Etcd
  - key: openshift.master.etcd.get.success
    value_type: int
    description: 'Show number of successful get actions'
    applications:
    - Openshift Etcd
  - key: openshift.master.etcd.get.fail
    value_type: int
    description: 'Show number of failed get actions'
    applications:
    - Openshift Etcd
  - key: openshift.master.etcd.set.success
    value_type: int
    description: 'Show number of successful set actions'
    applications:
    - Openshift Etcd
  - key: openshift.master.etcd.set.fail
    value_type: int
    description: 'Show number of failed set actions'
    applications:
    - Openshift Etcd
  - key: openshift.master.etcd.update.success
    value_type: int
    description: 'Show number of successful update actions'
    applications:
    - Openshift Etcd
  - key: openshift.master.etcd.update.fail
    value_type: int
    description: 'Show number of failed update actions'
    applications:
    - Openshift Etcd
  # Watcher count and liveness.
  - key: openshift.master.etcd.watchers
    value_type: int
    description: 'Show number of etcd watchers'
    applications:
    - Openshift Etcd
  - key: openshift.master.etcd.ping
    value_type: int
    description: 'etcd ping'
    applications:
    - Openshift Etcd
  # --- master /metrics endpoint liveness ---
  - key: openshift.master.metric.ping
    description: 'This check verifies that the https://master/metrics check is alive and communicating properly.'
    value_type: int
    applications:
    - Openshift Master Metrics
  # --- node state counts (filed under "Openshift Master") ---
  - key: openshift.master.nodesnotready.count
    description: 'This check shows how many nodes in a cluster are in NotReady state.'
    value_type: int
    applications:
    - Openshift Master
  - key: openshift.master.nodesnotschedulable.count
    description: 'This check shows how many nodes in a cluster are not schedulable.'
    value_type: int
    applications:
    - Openshift Master
  # --- API server latency quantiles for pod LIST requests ---
  # Key suffix .5 / .9 / .99 corresponds to the 50% / 90% / 99% quantile
  # named in each description.
  - key: openshift.master.apiserver.latency.summary.pods.quantile.list.5
    description: 'Value from https://master/metrics. This is the time, in milliseconds, that 50% of the pod LIST operations have taken to complete.'
    value_type: int
    applications:
    - Openshift Master Metrics
  - key: openshift.master.apiserver.latency.summary.pods.quantile.list.9
    description: 'Value from https://master/metrics. This is the time, in milliseconds, that 90% of the pod LIST operations have taken to complete.'
    value_type: int
    applications:
    - Openshift Master Metrics
  - key: openshift.master.apiserver.latency.summary.pods.quantile.list.99
    description: 'Value from https://master/metrics. This is the time, in milliseconds, that 99% of the pod LIST operations have taken to complete.'
    value_type: int
    applications:
    - Openshift Master Metrics
  # --- API server latency quantiles for pod WATCHLIST requests ---
  # Descriptions name WATCHLIST explicitly so these items can be told apart
  # from the LIST items.
  - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5
    description: 'Value from https://master/metrics. This is the time, in milliseconds, that 50% of the pod WATCHLIST operations have taken to complete.'
    value_type: int
    applications:
    - Openshift Master Metrics
  - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9
    description: 'Value from https://master/metrics. This is the time, in milliseconds, that 90% of the pod WATCHLIST operations have taken to complete.'
    value_type: int
    applications:
    - Openshift Master Metrics
  - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99
    description: 'Value from https://master/metrics. This is the time, in milliseconds, that 99% of the pod WATCHLIST operations have taken to complete.'
    value_type: int
    applications:
    - Openshift Master Metrics
  # --- scheduler end-to-end scheduling latency quantiles ---
  # NOTE(review): the unit ("milliseconds") is carried over from the original
  # descriptions and has not been verified against the metrics endpoint.
  - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.5
    description: 'Value from https://master/metrics. This is the time, in milliseconds, that 50% of the end to end scheduling operations have taken to complete.'
    value_type: int
    applications:
    - Openshift Master Metrics
  - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.9
    description: 'Value from https://master/metrics. This is the time, in milliseconds, that 90% of the end to end scheduling operations have taken to complete.'
    value_type: int
    applications:
    - Openshift Master Metrics
  - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.99
    description: 'Value from https://master/metrics. This is the time, in milliseconds, that 99% of the end to end scheduling operations have taken to complete.'
    value_type: int
    applications:
    - Openshift Master Metrics
  # Discovery rules. The single rule `disc.pv` is the one referenced by the
  # `discoveryrule_key` fields elsewhere in this template.
  zdiscoveryrules:
  - key: disc.pv
    name: disc.pv
    description: 'Dynamically register the Persistent Volumes'
    # NOTE(review): unit of `lifetime` is not stated in this file - confirm
    # against the consumer of this variable.
    lifetime: 1
  # Item prototypes bound to the `disc.pv` discovery rule. The {#OSO_PV}
  # macro appears in both name and key; presumably it is substituted per
  # discovered PV size (see the descriptions) - confirm.
  zitemprototypes:
  - key: 'disc.pv.count[{#OSO_PV}]'
    name: 'disc.pv.count.{#OSO_PV}'
    discoveryrule_key: disc.pv
    value_type: int
    description: "Number of PV's of this size"
    applications:
    - Openshift Master
  - key: 'disc.pv.available[{#OSO_PV}]'
    name: 'disc.pv.available.{#OSO_PV}'
    discoveryrule_key: disc.pv
    value_type: int
    description: "Number of PV's of this size that are available"
    applications:
    - Openshift Master
  # Triggers. The entries in this first run declare no `dependencies`.
  # List order is significant: triggers named as dependencies must appear
  # before the triggers that depend on them.
  ztriggers:
  # Master process count: too few / too many.
  - name: 'Openshift Master process not running on {HOST.NAME}'
    priority: high
    expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
  - name: 'Too many Openshift Master processes running on {HOST.NAME}'
    priority: high
    expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
  # Etcd ping: both of the two most recent values must be 0.
  - name: 'Etcd ping failed on {HOST.NAME}'
    priority: high
    expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
  # Informational: zero users / zero projects.
  - name: 'Number of users for Openshift Master on {HOST.NAME}'
    priority: info
    expression: '{Template Openshift Master:openshift.master.user.count.last()}=0'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
  - name: 'There are no projects running on {HOST.NAME}'
    priority: info
    expression: '{Template Openshift Master:openshift.project.count.last()}=0'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
  # Put triggers that depend on other triggers here (deps must be created first)
  #
  # Application-creation triggers. All four depend on the
  # "Openshift Master process not running" trigger. The expressions treat an
  # item value of 1 as a failure (they compare last() against 1 and sum the
  # values over a time window).
  - name: 'Application creation has failed on {HOST.NAME}'
    expression: '{Template Openshift Master:openshift.master.app.create.last(#1)}=1 and {Template Openshift Master:openshift.master.app.create.last(#2)}=1'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
    dependencies:
    - 'Openshift Master process not running on {HOST.NAME}'
    priority: avg
  - name: 'Application creation with build has failed on {HOST.NAME}'
    expression: '{Template Openshift Master:openshift.master.app.build.create.last(#1)}=1 and {Template Openshift Master:openshift.master.app.build.create.last(#2)}=1'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
    dependencies:
    - 'Openshift Master process not running on {HOST.NAME}'
    priority: avg
  # sum(1h) > 3, i.e. 4 or more failures within one hour.
  - name: 'Application creation has failed multiple times in the last hour on {HOST.NAME}'
    expression: '{Template Openshift Master:openshift.master.app.create.sum(1h)}>3'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
    dependencies:
    - 'Openshift Master process not running on {HOST.NAME}'
    description: 'The application create loop has failed 4 or more times in the last hour'
    priority: avg
  # sum(2h) > 3, i.e. 4 or more failures within two hours. The description
  # states the 2-hour window used by the expression. The trigger name is
  # left unchanged because it identifies the trigger.
  - name: 'Application with build creation has failed multiple times in the last 2 hour on {HOST.NAME}'
    expression: '{Template Openshift Master:openshift.master.app.build.create.sum(2h)}>3'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
    dependencies:
    - 'Openshift Master process not running on {HOST.NAME}'
    description: 'The application create with build loop has failed 4 or more times in the last 2 hours'
    priority: avg
  # API health / ping triggers. The cluster-URL variants declare no
  # dependencies; the "Local" variants depend on the master-process trigger.
  # Keep the API health check ahead of the SkyDNS query trigger, which names
  # it as a dependency.
  - name: 'Openshift Master API health check is failing on {HOST.NAME}'
    priority: high
    expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
  - name: 'Openshift Master Local API health check is failing on {HOST.NAME}'
    priority: high
    expression: '{Template Openshift Master:openshift.master.local.api.healthz.max(#3)}<1'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
    dependencies:
    - 'Openshift Master process not running on {HOST.NAME}'
  - name: 'Openshift Master API PING check is failing on {HOST.NAME}'
    priority: high
    expression: '{Template Openshift Master:openshift.master.api.ping.max(#3)}<1'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
  - name: 'Openshift Master Local API PING check is failing on {HOST.NAME}'
    priority: high
    expression: '{Template Openshift Master:openshift.master.local.api.ping.max(#3)}<1'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
    dependencies:
    - 'Openshift Master process not running on {HOST.NAME}'
  - name: 'Openshift Master metric PING check is failing on {HOST.NAME}'
    priority: avg
    expression: '{Template Openshift Master:openshift.master.metric.ping.max(#3)}<1'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
    dependencies:
    - 'Openshift Master process not running on {HOST.NAME}'
  # SkyDNS triggers.
  - name: 'SkyDNS port not listening on {HOST.NAME}'
    priority: high
    expression: '{Template Openshift Master:openshift.master.skydns.port.open.max(#3)}<1'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
    dependencies:
    - 'Openshift Master process not running on {HOST.NAME}'
  - name: 'SkyDNS query failed on {HOST.NAME}'
    priority: high
    expression: '{Template Openshift Master:openshift.master.skydns.query.max(#3)}<1'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
    dependencies:
    - 'Openshift Master API health check is failing on {HOST.NAME}'
  # Node readiness.
  # NOTE(review): last(#2) reads only the second most recent value, not the
  # latest one - confirm this is intended rather than e.g. min(#2).
  - name: 'Hosts not ready according to {HOST.NAME}'
    priority: high
    expression: '{Template Openshift Master:openshift.master.nodesnotready.count.last(#2)}>0'
    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
    dependencies:
    - 'Openshift Master process not running on {HOST.NAME}'
  # Graphs. Each graph is 900x200 and plots three quantile items of one
  # metric family (key suffixes .5, .9, .99) in red, blue and orange.
  zgraphs:
  - name: Openshift Master API Server Latency Pods LIST Quantiles
    height: 200
    width: 900
    graph_items:
    - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.5
      color: red
    - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.9
      color: blue
    - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.99
      color: orange
  - name: Openshift Master API Server Latency Pods WATCHLIST Quantiles
    height: 200
    width: 900
    graph_items:
    - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5
      color: red
    - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9
      color: blue
    - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99
      color: orange
  - name: Openshift Master Scheduler End to End Latency Quantiles
    height: 200
    width: 900
    graph_items:
    - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.5
      color: red
    - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.9
      color: blue
    - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.99
      color: orange
  388. color: orange