---
# template_openshift_master.yml
#
# Monitoring template definition for OpenShift masters. It declares the
# template name, the items (metrics) attached to it, and the triggers
# (alert conditions) evaluated against those items.
#
# NOTE(review): the nesting was reconstructed from key order after the
# indentation had been lost; confirm against the code that consumes
# g_template_openshift_master before merging.
g_template_openshift_master:
  name: Template Openshift Master

  zitems:
    # --- Items grouped under the "Openshift Master" application ---------

    # Unlike the other items, this one carries a name and no
    # description/type.
    # NOTE(review): presumably the consumer supplies defaults - confirm.
    - name: create_app
      applications:
        - Openshift Master
      key: create_app

    - key: openshift.master.process.count
      description: Shows number of master processes running
      type: int
      applications:
        - Openshift Master

    - key: openshift.master.api.healthz
      description: "Checks the healthz check of the master's api: https://master_host/healthz"
      type: bool
      applications:
        - Openshift Master

    - key: openshift.master.user.count
      description: Shows number of users in a cluster
      type: int
      applications:
        - Openshift Master

    - key: openshift.master.pod.running.count
      description: Shows number of pods running
      type: int
      applications:
        - Openshift Master

    - key: openshift.master.pod.user.running.count
      description: Shows number of user pods running (non infrastructure pods)
      type: int
      applications:
        - Openshift Master

    - key: openshift.master.pod.total.count
      description: Shows total number of pods (running and non running)
      type: int
      applications:
        - Openshift Master

    - key: openshift.project.counter
      description: Shows number of projects on a cluster
      type: int
      applications:
        - Openshift Master

    # --- Items grouped under the "Openshift Etcd" application -----------
    # One success/fail counter pair per etcd action, then watchers and ping.

    - key: openshift.master.etcd.create.success
      description: Show number of successful create actions
      type: int
      applications:
        - Openshift Etcd

    - key: openshift.master.etcd.create.fail
      description: Show number of failed create actions
      type: int
      applications:
        - Openshift Etcd

    - key: openshift.master.etcd.delete.success
      description: Show number of successful delete actions
      type: int
      applications:
        - Openshift Etcd

    - key: openshift.master.etcd.delete.fail
      description: Show number of failed delete actions
      type: int
      applications:
        - Openshift Etcd

    - key: openshift.master.etcd.get.success
      description: Show number of successful get actions
      type: int
      applications:
        - Openshift Etcd

    - key: openshift.master.etcd.get.fail
      description: Show number of failed get actions
      type: int
      applications:
        - Openshift Etcd

    - key: openshift.master.etcd.set.success
      description: Show number of successful set actions
      type: int
      applications:
        - Openshift Etcd

    - key: openshift.master.etcd.set.fail
      description: Show number of failed set actions
      type: int
      applications:
        - Openshift Etcd

    - key: openshift.master.etcd.update.success
      description: Show number of successful update actions
      type: int
      applications:
        - Openshift Etcd

    - key: openshift.master.etcd.update.fail
      description: Show number of failed update actions
      type: int
      applications:
        - Openshift Etcd

    - key: openshift.master.etcd.watchers
      description: Show number of etcd watchers
      type: int
      applications:
        - Openshift Etcd

    - key: openshift.master.etcd.ping
      description: etcd ping
      type: int
      applications:
        - Openshift Etcd

  ztriggers:
    # Each trigger has a display name, an expression over one of the items
    # above, a url pointing at the matching SOP document, and a priority.
    # NOTE(review): the expressions look like Zabbix "{template:key.func()}"
    # syntax - confirm the function semantics against the Zabbix version
    # in use.

    # Compares the two most recent create_app values with 1.
    # NOTE(review): 1 presumably signals a failed app creation - confirm
    # with the script that reports create_app.
    - name: 'Application creation has failed on {HOST.NAME}'
      expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
      priority: avg

    # max over the last 3 values below 1, i.e. none of them reached 1.
    - name: 'Openshift Master API health check is failing on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: high

    # max over the last 3 process counts below 1.
    - name: 'Openshift Master process not running on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: high

    # min over the last 3 process counts above 1.
    - name: 'Too many Openshift Master processes running on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: high

    # Latest user count equals 0 (the name does not say so explicitly).
    - name: 'Number of users for Openshift Master on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.user.count.last()}=0'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: info

    # Latest project count equals 0.
    - name: 'There are no projects running on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.project.counter.last()}=0'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
      priority: info

    # The two most recent watcher counts are both below 10.
    - name: 'Low number of etcd watchers on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.etcd.watchers.last(#1)}<10 and {Template Openshift Master:openshift.master.etcd.watchers.last(#2)}<10'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
      priority: avg

    # The two most recent ping values are both 0.
    - name: 'Etcd ping failed on {HOST.NAME}'
      expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
      priority: high