# template_openshift_node.yml
#
# Defines the "Template Openshift Node" monitoring template as a single
# variable, g_template_openshift_node, with three sections:
#   zitems    - the metrics (keys) the template collects
#   ztriggers - alert expressions evaluated over those metrics
#   zactions  - automated responses to triggers
#
# NOTE(review): the nesting below was reconstructed from the key names because
# the copy this was restored from had lost its indentation. Confirm the
# structure (in particular where target_hosts and opconditions sit) against
# the schema of whatever consumes this variable.
---
g_template_openshift_node:
  name: Template Openshift Node

  # Every item is an integer metric filed under the "Openshift Node"
  # application.
  zitems:
    - key: openshift.node.process.count
      description: Shows number of OpenShift Node processes running
      type: int
      applications:
        - Openshift Node

    - key: openshift.node.ovs.pids.count
      description: Shows number of ovs process ids running
      type: int
      applications:
        - Openshift Node

    - key: openshift.node.ovs.ports.count
      description: Shows number of OVS ports defined
      type: int
      applications:
        - Openshift Node

    - key: openshift.node.ovs.stray.rules
      description: Number of OVS stray rules found/removed
      type: int
      applications:
        - Openshift Node

    - key: openshift.node.registry-pods.healthy_pct
      description: Shows the percentage of healthy registries in the cluster
      type: int
      applications:
        - Openshift Node

    - key: openshift.node.registry.service.ping
      description: Ping docker-registry service from node
      type: int
      applications:
        - Openshift Node

  # Each expression references an item key declared in zitems above.
  # NOTE(review): the registry triggers link to .../blob/master/... while the
  # node/OVS triggers link to .../blob/node/...; "node" may be an accidental
  # search-and-replace of "master" - confirm the URLs resolve.
  ztriggers:
    # Both of the two most recent healthy_pct samples are below 100.
    - name: 'One or more Docker Registries is unhealthy according to {HOST.NAME}'
      expression: '{Template Openshift Node:openshift.node.registry-pods.healthy_pct.last(#2)}<100 and {Template Openshift Node:openshift.node.registry-pods.healthy_pct.last(#1)}<100'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc'
      priority: avg

    # Both of the two most recent registry ping samples are below 1.
    - name: 'Docker Registry service is unhealthy according to {HOST.NAME}'
      expression: '{Template Openshift Node:openshift.node.registry.service.ping.last(#2)}<1 and {Template Openshift Node:openshift.node.registry.service.ping.last(#1)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc'
      priority: avg

    # The maximum over the last three process-count samples is below 1.
    - name: 'Openshift Node process not running on {HOST.NAME}'
      expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1'
      url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
      priority: high

    # The minimum over the last three process-count samples is above 1.
    - name: 'Too many Openshift Node processes running on {HOST.NAME}'
      expression: '{Template Openshift Node:openshift.node.process.count.min(#3)}>1'
      url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
      priority: high

    # Neither of the two most recent OVS pid counts equals 4. The "[HEAL]"
    # name prefix is what the action in zactions matches on.
    - name: '[HEAL] OVS may not be running on {HOST.NAME}'
      expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last(#1)}<>4 and {Template Openshift Node:openshift.node.ovs.pids.count.last(#2)}<>4'
      url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
      priority: high

    # The most recent OVS port count is exactly 0.
    - name: 'Number of OVS ports is 0 on {HOST.NAME}'
      expression: '{Template Openshift Node:openshift.node.ovs.ports.count.last()}=0'
      url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc'
      priority: high

  zactions:
    # Declared with status "disabled". Runs the remote-healer command when
    # the "[HEAL] OVS may not be running on" trigger is in PROBLEM state.
    - name: '[HEAL] OVS may not be running on {HOST.NAME}'
      status: disabled
      escalation_time: 60
      conditions_filter:
        calculation_type: "and/or"
        conditions:
          # This condition carries no value, only a type and an operator.
          - conditiontype: maintenance status
            operator: not in
          - conditiontype: trigger name
            operator: like
            value: "[HEAL] OVS may not be running on"
          - conditiontype: trigger value
            operator: "="
            value: PROBLEM
      operations:
        - esc_step_from: 1
          esc_step_to: 1
          esc_period: 0
          operationtype: remote command
          opcommand:
            # Single-quoted scalar: the \" sequences stay literal in the
            # value (YAML does not process backslashes inside single quotes).
            # The {{ ... }} placeholders are filled in by the templating
            # layer; the {HOST.NAME}/{TRIGGER.*} macros are left for the
            # monitoring server.
            command: 'ssh -i /etc/openshift_tools/scriptrunner_id_rsa {{ ozb_scriptrunner_user }}@{{ ozb_scriptrunner_bastion_host }} remote-healer --host \"{HOST.NAME}\" --trigger \"{TRIGGER.NAME}\" --trigger-val \"{TRIGGER.VALUE}\"'
            execute_on: "zabbix server"
            type: 'custom script'
          target_hosts:
            - target_type: 'zabbix server'
          # Only run while the event has not been acknowledged.
          opconditions:
            - conditiontype: 'event acknowledged'
              operator: '='
              value: 'not acknowledged'