123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389 |
- ---
- g_template_openshift_master:
- name: Template Openshift Master
- zitems:
- - name: create_app
- applications:
- - Openshift Master
- key: create_app
- - key: openshift.master.registry.healthz
- description: "Shows the health status of the cluster's docker registry"
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.process.count
- description: Shows number of master processes running
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.api.ping
- description: "Verify that the Openshift API is up (uses the cluster API URL)"
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.local.api.ping
- description: "Verify that the Openshift API is up on the host (uses the API URL as the https://127.0.0.1)"
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.api.healthz
- description: "Checks the healthz check of the master's api: https://<cluster_api_url>/healthz"
- type: int
- data_type: bool
- applications:
- - Openshift Master
- - key: openshift.master.local.api.healthz
- description: "Checks the healthz check of the master's api: https://127.0.0.1/healthz"
- type: int
- data_type: bool
- applications:
- - Openshift Master
- - key: openshift.master.user.count
- description: Shows number of users in a cluster
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.pod.running.count
- description: Shows number of pods running
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.pod.user.running.count
- description: Shows number of user pods running (non infrastructure pods)
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.pod.total.count
- description: Shows total number of pods (running and non running)
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.node.count
- description: Shows the total number of nodes found in the Openshift Cluster
- type: int
- applications:
- - Openshift Master
- - key: openshift.project.count
- description: Shows number of projects on a cluster
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.pv.total.count
- description: Total number of Persistent Volumes in the Openshift Cluster
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.pv.available.count
- description: Total number of Available Persistent Volumes in the Openshift Cluster
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.pv.released.count
- description: Total number of Released Persistent Volumes in the Openshift Cluster
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.pv.bound.count
- description: Total number of Bound Persistent Volumes in the Openshift Cluster
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.pv.failed.count
- description: Total number of Failed Persistent Volumes in the Openshift Cluster
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.skydns.port.open
- description: State of the SkyDNS port open and listening
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.skydns.query
- description: SkyDNS can be queried or not
- type: int
- applications:
- - Openshift Master
- - key: openshift.master.etcd.create.success
- description: Show number of successful create actions
- type: int
- applications:
- - Openshift Etcd
- - key: openshift.master.etcd.create.fail
- description: Show number of failed create actions
- type: int
- applications:
- - Openshift Etcd
- - key: openshift.master.etcd.delete.success
- description: Show number of successful delete actions
- type: int
- applications:
- - Openshift Etcd
- - key: openshift.master.etcd.delete.fail
- description: Show number of failed delete actions
- type: int
- applications:
- - Openshift Etcd
- - key: openshift.master.etcd.get.success
- description: Show number of successful get actions
- type: int
- applications:
- - Openshift Etcd
- - key: openshift.master.etcd.get.fail
- description: Show number of failed get actions
- type: int
- applications:
- - Openshift Etcd
- - key: openshift.master.etcd.set.success
- description: Show number of successful set actions
- type: int
- applications:
- - Openshift Etcd
- - key: openshift.master.etcd.set.fail
- description: Show number of failed set actions
- type: int
- applications:
- - Openshift Etcd
- - key: openshift.master.etcd.update.success
- description: Show number of successful update actions
- type: int
- applications:
- - Openshift Etcd
- - key: openshift.master.etcd.update.fail
- description: Show number of failed update actions
- type: int
- applications:
- - Openshift Etcd
- - key: openshift.master.etcd.watchers
- description: Show number of etcd watchers
- type: int
- applications:
- - Openshift Etcd
- - key: openshift.master.etcd.ping
- description: etcd ping
- type: int
- applications:
- - Openshift Etcd
- - key: openshift.master.metric.ping
- description: "This check verifies that the https://master/metrics check is alive and communicating properly."
- type: int
- applications:
- - Openshift Master Metrics
- - key: openshift.master.apiserver.latency.summary.pods.quantile.list.5
- description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed."
- type: int
- applications:
- - Openshift Master Metrics
- - key: openshift.master.apiserver.latency.summary.pods.quantile.list.9
- description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed."
- type: int
- applications:
- - Openshift Master Metrics
- - key: openshift.master.apiserver.latency.summary.pods.quantile.list.99
- description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed."
- type: int
- applications:
- - Openshift Master Metrics
- - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5
- description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed."
- type: int
- applications:
- - Openshift Master Metrics
- - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9
- description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed."
- type: int
- applications:
- - Openshift Master Metrics
- - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99
- description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed."
- type: int
- applications:
- - Openshift Master Metrics
- - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.5
- description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the end to end scheduling operations have taken to completed."
- type: int
- applications:
- - Openshift Master Metrics
- - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.9
- description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the end to end scheduling operations have taken to completed."
- type: int
- applications:
- - Openshift Master Metrics
- - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.99
- description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the end to end scheduling operations have taken to completed."
- type: int
- applications:
- - Openshift Master Metrics
- ztriggers:
- - name: 'Openshift Master process not running on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- priority: high
- - name: 'Too many Openshift Master processes running on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- priority: high
- - name: 'Low number of etcd watchers on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.etcd.watchers.last(#1)}<10 and {Template Openshift Master:openshift.master.etcd.watchers.last(#2)}<10'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
- priority: avg
- - name: 'Etcd ping failed on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc'
- priority: high
- - name: 'Number of users for Openshift Master on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.user.count.last()}=0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- priority: info
- - name: 'There are no projects running on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.project.count.last()}=0'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- priority: info
- # Put triggers that depend on other triggers here (deps must be created first)
- - name: 'Application creation has failed on {HOST.NAME}'
- expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: avg
- - name: 'Application creation has failed multiple times in the last hour on {HOST.NAME}'
- expression: '{Template Openshift Master:create_app.sum(1h)}>3'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- description: The application create loop has failed 4 or more times in the last hour
- priority: avg
- - name: 'Openshift Master API health check is failing on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- priority: high
- - name: 'Openshift Master Local API health check is failing on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.local.api.healthz.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: high
- - name: 'Openshift Master API PING check is failing on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.api.ping.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- priority: high
- - name: 'Openshift Master Local API PING check is failing on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.local.api.ping.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: high
- - name: 'Openshift Master metric PING check is failing on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.metric.ping.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: avg
- - name: 'Docker Registry check failed on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.registry.healthz.max(#2)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: high
- - name: 'SkyDNS port not listening on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.skydns.port.open.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- dependencies:
- - 'Openshift Master process not running on {HOST.NAME}'
- priority: high
- - name: 'SkyDNS query failed on {HOST.NAME}'
- expression: '{Template Openshift Master:openshift.master.skydns.query.max(#3)}<1'
- url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
- dependencies:
- - 'Openshift Master API health check is failing on {HOST.NAME}'
- priority: high
- zgraphs:
- - name: Openshift Master API Server Latency Pods LIST Quantiles
- width: 900
- height: 200
- graph_items:
- - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.5
- color: red
- - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.9
- color: blue
- - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.99
- color: orange
- - name: Openshift Master API Server Latency Pods WATCHLIST Quantiles
- width: 900
- height: 200
- graph_items:
- - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5
- color: red
- - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9
- color: blue
- - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99
- color: orange
- - name: Openshift Master Scheduler End to End Latency Quantiles
- width: 900
- height: 200
- graph_items:
- - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.5
- color: red
- - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.9
- color: blue
- - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.99
- color: orange
|