|
@@ -3,6 +3,7 @@
|
|
import json
|
|
import json
|
|
import re
|
|
import re
|
|
|
|
|
|
|
|
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
|
|
from openshift_checks.logging.logging import LoggingCheck
|
|
from openshift_checks.logging.logging import LoggingCheck
|
|
|
|
|
|
|
|
|
|
@@ -15,168 +16,178 @@ class Elasticsearch(LoggingCheck):
|
|
def run(self):
|
|
def run(self):
|
|
"""Check various things and gather errors. Returns: result as hash"""
|
|
"""Check various things and gather errors. Returns: result as hash"""
|
|
|
|
|
|
- self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
|
|
|
|
- es_pods, error = self.get_pods_for_component(
|
|
|
|
- self.logging_namespace,
|
|
|
|
- "es",
|
|
|
|
- )
|
|
|
|
- if error:
|
|
|
|
- return {"failed": True, "msg": error}
|
|
|
|
- check_error = self.check_elasticsearch(es_pods)
|
|
|
|
-
|
|
|
|
- if check_error:
|
|
|
|
- msg = ("The following Elasticsearch deployment issue was found:"
|
|
|
|
- "{}".format(check_error))
|
|
|
|
- return {"failed": True, "msg": msg}
|
|
|
|
-
|
|
|
|
|
|
+ es_pods = self.get_pods_for_component("es")
|
|
|
|
+ self.check_elasticsearch(es_pods)
|
|
# TODO(lmeyer): run it all again for the ops cluster
|
|
# TODO(lmeyer): run it all again for the ops cluster
|
|
- return {"failed": False, "msg": 'No problems found with Elasticsearch deployment.'}
|
|
|
|
|
|
|
|
- def _not_running_elasticsearch_pods(self, es_pods):
|
|
|
|
- """Returns: list of pods that are not running, list of errors about non-running pods"""
|
|
|
|
- not_running = self.not_running_pods(es_pods)
|
|
|
|
- if not_running:
|
|
|
|
- return not_running, [(
|
|
|
|
- 'The following Elasticsearch pods are not running:\n'
|
|
|
|
- '{pods}'
|
|
|
|
- 'These pods will not aggregate logs from their nodes.'
|
|
|
|
- ).format(pods=''.join(
|
|
|
|
- " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
|
|
|
|
- for pod in not_running
|
|
|
|
- ))]
|
|
|
|
- return not_running, []
|
|
|
|
|
|
+ return {}
|
|
|
|
|
|
def check_elasticsearch(self, es_pods):
|
|
def check_elasticsearch(self, es_pods):
|
|
- """Various checks for elasticsearch. Returns: error string"""
|
|
|
|
- not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods)
|
|
|
|
- running_pods = [pod for pod in es_pods if pod not in not_running_pods]
|
|
|
|
|
|
+ """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors."""
|
|
|
|
+ running_pods, errors = self.running_elasticsearch_pods(es_pods)
|
|
pods_by_name = {
|
|
pods_by_name = {
|
|
pod['metadata']['name']: pod for pod in running_pods
|
|
pod['metadata']['name']: pod for pod in running_pods
|
|
# Filter out pods that are not members of a DC
|
|
# Filter out pods that are not members of a DC
|
|
if pod['metadata'].get('labels', {}).get('deploymentconfig')
|
|
if pod['metadata'].get('labels', {}).get('deploymentconfig')
|
|
}
|
|
}
|
|
if not pods_by_name:
|
|
if not pods_by_name:
|
|
- return 'No logging Elasticsearch pods were found. Is logging deployed?'
|
|
|
|
- error_msgs += self._check_elasticsearch_masters(pods_by_name)
|
|
|
|
- error_msgs += self._check_elasticsearch_node_list(pods_by_name)
|
|
|
|
- error_msgs += self._check_es_cluster_health(pods_by_name)
|
|
|
|
- error_msgs += self._check_elasticsearch_diskspace(pods_by_name)
|
|
|
|
- return '\n'.join(error_msgs)
|
|
|
|
|
|
+ # nothing running, cannot run the rest of the check
|
|
|
|
+ errors.append(OpenShiftCheckException(
|
|
|
|
+ 'NoRunningPods',
|
|
|
|
+ 'No logging Elasticsearch pods were found running, so no logs are being aggregated.'
|
|
|
|
+ ))
|
|
|
|
+ raise OpenShiftCheckExceptionList(errors)
|
|
|
|
+
|
|
|
|
+ errors += self.check_elasticsearch_masters(pods_by_name)
|
|
|
|
+ errors += self.check_elasticsearch_node_list(pods_by_name)
|
|
|
|
+ errors += self.check_es_cluster_health(pods_by_name)
|
|
|
|
+ errors += self.check_elasticsearch_diskspace(pods_by_name)
|
|
|
|
+ if errors:
|
|
|
|
+ raise OpenShiftCheckExceptionList(errors)
|
|
|
|
+
|
|
|
|
+ def running_elasticsearch_pods(self, es_pods):
|
|
|
|
+ """Returns: list of running pods, list of errors about non-running pods"""
|
|
|
|
+ not_running = self.not_running_pods(es_pods)
|
|
|
|
+ running_pods = [pod for pod in es_pods if pod not in not_running]
|
|
|
|
+ if not_running:
|
|
|
|
+ return running_pods, [OpenShiftCheckException(
|
|
|
|
+ 'PodNotRunning',
|
|
|
|
+ 'The following Elasticsearch pods are defined but not running:\n'
|
|
|
|
+ '{pods}'.format(pods=''.join(
|
|
|
|
+ " {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
|
|
|
|
+ for pod in not_running
|
|
|
|
+ ))
|
|
|
|
+ )]
|
|
|
|
+ return running_pods, []
|
|
|
|
|
|
@staticmethod
|
|
@staticmethod
|
|
def _build_es_curl_cmd(pod_name, url):
|
|
def _build_es_curl_cmd(pod_name, url):
|
|
base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
|
|
base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
|
|
return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
|
|
return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
|
|
|
|
|
|
- def _check_elasticsearch_masters(self, pods_by_name):
|
|
|
|
- """Check that Elasticsearch masters are sane. Returns: list of error strings"""
|
|
|
|
|
|
+ def check_elasticsearch_masters(self, pods_by_name):
|
|
|
|
+ """Check that Elasticsearch masters are sane. Returns: list of errors"""
|
|
es_master_names = set()
|
|
es_master_names = set()
|
|
- error_msgs = []
|
|
|
|
|
|
+ errors = []
|
|
for pod_name in pods_by_name.keys():
|
|
for pod_name in pods_by_name.keys():
|
|
# Compare what each ES node reports as master and compare for split brain
|
|
# Compare what each ES node reports as master and compare for split brain
|
|
get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
|
|
get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
|
|
- master_name_str = self.exec_oc(self.logging_namespace, get_master_cmd, [])
|
|
|
|
|
|
+ master_name_str = self.exec_oc(get_master_cmd, [])
|
|
master_names = (master_name_str or '').split(' ')
|
|
master_names = (master_name_str or '').split(' ')
|
|
if len(master_names) > 1:
|
|
if len(master_names) > 1:
|
|
es_master_names.add(master_names[1])
|
|
es_master_names.add(master_names[1])
|
|
else:
|
|
else:
|
|
- error_msgs.append(
|
|
|
|
- 'No master? Elasticsearch {pod} returned bad string when asked master name:\n'
|
|
|
|
|
|
+ errors.append(OpenShiftCheckException(
|
|
|
|
+ 'NoMasterName',
|
|
|
|
+ 'Elasticsearch {pod} gave unexpected response when asked master name:\n'
|
|
' {response}'.format(pod=pod_name, response=master_name_str)
|
|
' {response}'.format(pod=pod_name, response=master_name_str)
|
|
- )
|
|
|
|
|
|
+ ))
|
|
|
|
|
|
if not es_master_names:
|
|
if not es_master_names:
|
|
- error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?')
|
|
|
|
- return '\n'.join(error_msgs)
|
|
|
|
|
|
+ errors.append(OpenShiftCheckException(
|
|
|
|
+ 'NoMasterFound',
|
|
|
|
+ 'No logging Elasticsearch masters were found.'
|
|
|
|
+ ))
|
|
|
|
+ return errors
|
|
|
|
|
|
if len(es_master_names) > 1:
|
|
if len(es_master_names) > 1:
|
|
- error_msgs.append(
|
|
|
|
|
|
+ errors.append(OpenShiftCheckException(
|
|
|
|
+ 'SplitBrainMasters',
|
|
'Found multiple Elasticsearch masters according to the pods:\n'
|
|
'Found multiple Elasticsearch masters according to the pods:\n'
|
|
'{master_list}\n'
|
|
'{master_list}\n'
|
|
'This implies that the masters have "split brain" and are not correctly\n'
|
|
'This implies that the masters have "split brain" and are not correctly\n'
|
|
'replicating data for the logging cluster. Log loss is likely to occur.'
|
|
'replicating data for the logging cluster. Log loss is likely to occur.'
|
|
.format(master_list='\n'.join(' ' + master for master in es_master_names))
|
|
.format(master_list='\n'.join(' ' + master for master in es_master_names))
|
|
- )
|
|
|
|
|
|
+ ))
|
|
|
|
|
|
- return error_msgs
|
|
|
|
|
|
+ return errors
|
|
|
|
|
|
- def _check_elasticsearch_node_list(self, pods_by_name):
|
|
|
|
- """Check that reported ES masters are accounted for by pods. Returns: list of error strings"""
|
|
|
|
|
|
+ def check_elasticsearch_node_list(self, pods_by_name):
|
|
|
|
+ """Check that reported ES masters are accounted for by pods. Returns: list of errors"""
|
|
|
|
|
|
if not pods_by_name:
|
|
if not pods_by_name:
|
|
- return ['No logging Elasticsearch masters were found. Is logging deployed?']
|
|
|
|
|
|
+ return [OpenShiftCheckException(
|
|
|
|
+ 'MissingComponentPods',
|
|
|
|
+ 'No logging Elasticsearch pods were found.'
|
|
|
|
+ )]
|
|
|
|
|
|
# get ES cluster nodes
|
|
# get ES cluster nodes
|
|
node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
|
|
node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
|
|
- cluster_node_data = self.exec_oc(self.logging_namespace, node_cmd, [])
|
|
|
|
|
|
+ cluster_node_data = self.exec_oc(node_cmd, [])
|
|
try:
|
|
try:
|
|
cluster_nodes = json.loads(cluster_node_data)['nodes']
|
|
cluster_nodes = json.loads(cluster_node_data)['nodes']
|
|
except (ValueError, KeyError):
|
|
except (ValueError, KeyError):
|
|
- return [
|
|
|
|
|
|
+ return [OpenShiftCheckException(
|
|
|
|
+ 'MissingNodeList',
|
|
'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
|
|
'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
|
|
cluster_node_data
|
|
cluster_node_data
|
|
- ]
|
|
|
|
|
|
+ )]
|
|
|
|
|
|
# Try to match all ES-reported node hosts to known pods.
|
|
# Try to match all ES-reported node hosts to known pods.
|
|
- error_msgs = []
|
|
|
|
|
|
+ errors = []
|
|
for node in cluster_nodes.values():
|
|
for node in cluster_nodes.values():
|
|
# Note that with 1.4/3.4 the pod IP may be used as the master name
|
|
# Note that with 1.4/3.4 the pod IP may be used as the master name
|
|
if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
|
|
if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
|
|
for pod_name, pod in pods_by_name.items()):
|
|
for pod_name, pod in pods_by_name.items()):
|
|
- error_msgs.append(
|
|
|
|
|
|
+ errors.append(OpenShiftCheckException(
|
|
|
|
+ 'EsPodNodeMismatch',
|
|
'The Elasticsearch cluster reports a member node "{node}"\n'
|
|
'The Elasticsearch cluster reports a member node "{node}"\n'
|
|
'that does not correspond to any known ES pod.'.format(node=node['host'])
|
|
'that does not correspond to any known ES pod.'.format(node=node['host'])
|
|
- )
|
|
|
|
|
|
+ ))
|
|
|
|
|
|
- return error_msgs
|
|
|
|
|
|
+ return errors
|
|
|
|
|
|
- def _check_es_cluster_health(self, pods_by_name):
|
|
|
|
|
|
+ def check_es_cluster_health(self, pods_by_name):
|
|
"""Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
|
|
"""Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
|
|
- error_msgs = []
|
|
|
|
|
|
+ errors = []
|
|
for pod_name in pods_by_name.keys():
|
|
for pod_name in pods_by_name.keys():
|
|
cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
|
|
cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
|
|
- cluster_health_data = self.exec_oc(self.logging_namespace, cluster_health_cmd, [])
|
|
|
|
|
|
+ cluster_health_data = self.exec_oc(cluster_health_cmd, [])
|
|
try:
|
|
try:
|
|
health_res = json.loads(cluster_health_data)
|
|
health_res = json.loads(cluster_health_data)
|
|
if not health_res or not health_res.get('status'):
|
|
if not health_res or not health_res.get('status'):
|
|
raise ValueError()
|
|
raise ValueError()
|
|
except ValueError:
|
|
except ValueError:
|
|
- error_msgs.append(
|
|
|
|
|
|
+ errors.append(OpenShiftCheckException(
|
|
|
|
+ 'BadEsResponse',
|
|
'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
|
|
'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
|
|
'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
|
|
'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
|
|
- )
|
|
|
|
|
|
+ ))
|
|
continue
|
|
continue
|
|
|
|
|
|
if health_res['status'] not in ['green', 'yellow']:
|
|
if health_res['status'] not in ['green', 'yellow']:
|
|
- error_msgs.append(
|
|
|
|
|
|
+ errors.append(OpenShiftCheckException(
|
|
|
|
+ 'EsClusterHealthRed',
|
|
'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
|
|
'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
|
|
- )
|
|
|
|
|
|
+ ))
|
|
|
|
|
|
- return error_msgs
|
|
|
|
|
|
+ return errors
|
|
|
|
|
|
- def _check_elasticsearch_diskspace(self, pods_by_name):
|
|
|
|
|
|
+ def check_elasticsearch_diskspace(self, pods_by_name):
|
|
"""
|
|
"""
|
|
Exec into an ES pod and query the diskspace on the persistent volume.
|
|
Exec into an ES pod and query the diskspace on the persistent volume.
|
|
Returns: list of errors
|
|
Returns: list of errors
|
|
"""
|
|
"""
|
|
- error_msgs = []
|
|
|
|
|
|
+ errors = []
|
|
for pod_name in pods_by_name.keys():
|
|
for pod_name in pods_by_name.keys():
|
|
df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
|
|
df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
|
|
- disk_output = self.exec_oc(self.logging_namespace, df_cmd, [])
|
|
|
|
|
|
+ disk_output = self.exec_oc(df_cmd, [])
|
|
lines = disk_output.splitlines()
|
|
lines = disk_output.splitlines()
|
|
# expecting one header looking like 'IUse% Use%' and one body line
|
|
# expecting one header looking like 'IUse% Use%' and one body line
|
|
body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
|
|
body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
|
|
if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
|
|
if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
|
|
- error_msgs.append(
|
|
|
|
|
|
+ errors.append(OpenShiftCheckException(
|
|
|
|
+ 'BadDfResponse',
|
|
'Could not retrieve storage usage from logging ES pod "{pod}".\n'
|
|
'Could not retrieve storage usage from logging ES pod "{pod}".\n'
|
|
'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
|
|
'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
|
|
- )
|
|
|
|
|
|
+ ))
|
|
continue
|
|
continue
|
|
inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
|
|
inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
|
|
|
|
|
|
inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90')
|
|
inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90')
|
|
if int(inode_pct) >= int(inode_pct_thresh):
|
|
if int(inode_pct) >= int(inode_pct_thresh):
|
|
- error_msgs.append(
|
|
|
|
|
|
+ errors.append(OpenShiftCheckException(
|
|
|
|
+ 'InodeUsageTooHigh',
|
|
'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
|
|
'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
|
|
' is {pct}, greater than threshold {limit}.\n'
|
|
' is {pct}, greater than threshold {limit}.\n'
|
|
' Note: threshold can be specified in inventory with {param}'.format(
|
|
' Note: threshold can be specified in inventory with {param}'.format(
|
|
@@ -184,10 +195,11 @@ class Elasticsearch(LoggingCheck):
|
|
pct=str(inode_pct),
|
|
pct=str(inode_pct),
|
|
limit=str(inode_pct_thresh),
|
|
limit=str(inode_pct_thresh),
|
|
param='openshift_check_efk_es_inode_pct',
|
|
param='openshift_check_efk_es_inode_pct',
|
|
- ))
|
|
|
|
|
|
+ )))
|
|
disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80')
|
|
disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80')
|
|
if int(disk_pct) >= int(disk_pct_thresh):
|
|
if int(disk_pct) >= int(disk_pct_thresh):
|
|
- error_msgs.append(
|
|
|
|
|
|
+ errors.append(OpenShiftCheckException(
|
|
|
|
+ 'DiskUsageTooHigh',
|
|
'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
|
|
'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
|
|
' is {pct}, greater than threshold {limit}.\n'
|
|
' is {pct}, greater than threshold {limit}.\n'
|
|
' Note: threshold can be specified in inventory with {param}'.format(
|
|
' Note: threshold can be specified in inventory with {param}'.format(
|
|
@@ -195,6 +207,6 @@ class Elasticsearch(LoggingCheck):
|
|
pct=str(disk_pct),
|
|
pct=str(disk_pct),
|
|
limit=str(disk_pct_thresh),
|
|
limit=str(disk_pct_thresh),
|
|
param='openshift_check_efk_es_storage_pct',
|
|
param='openshift_check_efk_es_storage_pct',
|
|
- ))
|
|
|
|
|
|
+ )))
|
|
|
|
|
|
- return error_msgs
|
|
|
|
|
|
+ return errors
|