  1. """
  2. Check that the SDN is routing traffic properly.
  3. """
  4. import datetime
  5. import os
  6. import textwrap
  7. import time
  8. import ipaddress
  9. from ansible.module_utils import six
  10. from openshift_checks import OpenShiftCheck, OpenShiftCheckException


class SDNCheck(OpenShiftCheck):
    """A check to run relevant diagnostics on the SDN."""

    name = 'sdn'
    tags = ['health']

    def is_active(self):
        """Skip hosts that are not masters or nodes."""
        group_names = self.get_var('group_names', default=[])
        master_or_node = 'oo_masters_to_config' in group_names or \
                         'oo_nodes_to_config' in group_names
        return super(SDNCheck, self).is_active() and master_or_node

    def run(self):
        if self.want_full_results:
            # Gather diagnostic information and perform diagnostics on a
            # master or node host.
            try:
                self.save_journal()
                self.save_command_output('nmcli-dev',
                                         ['/bin/nmcli', '--nocheck', '-f',
                                          'all', 'dev', 'show'])
                self.save_command_output('nmcli-con',
                                         ['/bin/nmcli', '--nocheck', '-f',
                                          'all', 'con', 'show'])
                self.save_command_output(
                    'ifcfg',
                    'head -1000 /etc/sysconfig/network-scripts/ifcfg-*')
                self.save_command_output('addresses',
                                         ['/sbin/ip', 'addr', 'show'])
                self.save_command_output('routes',
                                         ['/sbin/ip', 'route', 'show'])
                self.save_command_output('arp',
                                         ['/sbin/ip', '-s', 'neighbor',
                                          'show'])
                self.save_command_output('iptables', ['/sbin/iptables-save'])
                self.register_file('hosts', None, '/etc/hosts')
                self.register_file('resolv.conf', None, '/etc/resolv.conf')
                self.save_command_output('modules', ['/sbin/lsmod'])
                self.save_command_output('sysctl', ['/sbin/sysctl', '-a'])
                if self.get_var('openshift_use_crio', default=False):
                    self.save_command_output('crio-version',
                                             ['/bin/crictl', 'version'])
                if not self.get_var('openshift_use_crio_only', default=False):
                    self.save_command_output('docker-version',
                                             ['/bin/docker', 'version'])
                self.save_command_output('oc-version', ['/bin/oc', 'version'])
                self.register_file('os-version', None,
                                   '/etc/system-release-cpe')
            except OpenShiftCheckException as exc:
                self.register_failure(exc)

        group_names = self.get_var('group_names', default=[])
        if 'oo_masters_to_config' in group_names:
            self.check_master()
        if 'oo_nodes_to_config' in group_names:
            self.check_node()

        return {}

    def save_journal(self):
        """Save the last 5 minutes of the journal."""
        out = self.read_command_output(['/bin/journalctl', '-n', '1', '-q'])
        (since, until) = SDNCheck.compute_log_interval_from(out)
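        # read_command_output joins list arguments with spaces and the
        # command module shlex-splits them again, so the embedded quotes
        # below keep each timestamp (which contains a space) intact as a
        # single --since/--until argument.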
        self.register_file('journal',
                           self.read_command_output(['/bin/journalctl',
                                                     '"--since=%s"' % since,
                                                     '"--until=%s"' % until]))

    @staticmethod
    def compute_log_interval_from(log_line):
        """Compute and return a 2-tuple of timestamps (ts1, ts2) where ts1
        represents the date 5 minutes prior to the timestamp of the provided
        log message and ts2 represents the date of that timestamp.  The log
        line is assumed to be from today."""
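        # A journal line in the default short format looks like
        # "Jan 01 12:00:00 host unit[pid]: message", so the third
        # whitespace-separated field is the time of day.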
        try:
            log_ts = log_line.strip().split()[2]
            ts2_time = datetime.datetime.strptime(log_ts, '%H:%M:%S').time()
            now = datetime.datetime.now()
            ts2_date = datetime.datetime.combine(now, ts2_time)
            ts1_date = ts2_date - datetime.timedelta(minutes=5)

            time_fmt = '%Y-%m-%d %H:%M:%S'
            # pylint may infer that ts1_date is NotImplemented or a timedelta
            # object and complain about using the timetuple method from the
            # datetime class because the subtraction operation above would
            # return a timedelta if the RHS were a date or NotImplemented if
            # the RHS were neither a datetime nor a timedelta.  However,
            # datetime.datetime.combine cannot return a non-datetime value,
            # and datetime.timedelta(minutes=5) is an explicit timedelta
            # value, so the subtraction really can only return a datetime
            # value.
            # pylint: disable=no-member
            ts1 = time.strftime(time_fmt, ts1_date.timetuple())
            ts2 = time.strftime(time_fmt, ts2_date.timetuple())
        except (ValueError, IndexError):
            ts1 = '-5m'
            ts2 = 'now'

        return (ts1, ts2)

    def save_command_output(self, path, command):
        """Execute the provided command using the command module
        and save its output to the specified file.

        If the command is a string, use a shell.  Otherwise, assume the
        command is a list, join it with spaces, and execute it without a
        shell.
        """
        self.register_file(path, self.read_command_output(command))

    def read_command_output(self, command):
        """Execute the provided command using the command module
        and return its output.

        If the command is a string, use a shell.  Otherwise, assume the
        command is a list, join it with spaces, and execute it without a
        shell.
        """
        uses_shell = False
        if isinstance(command, six.string_types):
            uses_shell = True
        else:
            command = ' '.join(command)

        command_args = dict(_raw_params=command, _uses_shell=uses_shell)
        # Use self._execute_module instead of self.execute_module because
        # the latter sets self.changed.
        result = self._execute_module('command', command_args)
        if result.get('rc', 0) != 0 or result.get('failed'):
            raise OpenShiftCheckException(
                'RemoteCommandFailure',
                'Failed to execute command on remote host: %s' % command)

        return result['stdout'].encode('utf-8')

    def check_master(self):
        """Gather diagnostic information on a master and ensure it can
        connect to kubelets."""
        if self.want_full_results:
            conf_base_path = self.get_var('openshift.common.config_base')
            master_conf_path = os.path.join(conf_base_path, 'master',
                                            'master-config.yaml')
            self.register_file('master-config.yaml', None, master_conf_path)

            self.save_component_container_logs('controllers', 'controllers')
            self.save_component_container_logs('api', 'api')

        nodes = self.get_resource('nodes')

        if self.want_full_results:
            self.register_file('nodes.json', nodes)
            self.register_file('pods.json', self.get_resource('pods'))
            self.register_file('services.json', self.get_resource('services'))
            self.register_file('endpoints.json',
                               self.get_resource('endpoints'))
            self.register_file('routes.json', self.get_resource('routes'))
            self.register_file('clusternetworks.json',
                               self.get_resource('clusternetworks'))
            self.register_file('hostsubnets.json',
                               self.get_resource('hostsubnets'))
            self.register_file('netnamespaces.json',
                               self.get_resource('netnamespaces'))

        if not nodes:
            self.register_failure(
                'No nodes appear to be defined according to the API.'
            )

        for node in nodes:
            self.check_node_kubelet(node)

    def save_component_container_logs(self, component, container):
        """Save the first and last 2000 lines of logs for the specified
        component and container."""
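        # The awk script below prints the first n lines as they arrive and
        # keeps each later line in a ring buffer of n entries.  At EOF it
        # prints "..." if any lines were dropped and then dumps the buffer
        # oldest-first, yielding the first and last n lines of the input.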
        awk_script = textwrap.dedent('''\
            BEGIN {
              n = 2000
            }
            NR <= n {
              print
            }
            NR > n {
              buf[(NR - 1)%n + 1] = $0
            }
            END {
              if (NR <= n)
                exit
              if (NR > 2*n)
                print "..."
              for (i = NR >= 2*n ? 0 : n - NR%n; i < n; ++i)
                print buf[(NR + i)%n + 1]
            }''')
        out = self.read_command_output(' '.join(['/usr/local/bin/master-logs',
                                                 component, container, '2>&1',
                                                 '|', '/bin/awk',
                                                 "'%s'" % awk_script]))
        self.register_file('master-logs_%s_%s' % (component, container), out)

    def get_resource(self, kind):
        """Return a list of all resources of the specified kind."""
        for resource in self.task_vars['resources']['results']:
            if resource['item'] == kind:
                return resource['results']['results'][0]['items']

        raise OpenShiftCheckException('CouldNotListResource',
                                      'Could not list resource %s' % kind)

    def check_node(self):
        """Gather diagnostic information on a node and perform connectivity
        checks on pods and services."""
        node_name = self.get_var('openshift', 'node', 'nodename',
                                 default=None)
        if not node_name:
            self.register_failure('Could not determine node name.')
            return

        # The "openvswitch" container uses the host netnamespace, but the
        # host file system may not have the ovs-appctl and ovs-ofctl
        # binaries, which we use for some diagnostics.  Thus we run these
        # binaries inside the container, and to that end, we need to
        # determine its container id.
        exec_in_ovs_container = self.get_container_exec_command(
            'openvswitch', 'openshift-sdn')

        if self.want_full_results:
            try:
                service_prefix = self.get_var('openshift_service_type')
                if self._templar is not None:
                    service_prefix = self._templar.template(service_prefix)
                self.save_service_logs('%s-node' % service_prefix)

                if self.get_var('openshift_use_crio', default=False):
                    self.save_command_output('crio-unit-file',
                                             ['/bin/systemctl',
                                              'cat', 'crio.service'])
                    self.save_command_output('crio-ps',
                                             ['/bin/crictl', 'ps'])

                if not self.get_var('openshift_use_crio_only', default=False):
                    self.save_command_output('docker-unit-file',
                                             ['/bin/systemctl',
                                              'cat', 'docker.service'])
                    self.save_command_output('docker-ps',
                                             ['/bin/docker', 'ps'])

                self.save_command_output('flows', exec_in_ovs_container +
                                         ['/bin/ovs-ofctl', '-O',
                                          'OpenFlow13', 'dump-flows', 'br0'])
                self.save_command_output('ovs-show', exec_in_ovs_container +
                                         ['/bin/ovs-ofctl', '-O',
                                          'OpenFlow13', 'show', 'br0'])

                self.save_command_output('tc-qdisc',
                                         ['/sbin/tc', 'qdisc', 'show'])
                self.save_command_output('tc-class',
                                         ['/sbin/tc', 'class', 'show'])
                self.save_command_output('tc-filter',
                                         ['/sbin/tc', 'filter', 'show'])
            except OpenShiftCheckException as exc:
                self.register_failure(exc)

        subnets = {hostsubnet['metadata']['name']: hostsubnet['subnet']
                   for hostsubnet in self.get_resource('hostsubnets')}

        subnet = subnets.get(node_name, None)
        if subnet is None:
            self.register_failure('Node %s has no hostsubnet.' % node_name)
            return

        subnet = six.text_type(subnet)
        address = ipaddress.ip_network(subnet)[1]

        for remote_node in self.get_resource('nodes'):
            remote_node_name = remote_node['metadata']['name']
            if remote_node_name == node_name:
                continue

            remote_subnet = subnets.get(remote_node_name, None)
            if remote_subnet is None:
                continue

            remote_subnet = six.text_type(remote_subnet)
            remote_address = ipaddress.ip_network(remote_subnet)[1]
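            # ofproto/trace simulates how br0's flow tables would handle a
            # packet without actually sending one; in_port=2 is the OpenFlow
            # port that openshift-sdn normally assigns to tun0, the
            # host-side port of the SDN bridge.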
            self.save_command_output(
                'trace_node_%s_to_node_%s' % (node_name, remote_node_name),
                exec_in_ovs_container +
                ['/bin/ovs-appctl', 'ofproto/trace', 'br0',
                 'in_port=2,reg0=0,ip,nw_src=%s,nw_dst=%s' %
                 (address, remote_address)])

            try:
                self.save_command_output('ping_node_%s_to_node_%s' %
                                         (node_name, remote_node_name),
                                         ['/bin/ping', '-c', '1', '-W', '2',
                                          str(remote_address)])
            except OpenShiftCheckException:
                self.register_failure('Node %s cannot ping node %s.' %
                                      (node_name, remote_node_name))

    def get_container_exec_command(self, container_name, namespace):
        """Return an array comprising a command and arguments that can be
        used to execute commands inside the specified container running in a
        pod in the specified namespace."""
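        # In both branches, -l restricts the listing to the most recently
        # created container, -a includes containers that are not running,
        # and -q prints only the container id.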
        if self.get_var('openshift_use_crio', default=False):
            container_id = self.read_command_output([
                '/bin/crictl', 'ps', '-l', '-a', '-q',
                '--label=io.kubernetes.container.name=%s' % container_name,
                '--label=io.kubernetes.pod.namespace=%s' % namespace
            ])
            command = ['/bin/crictl', 'exec', container_id]
        else:
            container_id = self.read_command_output([
                '/bin/docker', 'ps', '-l', '-a', '-q',
                '--filter=label=io.kubernetes.container.name=%s'
                % container_name,
                '--filter=label=io.kubernetes.pod.namespace=%s' % namespace
            ])
            command = ['/bin/docker', 'exec', container_id]

        return command

    def save_service_logs(self, service_name):
        """Save the first 5 minutes of logs after the specified service
        started and the last 5 minutes of logs for that service."""
        time_fmt = '%Y-%m-%d %H:%M:%S'

        out = self.read_command_output(['systemctl', 'show', service_name,
                                        '-p', 'ExecMainStartTimestamp'])
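        # systemctl prints a line like
        # "ExecMainStartTimestamp=Mon 2018-01-01 12:00:00 UTC"; the value
        # is empty if the service has not been started.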
        start_timestamp = out.strip().split('=', 1)[1]
        if len(start_timestamp) == 0:
            self.register_failure('%s is not started.' % service_name)
            return

        # The timestamp should be in the format "%a %Y-%m-%d %H:%M:%S %Z".
        # However, Python cannot reliably parse timezone names
        # (see <https://bugs.python.org/issue22377>), so we must drop the
        # timezone name before parsing the timestamp.
        start_timestamp = ' '.join(start_timestamp.split()[0:3])

        since_date = datetime.datetime.strptime(start_timestamp,
                                                '%a %Y-%m-%d %H:%M:%S')
        until_date = since_date + datetime.timedelta(minutes=5)
        since = since_date.strftime(time_fmt)
        until = until_date.strftime(time_fmt)
        start_logs = self.read_command_output(['/bin/journalctl',
                                               '"--since=%s"' % since,
                                               '"--until=%s"' % until])

        out = self.read_command_output(['/bin/journalctl', '-u',
                                        service_name, '-n', '1', '-q'])
        (since, until) = SDNCheck.compute_log_interval_from(out)
        last_logs = self.read_command_output(['/bin/journalctl',
                                              '-u', service_name,
                                              '"--since=%s"' % since,
                                              '"--until=%s"' % until])

        self.register_file(service_name, (start_logs + '\n...\n' + last_logs))

    def check_node_kubelet(self, node):
        """Check that the host can find the address of the given node,
        resolve that address, and connect to the node's kubelet."""
        name = node['metadata']['name']
        preferred_addr = SDNCheck.get_node_preferred_address(node)
        if not preferred_addr:
            self.register_failure('Node %s: no preferred address' % name)
            return

        internal_addr = None
        for address in node.get('status', {}).get('addresses', []):
            if address.get('type') == 'InternalIP':
                internal_addr = address.get('address')
                break

        if not internal_addr:
            self.register_failure('Node %s: no IP address in OpenShift' %
                                  name)
        else:
            try:
                resolved_addr = self.resolve_address(preferred_addr)
            except OpenShiftCheckException as exc:
                self.register_failure(exc)
            else:
                if resolved_addr != internal_addr:
                    self.register_failure(
                        ('Node %s: the IP address in OpenShift (%s)' +
                         ' does not match DNS/hosts (%s)') %
                        (name, internal_addr, resolved_addr))
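        # 10250 is the default port on which the kubelet listens.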
        url = 'http://%s:%d' % (preferred_addr, 10250)
        result = self.execute_module('uri', dict(url=url))
        if result.get('rc', 0) != 0 or result.get('failed'):
            self.register_failure(
                'Kubelet on node %s is not responding: %s' %
                (name, result.get('msg', 'unknown error')))

    @staticmethod
    def get_node_preferred_address(node):
        """Return a host name or address for the given node, or None.

        The host name or address is selected from the node's status.addresses
        field in accordance with the preference order used by the OpenShift
        master."""
        preferred_address_types = ['Hostname', 'InternalIP', 'ExternalIP']
        for address_type in preferred_address_types:
            for address in node.get('status', {}).get('addresses', []):
                if address.get('type') == address_type:
                    return address.get('address')

            if address_type == 'Hostname':
                hostname = node.get('metadata', {}) \
                               .get('labels', {}) \
                               .get('kubernetes.io/hostname', "")
                if len(hostname) > 0:
                    return hostname

        return None

    def resolve_address(self, addr):
        """Resolve the given host name or address to an IPv4 address using
        getent and return that address, or None."""
        command = ' '.join(['/bin/getent', 'ahostsv4', addr])
        try:
            out = self.read_command_output(command)
        except OpenShiftCheckException as exc:
            raise OpenShiftCheckException(
                'NameResolutionError',
                'Cannot resolve node %s: %s' % (addr, exc))
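        # getent ahostsv4 prints lines of the form
        # "10.0.0.1        STREAM hostname"; return the first address whose
        # canonical name matches the name we queried.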
        for line in out.splitlines():
            record = line.split()
            if record[1:3] == ['STREAM', addr]:
                return record[0]

        return None