pbs_anonutils.py 42 KB


  1. # coding: utf-8
  2. # Copyright (C) 1994-2018 Altair Engineering, Inc.
  3. # For more information, contact Altair at www.altair.com.
  4. #
  5. # This file is part of the PBS Professional ("PBS Pro") software.
  6. #
  7. # Open Source License Information:
  8. #
  9. # PBS Pro is free software. You can redistribute it and/or modify it under the
  10. # terms of the GNU Affero General Public License as published by the Free
  11. # Software Foundation, either version 3 of the License, or (at your option) any
  12. # later version.
  13. #
  14. # PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
  15. # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  16. # FOR A PARTICULAR PURPOSE.
  17. # See the GNU Affero General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Affero General Public License
  20. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. #
  22. # Commercial License Information:
  23. #
  24. # For a copy of the commercial license terms and conditions,
  25. # go to: (http://www.pbspro.com/UserArea/agreement.html)
  26. # or contact the Altair Legal Department.
  27. #
  28. # Altair’s dual-license business model allows companies, individuals, and
  29. # organizations to create proprietary derivative works of PBS Pro and
  30. # distribute them - whether embedded or bundled with other software -
  31. # under a commercial license agreement.
  32. #
  33. # Use of Altair’s trademarks, including but not limited to "PBS™",
  34. # "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
  35. # trademark licensing policies.
  36. import logging
  37. import os
  38. import copy
  39. import shlex
  40. import re
  41. from ptl.lib.pbs_testlib import BatchUtils, PbsTypeFGCLimit
  42. from ptl.lib.pbs_ifl_mock import *
  43. from ptl.utils.pbs_dshutils import DshUtils
  # Canonical map keys under which aliases of the same kind of attribute
  # are grouped (PBSAnonymizer.__refactor_key maps attribute names onto them).
  ANON_USER_K = "user"
  ANON_GROUP_K = "group"
  ANON_HOST_K = "host"
  # Job name and account name reuse the PBS attribute names themselves
  ANON_JOBNAME_K = ATTR_name
  ANON_ACCTNAME_K = ATTR_A
  49. class PBSAnonymizer(object):
      """
      Holds and controls anonymizing operations of PBS data.

      The anonymizer operates on attributes or resources.
      Resources are identified by the resource name itself rather than
      the entire attribute name. For example, to obfuscate the values
      associated with a custom resource "foo" that could be set as
      resources_available.foo, resources_default.foo or Resource_List.foo,
      all that needs to be passed in is "foo" in the list to obfuscate.

      :param attr_delete: Attributes to delete
      :type attr_delete: str or list or dictionary or None
      :param resc_delete: Resources to delete
      :type resc_delete: str or list or dictionary or None
      :param attr_key: Attributes for which the attribute names themselves
                       should be obfuscated
      :type attr_key: list or None
      :param attr_val: Attributes for which the values should be obfuscated
      :type attr_val: list or None
      :param resc_key: Resources for which the resource names themselves
                       should be obfuscated
      :type resc_key: list or None
      :param resc_val: Resources for which the values should be obfuscated
      :type resc_val: list or None
      """
      # Class-level helpers, shared by every instance:
      # logger used for error/debug reporting
      logger = logging.getLogger(__name__)
      # source of random_str(), which produces the obfuscated replacements
      utils = BatchUtils()
      # source of create_temp_file(), used by the file anonymizers
      du = DshUtils()
  72. def __init__(self, attr_delete=None, resc_delete=None,
  73. attr_key=None, attr_val=None,
  74. resc_key=None, resc_val=None):
  75. # special cases
  76. self._entity = False
  77. self.job_sort_formula = None
  78. self.schedselect = None
  79. self.select = None
  80. self.set_attr_delete(attr_delete)
  81. self.set_resc_delete(resc_delete)
  82. self.set_attr_key(attr_key)
  83. self.set_attr_val(attr_val)
  84. self.set_resc_key(resc_key)
  85. self.set_resc_val(resc_val)
  86. self.anonymize = self.anonymize_batch_status
  87. # global anonymized mapping data
  88. self.gmap_attr_val = {}
  89. self.gmap_resc_val = {}
  90. self.gmap_attr_key = {}
  91. self.gmap_resc_key = {}
  92. self.num_bad_acct_records = 0
  93. def __get_anon_key(self, key, attr_map):
  94. """
  95. Get an anonymized string for the 'key' belonging to attr_map
  96. :param key: the key to anonymize
  97. :type key: String
  98. :param attr_map: the attr_map to which the key belongs
  99. :type attr_map: dict
  100. :returns: an anonymized string for the key
  101. """
  102. key = self.__refactor_key(key)
  103. if key in attr_map.keys():
  104. anon_key = attr_map[key]
  105. else:
  106. anon_key = self.utils.random_str(len(key))
  107. attr_map[key] = anon_key
  108. return anon_key
  109. @staticmethod
  110. def __refactor_key(key):
  111. """
  112. There are some attributes which are aliases of each other
  113. and others which are lists like user/group lists, lists of hosts etc.
  114. Set a common key for them.
  115. """
  116. key_lower = key.lower()
  117. if "user" in key_lower or key == "requestor":
  118. key = ANON_USER_K
  119. elif "group" in key_lower:
  120. key = ANON_GROUP_K
  121. elif "host" in key_lower:
  122. key = ANON_HOST_K
  123. elif key == "Name" or key == "Jobname":
  124. key = ANON_JOBNAME_K
  125. elif key == "account":
  126. key = ANON_ACCTNAME_K
  127. return key
  128. def __get_anon_value(self, key, value, kv_map):
  129. """
  130. Get an anonymied string for the 'value' belonging to the kv_map
  131. provided.
  132. The kv_map will be in the following format:
  133. key:{val1:anon_val1, val2:anon_val2, ...}
  134. :param key: the key for this value
  135. :type key: String
  136. :param value: the value to anonymize
  137. :type value: String
  138. :param kv_map: the kv_map to which the key belongs
  139. :type kv_map: dict
  140. :returns: an anonymized string for the value
  141. """
  142. if key == "project" and value == "_pbs_project_default":
  143. return "_pbs_project_default"
  144. # Deal with attributes which have a list of values
  145. if key in (ATTR_u, ATTR_managers, ATTR_M, ATTR_g, ATTR_aclResvhost,
  146. ATTR_aclhost, ATTR_auth_g, ATTR_auth_u):
  147. value_temp = "".join(value.split())
  148. value_list = value_temp.split(",")
  149. elif key == ATTR_exechost:
  150. value_list = []
  151. value_list_temp = value.split("+")
  152. for item in value_list_temp:
  153. value_list.append(item.split("/")[0])
  154. else:
  155. value_list = [value]
  156. key = self.__refactor_key(key)
  157. # Go through the list of values and anonymize each in the value string
  158. for val in value_list:
  159. if "@" in val:
  160. # value if of type "user@host"
  161. # anonymize the user and host parts separately
  162. if ANON_HOST_K in self.attr_val:
  163. try:
  164. user, host = val.split("@")
  165. host = self.__get_anon_value(ANON_HOST_K, host,
  166. self.gmap_attr_val)
  167. user = self.__get_anon_value(ANON_USER_K, user,
  168. self.gmap_attr_val)
  169. anon_val = user + "@" + host
  170. value = value.replace(val, anon_val)
  171. continue
  172. except Exception:
  173. pass
  174. if key in kv_map:
  175. value_map = kv_map[key]
  176. anon_val = self.__get_anon_key(val, value_map)
  177. else:
  178. anon_val = self.utils.random_str(len(val))
  179. kv_map[key] = {val: anon_val}
  180. value = value.replace(val, anon_val)
  181. return value
  182. def _initialize_key_map(self, keys):
  183. k = {}
  184. if keys is not None:
  185. if isinstance(keys, dict):
  186. return keys
  187. elif isinstance(keys, list):
  188. for i in keys:
  189. k[i] = None
  190. elif isinstance(keys, str):
  191. for i in keys.split(","):
  192. k[i] = None
  193. else:
  194. self.logger.error("unhandled map type")
  195. k = {None: None}
  196. return k
  197. def _initialize_value_map(self, keys):
  198. k = {}
  199. if keys is not None:
  200. if isinstance(keys, dict):
  201. return keys
  202. elif isinstance(keys, list):
  203. for i in keys:
  204. k[i] = {}
  205. elif isinstance(keys, str):
  206. for i in keys.split(","):
  207. k[i] = {}
  208. else:
  209. self.logger.error("unhandled map type")
  210. k = {None: None}
  211. return k
  212. def set_attr_delete(self, ad):
  213. """
  214. Name of attributes to delete
  215. :param ad: Attributes to delete
  216. :type ad: str or list or dictionary
  217. """
  218. self.attr_delete = self._initialize_value_map(ad)
  219. def set_resc_delete(self, rd):
  220. """
  221. Name of resources to delete
  222. :param rd: Resources to delete
  223. :type rd: str or list or dictionary
  224. """
  225. self.resc_delete = self._initialize_value_map(rd)
  226. def set_attr_key(self, ak):
  227. """
  228. Name of attributes to obfuscate.
  229. :param ak: Attribute keys
  230. :type ak: str or list or dictionary
  231. """
  232. self.attr_key = self._initialize_key_map(ak)
  233. def set_attr_val(self, av):
  234. """
  235. Name of attributes for which to obfuscate the value
  236. :param av: Attributes value to obfuscate
  237. :type av: str or list or dictionary
  238. """
  239. self.attr_val = self._initialize_value_map(av)
  240. if ("euser" or "egroup" or "project") in self.attr_val:
  241. self._entity = True
  242. def set_resc_key(self, rk):
  243. """
  244. Name of resources to obfuscate
  245. :param rk: Resource key
  246. :type rk: str or list or dictionary
  247. """
  248. self.resc_key = self._initialize_key_map(rk)
  249. def set_resc_val(self, rv):
  250. """
  251. Name of resources for which to obfuscate the value
  252. :param rv: Resource value to obfuscate
  253. :type rv: str or list or dictionary
  254. """
  255. self.resc_val = self._initialize_value_map(rv)
  256. def set_anon_map_file(self, name):
  257. """
  258. Name of file in which to store anonymized map data.
  259. This file is meant to remain private to a site as it
  260. contains the sensitive anonymized data.
  261. :param name: Name of file to which anonymized data to store.
  262. :type name: str
  263. """
  264. self.anon_map_file = name
  265. def anonymize_resource_group(self, filename):
  266. """
  267. Anonymize the user and group fields of a resource
  268. group filename
  269. :param filename: Resource group filename
  270. :type filename: str
  271. """
  272. anon_rg = []
  273. try:
  274. f = open(filename)
  275. lines = f.readlines()
  276. f.close()
  277. except IOError:
  278. self.logger.error("Error processing " + filename)
  279. return None
  280. for data in lines:
  281. data = data.strip()
  282. if data:
  283. if data[0] == "#":
  284. continue
  285. _d = data.split()
  286. ug = _d[0]
  287. if ":" in ug:
  288. (euser, egroup) = ug.split(":")
  289. else:
  290. euser = ug
  291. egroup = None
  292. if "euser" not in self.attr_val:
  293. anon_euser = euser
  294. else:
  295. anon_euser = None
  296. if ANON_USER_K in self.gmap_attr_val:
  297. if euser in self.gmap_attr_val[ANON_USER_K]:
  298. anon_euser = self.gmap_attr_val[ANON_USER_K][euser]
  299. else:
  300. self.gmap_attr_val[ANON_USER_K] = {}
  301. if euser is not None and anon_euser is None:
  302. anon_euser = self.utils.random_str(len(euser))
  303. self.gmap_attr_val[ANON_USER_K][euser] = anon_euser
  304. if "egroup" not in self.attr_val:
  305. anon_egroup = egroup
  306. else:
  307. anon_egroup = None
  308. if egroup is not None:
  309. if ANON_GROUP_K in self.gmap_attr_val:
  310. if egroup in self.gmap_attr_val[ANON_GROUP_K]:
  311. anon_egroup = (self.gmap_attr_val[ANON_GROUP_K]
  312. [egroup])
  313. else:
  314. self.gmap_attr_val[ANON_GROUP_K] = {}
  315. if egroup is not None and anon_egroup is None:
  316. anon_egroup = self.utils.random_str(len(egroup))
  317. self.gmap_attr_val[ANON_GROUP_K][egroup] = anon_egroup
  318. # reconstruct the fairshare info by combining euser and egroup
  319. out = [anon_euser]
  320. if anon_egroup is not None:
  321. out[0] += ":" + anon_egroup
  322. # and appending the rest of the original line
  323. out.append(_d[1])
  324. if len(_d) > 1:
  325. p = _d[2].strip()
  326. if (ANON_USER_K in self.gmap_attr_val and
  327. p in self.gmap_attr_val[ANON_USER_K]):
  328. out.append(self.gmap_attr_val[ANON_USER_K][p])
  329. else:
  330. out.append(_d[2])
  331. if len(_d) > 2:
  332. out += _d[3:]
  333. anon_rg.append(" ".join(out))
  334. return anon_rg
  def anonymize_resource_def(self, resources):
      """
      Anonymize the resource definition.

      Every resource named in the resource key map and present in
      'resources' is re-registered under its obfuscated name: the
      entry is copied, renamed through set_name() and stored under the
      new name, while the original entry is removed.

      :param resources: resource definitions keyed by resource name
      :type resources: dict
      :returns: the same dictionary, modified in place
      """
      if not self.resc_key:
          return resources

      for curr_anon_resc, val in self.resc_key.items():
          if curr_anon_resc not in resources:
              continue

          tmp_resc = copy.copy(resources[curr_anon_resc])
          del resources[curr_anon_resc]

          if val is None:
              if curr_anon_resc in self.gmap_resc_key:
                  val = self.gmap_resc_key[curr_anon_resc]
              else:
                  val = self.utils.random_str(len(curr_anon_resc))

          # Always remember the name in use, including a freshly
          # generated one, so the mapping is not lost
          if curr_anon_resc not in self.gmap_resc_key:
              self.gmap_resc_key[curr_anon_resc] = val

          tmp_resc.set_name(val)
          resources[val] = tmp_resc

      return resources
  def __anonymize_fgc(self, d, attr, ar, val):
      """
      Anonymize an FGC limit value.

      The value is a comma separated list of limits. The entity name
      of each limit is replaced by its obfuscated name, except for the
      PBS_GENERIC and PBS_ALL entities which are kept as is. The
      result is stored back in d[attr].

      :param d: the batch status entry being anonymized
      :type d: dict
      :param attr: name of the limit attribute
      :type attr: str
      :param ar: map of attribute name -> {value: obfuscated value}
      :type ar: dict
      :param val: the limit value to anonymize
      :type val: str
      """
      # FGC entity type -> name of the attribute holding such entities
      m = {"u": "euser", "g": "egroup", "p": "project"}

      nfgc = []
      for lim in val.split(","):
          _fgc = PbsTypeFGCLimit(attr, lim)
          ename = _fgc.entity_name
          if ename in ("PBS_GENERIC", "PBS_ALL"):
              nfgc.append(lim)
              continue

          obf_ename = ename
          nm = m.get(_fgc.entity_type)
          if nm is not None:
              known = self.gmap_attr_val.setdefault(nm, {})
              if ename in known:
                  obf_ename = known[ename]
              else:
                  # Not seen before: take the configured mapping if
                  # there is one, else generate a replacement. This
                  # must also happen when other entities of this type
                  # are already mapped, otherwise the name would be
                  # written out in clear.
                  if nm in ar and ename in ar[nm]:
                      obf_ename = ar[nm][ename]
                  else:
                      obf_ename = self.utils.random_str(len(ename))
                  known[ename] = obf_ename

          _fgc.entity_name = obf_ename
          nfgc.append(_fgc.__val__())

      d[attr] = ",".join(nfgc)
  387. def __anonymize_attr_val(self, d, attr, ar, name, val):
  388. """
  389. Obfuscate an attribute/resource values
  390. """
  391. # don't obfuscate default project
  392. if attr == "project" and val == "_pbs_project_default":
  393. return
  394. nstr = []
  395. if "." in attr:
  396. m = self.gmap_resc_val
  397. else:
  398. m = self.gmap_attr_val
  399. if val in ar[name]:
  400. nstr.append(ar[name][val])
  401. if name in self.lmap:
  402. self.lmap[name][val] = ar[name][val]
  403. else:
  404. self.lmap[name] = {val: ar[name][val]}
  405. if name not in m:
  406. m[name] = {val: ar[name][val]}
  407. elif val not in m[name]:
  408. m[name][val] = ar[name][val]
  409. else:
  410. # Obfuscate by randomizing with a value of the same length
  411. tmp_v = val.split(",")
  412. for v in tmp_v:
  413. if v in ar[name]:
  414. r = ar[name][v]
  415. elif name in m and v in m[name]:
  416. r = m[name][v]
  417. else:
  418. r = self.utils.random_str(len(v))
  419. if not isinstance(ar[name], dict):
  420. ar[name] = {}
  421. ar[name][v] = r
  422. self.lmap[name] = {v: r}
  423. if name not in m:
  424. m[name] = {v: r}
  425. elif v not in m[name]:
  426. m[name][v] = r
  427. nstr.append(r)
  428. if d is not None:
  429. d[attr] = ",".join(nstr)
  430. def __anonymize_attr_key(self, d, attr, ar, name, res):
  431. """
  432. Obfuscate an attribute/resource key
  433. """
  434. if res is not None:
  435. m = self.gmap_resc_key
  436. else:
  437. m = self.gmap_attr_key
  438. if not ar[name]:
  439. if name in m:
  440. ar[name] = m[name]
  441. else:
  442. randstr = self.utils.random_str(len(name))
  443. ar[name] = randstr
  444. m[name] = randstr
  445. if d is not None:
  446. tmp_val = d[attr]
  447. del d[attr]
  448. if res is not None:
  449. d[res + "." + ar[name]] = tmp_val
  450. else:
  451. d[ar[name]] = tmp_val
  452. if name not in self.lmap:
  453. self.lmap[name] = ar[name]
  454. if name not in m:
  455. m[name] = ar[name]
  456. def anonymize_batch_status(self, data=None):
  457. """
  458. Anonymize arbitrary batch_status data
  459. :param data: Batch status data
  460. :type data: List or dictionary
  461. """
  462. if not isinstance(data, (list, dict)):
  463. self.logger.error("data expected to be dict or list")
  464. return None
  465. if isinstance(data, dict):
  466. dat = [data]
  467. else:
  468. dat = data
  469. # Local mapping data used to store obfuscation mapping data for this
  470. # specific item, d
  471. self.lmap = {}
  472. # loop over each "batch_status" entry to obfuscate
  473. for d in dat:
  474. if self.attr_delete is not None:
  475. for todel in self.attr_delete:
  476. if todel in d:
  477. del d[todel]
  478. if self.resc_delete is not None:
  479. for todel in self.resc_delete:
  480. for tmpk in d.keys():
  481. if "." in tmpk and todel == tmpk.split(".")[1]:
  482. del d[tmpk]
  483. # Loop over each object's attributes, this is where the special
  484. # cases are handled (e.g., FGC limits, formula, select spec...)
  485. for attr in d:
  486. val = d[attr]
  487. if "." in attr:
  488. (res_type, res_name) = attr.split(".")
  489. else:
  490. res_type = None
  491. res_name = attr
  492. if res_type is not None:
  493. if self._entity and (attr.startswith("max_run") or
  494. attr.startswith("max_queued")):
  495. self.__anonymize_fgc(d, attr, self.attr_val,
  496. val)
  497. if res_name in self.resc_val:
  498. if (attr.startswith("max_run") or
  499. attr.startswith("max_queued")):
  500. self.__anonymize_fgc(d, attr, self.attr_val,
  501. val)
  502. self.__anonymize_attr_val(d, attr, self.resc_val,
  503. res_name, val)
  504. if res_name in self.resc_key:
  505. self.__anonymize_attr_key(d, attr, self.resc_key,
  506. res_name, res_type)
  507. else:
  508. if attr in self.attr_val:
  509. self.__anonymize_attr_val(d, attr, self.attr_val,
  510. attr, val)
  511. if attr in self.attr_key:
  512. self.__anonymize_attr_key(d, attr, self.attr_key,
  513. attr, None)
  514. if ((attr in ("job_sort_formula", "schedselect",
  515. "select")) and self.resc_key):
  516. for r in self.resc_key:
  517. if r in val:
  518. if r not in self.gmap_resc_key:
  519. self.gmap_resc_key[
  520. r] = self.utils.random_str(len(r))
  521. val = val.replace(r, self.gmap_resc_key[r])
  522. setattr(self, attr, val)
  523. d[attr] = val
  524. @staticmethod
  525. def __verify_key(line, key):
  526. """
  527. Verify that a given key is actually a key in the context of the line
  528. given.
  529. :param line: the line to check in
  530. :type line: String
  531. :param key: the key to find
  532. :type key: String
  533. :returns a tuple of (key index, 1st character of key's value)
  534. :returns None if the key is invalid
  535. """
  536. line_len = len(line)
  537. key_len = len(key)
  538. key_index = line.find(key, 0, line_len)
  539. line_nospaces = "".join(line.split())
  540. len_nospaces = len(line_nospaces)
  541. key_idx_nospaces = line_nospaces.find(key, 0, len_nospaces)
  542. value_char = None
  543. # Find all instances of the string representing key in the line
  544. # Find the instance which is a valid key
  545. while key_index >= 0 and key_index < line_len:
  546. valid_key = True
  547. # Make sure that the characters before & after are not alpanum
  548. if key_index != 0:
  549. index_before = key_index - 1
  550. char_before = line[index_before]
  551. if char_before.isalnum() is True:
  552. valid_key = False
  553. else:
  554. char_before = None
  555. if valid_key is True:
  556. if key_index < line_len:
  557. index_after = key_index + key_len
  558. char_after = line[index_after]
  559. if char_after.isalnum() is True:
  560. valid_key = False
  561. else:
  562. char_after = None
  563. if valid_key is True:
  564. # if 'char_after' is not "=", then the characters before
  565. # and after should be the delimiter, and be equal
  566. if char_before is not None and char_after is not None:
  567. if char_after != "=":
  568. if char_before != char_after:
  569. valid_key = False
  570. if valid_key is True:
  571. # Now, let's look at the whitespace stripped line
  572. index_after = key_idx_nospaces + key_len
  573. if index_after >= len_nospaces:
  574. # Nothing after the key, can't be a key
  575. valid_key = False
  576. else:
  577. # Find a valid operator after the key
  578. # valid operators: =, +=, -=, ==
  579. if line_nospaces[index_after] != "=":
  580. # Check for this case: "key +=/-=/== value"
  581. if line_nospaces[index_after] in ("+", "-"):
  582. index_after = index_after + 1
  583. if line_nospaces[index_after] != "=":
  584. valid_key = False
  585. else:
  586. valid_key = False
  587. if valid_key is True:
  588. val_idx_nospaces = index_after + 1
  589. if val_idx_nospaces >= len_nospaces:
  590. # There's no value!, can't be a valid key
  591. valid_key = False
  592. if valid_key is False:
  593. # Find the next instance of the key
  594. key_index = line.find(key, key_index + len(key), line_len)
  595. key_idx_nospaces = line_nospaces.find(key,
  596. key_idx_nospaces +
  597. len(key),
  598. len_nospaces)
  599. else:
  600. # Seems like a valid key!
  601. # Break out of the loop
  602. value_char = line_nospaces[val_idx_nospaces]
  603. break
  604. if key_index == -1 or key_idx_nospaces == -1:
  605. return None
  606. return (key_index, value_char)
  607. def __get_value(self, line, key):
  608. """
  609. Get the 'value' of a kv pair for the key given, from the line given
  610. :param line: the line to search in
  611. :type line: String
  612. :param key: the key for the value
  613. :type key: String
  614. :returns: String containing the value or None
  615. """
  616. # Check if the line is of type:
  617. # <attribute name> = <value>
  618. line_list_spaces = line.split()
  619. if line_list_spaces is not None:
  620. first_word = line_list_spaces[0]
  621. if key == first_word:
  622. # Check that this word is followed by an '=' sign
  623. equals_sign = line_list_spaces[1]
  624. if equals_sign == "=":
  625. # Ok, we are going to assume that this is enough to
  626. # determine that this is the correct type
  627. # return everything after the '=" as value
  628. val_index = line.index("=") + 1
  629. value = line[val_index:].strip()
  630. return value
  631. # Check that a valid instance of this key exists in the string
  632. kv = self.__verify_key(line, key)
  633. if kv is None:
  634. return None
  635. key_index, val_char = kv
  636. # Assumption: the character before the key is the delimiter
  637. # for the k-v pair
  638. delimiter = line[key_index - 1]
  639. if delimiter is None:
  640. # Hard luck, now there's no way to know, let's just assume
  641. # that space is the delimiter and hope for the best
  642. delimiter = " "
  643. # Determine the value's start index
  644. index_after_key = key_index + len(key)
  645. value_index = line[index_after_key:].find(val_char) + index_after_key
  646. # Get the value
  647. lexer = shlex.shlex(line[value_index:], posix=True)
  648. lexer.whitespace = delimiter
  649. lexer.whitespace_split = True
  650. try:
  651. value = lexer.get_token()
  652. except ValueError:
  653. # Sometimes, the data can be incoherent with things like
  654. # Unclosed quotes, which makes get_token() throw an exception
  655. # Just return None
  656. return None
  657. # Strip the value of any trailing whitespaces (like newlines)
  658. value = value.rstrip()
  659. return value
  660. @staticmethod
  661. def __delete_kv(line, key, value):
  662. """
  663. Delete a key-value pair from a line
  664. If after deleting the k-v pair, the left over string has
  665. no alphanumeric characters, then delete the line
  666. :param line: the line in question
  667. :type line: String
  668. :param key: the key ofo the kv pair
  669. :type key: String
  670. :param value: the value of the kv pair
  671. :type value: String
  672. :returns: the line without the kv pair
  673. :returns: None if the line should be deleted
  674. """
  675. key_index = line.find(key)
  676. index_after_key = key_index + len(key)
  677. line_afterkey = line[index_after_key:]
  678. value_index = line_afterkey.find(value) + index_after_key
  679. # find the index of the last character of value
  680. end_index = value_index + len(value)
  681. # Find the start index of the kv pair
  682. # Also include the character before the key
  683. # This will remove an extra delimiter that would be
  684. # left after the kv pair is deleted
  685. start_index = key_index - 1
  686. if start_index < 0:
  687. start_index = 0
  688. # Remove the kv pair
  689. line = line[:start_index] + line[end_index:]
  690. # Check if there's any alphanumeric characters left in the line
  691. if re.search("[A-Za-z0-9]", line) is None:
  692. # Delete the whole line
  693. return None
  694. return line
  695. def __add_alias_attr(self, key, alias_key):
  696. """
  697. Some attributes have aliases. Added alias for a given attribute to the
  698. global maps
  699. :param key: the original attribute
  700. :type key: str
  701. :param alias_key: the alias
  702. :type alias_key: str
  703. """
  704. if key in self.attr_delete:
  705. self.attr_delete[alias_key] = self.attr_delete[key]
  706. if key in self.attr_key:
  707. self.attr_key[alias_key] = self.attr_key[key]
  708. if key in self.attr_val:
  709. self.attr_val[alias_key] = self.attr_val[key]
  710. if key in self.resc_delete:
  711. self.resc_delete[alias_key] = self.resc_delete[key]
  712. if key in self.resc_key:
  713. self.resc_key[alias_key] = self.resc_key[key]
  714. if key in self.resc_val:
  715. self.resc_val[alias_key] = self.resc_val[key]
  def anonymize_file_tabular(self, filename, extension=".anon",
                             inplace=False):
      """
      Anonymize pbs short format outputs (tabular form)
      (e.g - qstat, pbsnodes -aS)
      The 'titles' of various columns are used to look up keys inside the
      global attribute maps and they are anonymized/removed accordingly.

      Warning: only works with PBS tabular outputs, not generic.

      :param filename: Name of the file to anonymize
      :type filename: str
      :param extension: Extension of the anonymized file
      :type extension: str
      :param inplace: If true returns the original file name for
                      which contents have been replaced
      :type inplace: bool
      :returns: a str object containing filename of the anonymized file;
                the original filename when no table header followed by
                a dash line could be found
      """
      fn = self.du.create_temp_file()

      # qstat outputs sometimes have different names for some attributes
      self.__add_alias_attr(ATTR_euser, "User")
      self.__add_alias_attr(ATTR_euser, "Username")
      self.__add_alias_attr(ATTR_name, "Jobname")
      self.__add_alias_attr(ATTR_name, "Name")

      # pbsnodes -aS output has a 'host' field which should be anonymized
      self.__add_alias_attr(ATTR_NODE_Host, "host")

      header = None
      dash_line = None
      with open(filename) as f, open(fn, "w") as nf:
          # Get the header and the line with '-'s
          # Also write out the header and dash lines to the output file
          for line in f:
              nf.write(line)
              line_strip = line.strip()
              if not line_strip:
                  continue
              if line_strip[0].isalpha():
                  header = line
                  continue
              # Dash line is the line after header
              if header is not None:
                  dash_line = line
                  break

          if dash_line is not None:
              # Each run of dashes gives the exact (start, end) position
              # of a column, whatever the spacing between the columns
              col_spans = [m.span() for m in re.finditer(r"\S+", dash_line)]
              last_col = len(col_spans) - 1

              # Find out the columns to anonymize/delete
              del_columns = set()
              anon_columns = {}
              for col_index, (start, _) in enumerate(col_spans):
                  # A title runs up to the start of the next column
                  if col_index < last_col:
                      title_end = col_spans[col_index + 1][0]
                  else:
                      title_end = len(header)
                  title = header[start:title_end].strip()

                  if title in self.attr_delete:
                      # Need to delete this whole column
                      del_columns.add(col_index)
                  elif title in self.attr_val:
                      # Need to anonymize all values in the column
                      anon_columns[col_index] = title

              # Go through the file and anonymize/delete columns
              for line in f:
                  # Keep the line terminator out of the way of the edits
                  body = line.rstrip("\r\n")
                  eol = line[len(body):]

                  # Work from the last column to the first, so that a
                  # replacement of a different length cannot shift the
                  # columns still to be processed
                  for col_index in range(last_col, -1, -1):
                      start, end = col_spans[col_index]
                      if col_index == last_col:
                          # the last column extends to the end of the line
                          end = max(end, len(body))
                      cell = body[start:end]

                      if col_index in del_columns:
                          # Just replace the value by blank spaces
                          body = (body[:start] + " " * len(cell) +
                                  body[end:])
                      elif col_index in anon_columns:
                          value_strip = cell.strip()
                          if not value_strip:
                              continue
                          anon_val = self.__get_anon_value(
                              anon_columns[col_index],
                              value_strip,
                              self.gmap_attr_val)
                          # Only touch this cell, not other occurrences
                          # of the same text on the line
                          body = (body[:start] +
                                  cell.replace(value_strip, anon_val, 1) +
                                  body[end:])

                  nf.write(body + eol)

      if dash_line is None:
          # Couldn't find the table: nothing was anonymized, discard the
          # scratch copy and hand back the original file
          try:
              os.remove(fn)
          except OSError:
              pass
          return filename

      if inplace:
          out_filename = filename
      else:
          out_filename = filename + extension

      os.rename(fn, out_filename)

      return out_filename
  def anonymize_file_kv(self, filename, extension=".anon", inplace=False):
      """
      Anonymize a file which has data in the form of key-value pairs.
      Attributes to delete are removed, attribute/resource names and
      values listed in the configuration maps are replaced by their
      anonymized counterparts, and IP addresses are obfuscated.

      :param filename: Name of the file to anonymize
      :type filename: str
      :param extension: Extension of the anonymized file
      :type extension: str
      :param inplace: If true returns the original file name for
                      which contents have been replaced
      :type inplace: bool
      :returns: a str object containing filename of the anonymized file
      """
      fn = self.du.create_temp_file()

      # Four dot-separated groups of 1 to 3 digits, not part of a longer
      # run of digits. Compiled once, as a raw string so that the
      # backslashes reach the regex engine untouched.
      ip_pattern = re.compile(r"(?<!\d)\d{1,3}(?:\.\d{1,3}){3}(?!\d)")

      def anonymize_ip(match):
          return self.__get_anon_key(match.group(0), self.gmap_attr_key)

      key_maps = ((self.attr_key, self.gmap_attr_key),
                  (self.resc_key, self.gmap_resc_key))
      value_maps = ((self.attr_val, self.gmap_attr_val),
                    (self.resc_val, self.gmap_resc_val))

      with open(filename) as f, open(fn, "w") as nf:
          delete_line = False
          for line in f:
              # Check if this is a line extension for an attr being deleted
              if delete_line is True and line[0] == "\t":
                  continue

              delete_line = False

              # Check if any of the attributes to delete are in the line
              for key in self.attr_delete:
                  if key in line:
                      value = self.__get_value(line, key)
                      if value is None:
                          continue
                      # Delete the key-value pair
                      line = self.__delete_kv(line, key, value)
                      if line is None:
                          delete_line = True
                          break

              if delete_line is True:
                  continue

              # Anonymize the names of attributes, then of resources
              for configured, anon_map in key_maps:
                  for key in configured:
                      if key in line:
                          if self.__verify_key(line, key) is None:
                              continue
                          anon_key = self.__get_anon_key(key, anon_map)
                          line = line.replace(key, anon_key)

              # Anonymize the values of attributes, then of resources
              for configured, anon_map in value_maps:
                  for key in configured:
                      if key in line:
                          value = self.__get_value(line, key)
                          # An empty value has nothing to hide, and
                          # replacing "" would corrupt the whole line
                          if not value:
                              continue
                          anon_value = self.__get_anon_value(key, value,
                                                             anon_map)
                          line = line.replace(value, anon_value)

              # Anonymize every IP address found on the line
              line = ip_pattern.sub(anonymize_ip, line)

              nf.write(line)

      if inplace:
          out_filename = filename
      else:
          out_filename = filename + extension

      os.rename(fn, out_filename)

      return out_filename
  def anonymize_accounting_log(self, logfile):
      """
      Anonymize the accounting log

      Each record is expected to look like
      ``%Y/%m/%d %H:%M:%S;<Key>;<Id>;<key1=val1> <key2=val2> ...``.
      Records of type "A" and "L" are passed through untouched (only
      stripped of surrounding whitespace). For every other record the
      attribute/resource keys and values configured on this object are
      replaced by their anonymized counterparts. Blank lines are
      ignored; records that cannot be parsed are counted in
      ``num_bad_acct_records``, logged at debug level and left out of
      the result.

      :param logfile: Accounting log file
      :type logfile: str
      :returns: list of anonymized records (one str per record), or
                None if the log file could not be opened
      """
      try:
          f = open(logfile)
      except IOError:
          self.logger.error("Error processing " + logfile)
          return None

      anon_data = []
      # The context manager closes the log even if a record raises
      with f:
          # NOTE(review): presumably registers the accounting-log
          # spellings as aliases of the real attribute names - confirm
          # against __add_alias_attr
          self.__add_alias_attr(ATTR_euser, "user")
          self.__add_alias_attr(ATTR_euser, "requestor")
          self.__add_alias_attr(ATTR_egroup, "group")
          self.__add_alias_attr(ATTR_A, "account")

          for data in f:
              if not data.strip():
                  # Nothing to anonymize on a blank line
                  continue

              # accounting log format is
              # %Y/%m/%d %H:%M:%S;<Key>;<Id>;<key1=val1> <key2=val2> ...
              curr = data.split(";", 3)
              if len(curr) > 1 and curr[1] in ("A", "L"):
                  anon_data.append(data.strip())
                  continue

              # Split the attribute list into [key, value] pairs
              kvl_list = None
              if len(curr) == 4:
                  try:
                      # A real list rather than map(): the pairs edited
                      # below must be the very ones joined back later,
                      # and a lazy map object would already be consumed
                      # NOTE(review): shlex.split drops the quotes
                      # around quoted values, so they are written back
                      # unquoted - confirm consumers tolerate this
                      kvl_list = [n.split("=", 1)
                                  for n in shlex.split(curr[3].strip())]
                  except ValueError:
                      # shlex could not tokenize (e.g. unbalanced quote)
                      kvl_list = None

              if kvl_list is None or any(len(p) != 2 for p in kvl_list):
                  self.num_bad_acct_records += 1
                  self.logger.debug("Bad accounting record found:\n" +
                                    data)
                  continue

              for kvl in kvl_list:
                  k, v = kvl
                  if k in self.attr_val:
                      kvl[1] = self.__get_anon_value(k, v,
                                                     self.gmap_attr_val)
                  if k in self.attr_key:
                      kvl[0] = self.__get_anon_key(k, self.gmap_attr_key)
                  if "." in k:
                      # <restype>.<resname>; split once so an extra dot
                      # in the resource part cannot break the unpacking
                      restype, resname = k.split(".", 1)
                      if resname in self.resc_val:
                          # Anonymize the record's value, not the
                          # resource name itself
                          kvl[1] = self.__get_anon_value(
                              resname, v, self.gmap_resc_val)
                      if resname in self.resc_key:
                          anon_rk = self.__get_anon_key(
                              resname, self.gmap_resc_key)
                          kvl[0] = restype + "." + anon_rk

              anon_data.append(";".join(curr[:3]) + ";" +
                               " ".join("=".join(n) for n in kvl_list))

      return anon_data
  def anonymize_sched_config(self, scheduler):
      """
      Anonymize the scheduler config

      Does nothing when no resource keys are configured for
      anonymization. Otherwise the scheduler's config comments are
      dropped, and every configured resource name is replaced by its
      anonymized name in the "resources" line and in the
      "job_sort_key" / "node_sort_key" entries. The same anonymized
      name (recorded in ``gmap_resc_key``) is used in all three places.

      :param scheduler: PBS scheduler object
      """
      if not self.resc_key:
          return

      # when anonymizing we get rid of the comments as they may contain
      # sensitive information
      scheduler._sched_config_comments = {}

      def anon_name(res):
          # One anonymized name per resource, created on first use and
          # remembered in gmap_resc_key so every mention agrees
          if res not in self.gmap_resc_key:
              self.gmap_resc_key[res] = self.utils.random_str(len(res))
          return self.gmap_resc_key[res]

      # If resources need to be anonymized then update the resources line
      # job_sort_key and node_sort_key
      sr = scheduler.get_resources()
      if sr:
          for i, sres in enumerate(sr):
              if sres in self.resc_key:
                  sr[i] = anon_name(sres)
          scheduler.sched_config["resources"] = ",".join(sr)

      for k in ["job_sort_key", "node_sort_key"]:
          if k not in scheduler.sched_config:
              continue
          sc_jsk = scheduler.sched_config[k]
          is_list = isinstance(sc_jsk, list)
          # A lone value is wrapped, not exploded into characters;
          # a list is edited in place
          entries = sc_jsk if is_list else [sc_jsk]
          for r in self.resc_key:
              for i, key in enumerate(entries):
                  if r in key:
                      entries[i] = key.replace(r, anon_name(r))
          if not is_list:
              # The wrapper list is ours, so store the result back
              scheduler.sched_config[k] = entries[0]
  def __str__(self):
      """
      Render the four global anonymization maps, one labelled map per
      line, in the order: attribute values, resource values, attribute
      keys, resource keys.
      """
      labelled_maps = (
          ("Attributes Values: ", self.gmap_attr_val),
          ("Resources Values: ", self.gmap_resc_val),
          ("Attributes Keys: ", self.gmap_attr_key),
          ("Resources Keys: ", self.gmap_resc_key),
      )
      rendered = [label + str(mapping) for label, mapping in labelled_maps]
      return "\n".join(rendered)