# coding: utf-8

# Copyright (C) 1994-2018 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.

import logging
import os
import copy
import shlex
import re

from ptl.lib.pbs_testlib import BatchUtils, PbsTypeFGCLimit
from ptl.lib.pbs_ifl_mock import *
from ptl.utils.pbs_dshutils import DshUtils

ANON_USER_K = "user"
ANON_GROUP_K = "group"
ANON_HOST_K = "host"
ANON_JOBNAME_K = ATTR_name
ANON_ACCTNAME_K = ATTR_A


class PBSAnonymizer(object):

    """
    Holds and controls anonymizing operations of PBS data

    The anonymizer operates on attributes or resources.
    Resources operate on the resource name itself rather than
    the entire name, for example, to obfuscate the values associated
    to a custom resource "foo" that could be set as
    resources_available.foo, resources_default.foo or
    Resource_List.foo, all that needs to be passed in to the function
    is "foo" in the list to obfuscate.

    :param attr_delete: Attributes to delete
    :type attr_delete: str or list or dictionary or None
    :param resc_delete: Resources to delete
    :type resc_delete: str or list or dictionary or None
    :param attr_key: Attributes for which the attribute names themselves
                     should be obfuscated
    :type attr_key: list or None
    :param attr_val: Attributes for which the values should be obfuscated
    :type attr_val: list or None
    :param resc_key: Resources for which the resource names themselves
                     should be obfuscated
    :type resc_key: list or None
    :param resc_val: Resources for which the values should be obfuscated
    :type resc_val: list or None
    """

    logger = logging.getLogger(__name__)
    utils = BatchUtils()
    du = DshUtils()

    # Dotted-quad matcher. The original pattern was a non-raw string in
    # which "\b" is a backspace character, so its "\b*" parts matched
    # nothing; the effective pattern is kept, written as a raw string.
    _ipv4_pattern = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

    def __init__(self, attr_delete=None, resc_delete=None,
                 attr_key=None, attr_val=None,
                 resc_key=None, resc_val=None):

        # special cases
        self._entity = False
        self.job_sort_formula = None
        self.schedselect = None
        self.select = None

        # Initialized here so that reading them before
        # anonymize_batch_status() / set_anon_map_file() have been called
        # does not raise AttributeError
        self.lmap = {}
        self.anon_map_file = None

        self.set_attr_delete(attr_delete)
        self.set_resc_delete(resc_delete)
        self.set_attr_key(attr_key)
        self.set_attr_val(attr_val)
        self.set_resc_key(resc_key)
        self.set_resc_val(resc_val)
        self.anonymize = self.anonymize_batch_status

        # global anonymized mapping data
        self.gmap_attr_val = {}
        self.gmap_resc_val = {}
        self.gmap_attr_key = {}
        self.gmap_resc_key = {}
        self.num_bad_acct_records = 0

    def __get_anon_key(self, key, attr_map):
        """
        Get an anonymized string for the 'key' belonging to attr_map

        The key is first normalized with __refactor_key so that aliases
        share one entry. A new random string of the same length as the
        normalized key is generated and stored when no entry exists yet.

        :param key: the key to anonymize
        :type key: String
        :param attr_map: the attr_map to which the key belongs
        :type attr_map: dict

        :returns: an anonymized string for the key
        """
        key = self.__refactor_key(key)
        if key not in attr_map:
            attr_map[key] = self.utils.random_str(len(key))
        return attr_map[key]

    @staticmethod
    def __refactor_key(key):
        """
        There are some attributes which are aliases of each other
        and others which are lists like user/group lists, lists of hosts etc.
        Set a common key for them.

        :param key: the attribute name to normalize
        :type key: String

        :returns: the common key, or the key unchanged if it has no alias
        """
        key_lower = key.lower()
        if "user" in key_lower or key == "requestor":
            key = ANON_USER_K
        elif "group" in key_lower:
            key = ANON_GROUP_K
        elif "host" in key_lower:
            key = ANON_HOST_K
        elif key == "Name" or key == "Jobname":
            key = ANON_JOBNAME_K
        elif key == "account":
            key = ANON_ACCTNAME_K

        return key

    def __get_anon_value(self, key, value, kv_map):
        """
        Get an anonymized string for the 'value' belonging to the kv_map
        provided.
        The kv_map will be in the following format:
            key:{val1:anon_val1, val2:anon_val2, ...}

        :param key: the key for this value
        :type key: String
        :param value: the value to anonymize
        :type value: String
        :param kv_map: the kv_map to which the key belongs
        :type kv_map: dict

        :returns: an anonymized string for the value
        """

        if key == "project" and value == "_pbs_project_default":
            return "_pbs_project_default"

        # Deal with attributes which have a list of values
        if key in (ATTR_u, ATTR_managers, ATTR_M, ATTR_g, ATTR_aclResvhost,
                   ATTR_aclhost, ATTR_auth_g, ATTR_auth_u):
            value_list = "".join(value.split()).split(",")
        elif key == ATTR_exechost:
            # keep only the part before "/" of each "+" separated chunk
            value_list = [item.split("/")[0] for item in value.split("+")]
        else:
            value_list = [value]

        key = self.__refactor_key(key)

        # Work out the anonymized form of each distinct item first ...
        replacements = {}
        for val in value_list:
            if not val or val in replacements:
                continue

            anon_val = None
            if "@" in val and ANON_HOST_K in self.attr_val:
                # value is of type "user@host"
                # anonymize the user and host parts separately
                parts = val.split("@")
                if len(parts) == 2:
                    host = self.__get_anon_value(ANON_HOST_K, parts[1],
                                                 self.gmap_attr_val)
                    user = self.__get_anon_value(ANON_USER_K, parts[0],
                                                 self.gmap_attr_val)
                    anon_val = user + "@" + host

            if anon_val is None:
                # Look the value up as-is. Values must not go through
                # __refactor_key: that would collapse every name containing
                # "user", "group" or "host" into a single map entry.
                value_map = kv_map.setdefault(key, {})
                anon_val = value_map.get(val)
                if anon_val is None:
                    anon_val = self.utils.random_str(len(val))
                    value_map[val] = anon_val

            replacements[val] = anon_val

        if not replacements:
            return value

        # ... then substitute them in a single pass (longest item first) so
        # that text which has already been anonymized is never rewritten.
        ordered = sorted(replacements, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(v) for v in ordered))
        return pattern.sub(lambda mo: replacements[mo.group(0)], value)

    def _initialize_map(self, keys, make_default):
        """
        Build a map from the keys given, each key being associated to a
        fresh default produced by make_default

        :param keys: the keys of the map
        :type keys: str (comma separated) or list or dictionary or None
        :param make_default: callable returning the default for a key

        :returns: a dictionary; a dictionary passed in is returned as-is
        """
        if keys is None:
            return {}
        if isinstance(keys, dict):
            return keys
        if isinstance(keys, str):
            keys = keys.split(",")
        elif not isinstance(keys, list):
            self.logger.error("unhandled map type")
            return {None: None}
        return dict((i, make_default()) for i in keys)

    def _initialize_key_map(self, keys):
        """
        Build a map of key -> None from the keys given
        """
        return self._initialize_map(keys, lambda: None)

    def _initialize_value_map(self, keys):
        """
        Build a map of key -> empty dictionary from the keys given
        """
        return self._initialize_map(keys, dict)

    def _get_anon_resc_name(self, name):
        """
        Get the anonymized name of a resource and record it in the global
        resource key map so that every output agrees on it.

        The name configured in resc_key is preferred, then the one already
        present in the global map, else a random string of the same length
        is generated.

        :param name: the resource name
        :type name: str

        :returns: the anonymized resource name
        """
        anon_name = self.resc_key.get(name)
        if anon_name is None:
            anon_name = self.gmap_resc_key.get(name)
        if anon_name is None:
            anon_name = self.utils.random_str(len(name))
        self.gmap_resc_key.setdefault(name, anon_name)
        return anon_name

    def set_attr_delete(self, ad):
        """
        Name of attributes to delete

        :param ad: Attributes to delete
        :type ad: str or list or dictionary
        """
        self.attr_delete = self._initialize_value_map(ad)

    def set_resc_delete(self, rd):
        """
        Name of resources to delete

        :param rd: Resources to delete
        :type rd: str or list or dictionary
        """
        self.resc_delete = self._initialize_value_map(rd)

    def set_attr_key(self, ak):
        """
        Name of attributes to obfuscate.

        :param ak: Attribute keys
        :type ak: str or list or dictionary
        """
        self.attr_key = self._initialize_key_map(ak)

    def set_attr_val(self, av):
        """
        Name of attributes for which to obfuscate the value

        :param av: Attributes value to obfuscate
        :type av: str or list or dictionary
        """
        self.attr_val = self._initialize_value_map(av)
        # ("euser" or "egroup" or "project") evaluates to "euser" only;
        # each entity attribute has to be tested on its own
        if any(k in self.attr_val for k in ("euser", "egroup", "project")):
            self._entity = True

    def set_resc_key(self, rk):
        """
        Name of resources to obfuscate

        :param rk: Resource key
        :type rk: str or list or dictionary
        """
        self.resc_key = self._initialize_key_map(rk)

    def set_resc_val(self, rv):
        """
        Name of resources for which to obfuscate the value

        :param rv: Resource value to obfuscate
        :type rv: str or list or dictionary
        """
        self.resc_val = self._initialize_value_map(rv)

    def set_anon_map_file(self, name):
        """
        Name of file in which to store anonymized map data.
        This file is meant to remain private to a site as it
        contains the sensitive anonymized data.

        :param name: Name of file to which anonymized data to store.
        :type name: str
        """
        self.anon_map_file = name

    def _anon_entity(self, attr, map_key, name):
        """
        Anonymize an entity name if 'attr' is one of the attributes whose
        values are to be obfuscated, reusing the global mapping stored
        under 'map_key'

        :param attr: attribute controlling the anonymization ("euser", ...)
        :type attr: str
        :param map_key: key of the entity map inside gmap_attr_val
        :type map_key: str
        :param name: the entity name, may be None
        :type name: str or None

        :returns: the anonymized name, or name unchanged
        """
        if name is None or attr not in self.attr_val:
            return name
        entity_map = self.gmap_attr_val.setdefault(map_key, {})
        if name not in entity_map:
            entity_map[name] = self.utils.random_str(len(name))
        return entity_map[name]

    def anonymize_resource_group(self, filename):
        """
        Anonymize the user and group fields of a resource
        group filename

        :param filename: Resource group filename
        :type filename: str

        :returns: list of anonymized lines (comments and blank lines are
                  dropped), or None if the file could not be read
        """
        anon_rg = []

        try:
            with open(filename) as f:
                lines = f.readlines()
        except IOError:
            self.logger.error("Error processing " + filename)
            return None

        for data in lines:
            data = data.strip()
            if not data or data[0] == "#":
                continue

            _d = data.split()
            ug = _d[0]
            if ":" in ug:
                # split once only, a second ":" must not abort the run
                (euser, egroup) = ug.split(":", 1)
            else:
                euser = ug
                egroup = None

            anon_euser = self._anon_entity("euser", ANON_USER_K, euser)
            anon_egroup = self._anon_entity("egroup", ANON_GROUP_K, egroup)

            # reconstruct the fairshare info by combining euser and egroup
            out = [anon_euser]
            if anon_egroup is not None:
                out[0] += ":" + anon_egroup
            # and appending the rest of the original line
            if len(_d) > 1:
                out.append(_d[1])
            if len(_d) > 2:
                # the third field is replaced if it is a known entity
                p = _d[2].strip()
                if (ANON_USER_K in self.gmap_attr_val and
                        p in self.gmap_attr_val[ANON_USER_K]):
                    out.append(self.gmap_attr_val[ANON_USER_K][p])
                else:
                    out.append(_d[2])
            out += _d[3:]
            anon_rg.append(" ".join(out))

        return anon_rg

    def anonymize_resource_def(self, resources):
        """
        Anonymize the resource definition

        :param resources: map of resource name to resource object; each
                          object is expected to provide set_name()
        :type resources: dict

        :returns: the same dictionary, keyed by the anonymized names
        """
        if not self.resc_key:
            return resources

        for curr_anon_resc in self.resc_key:
            if curr_anon_resc in resources:
                tmp_resc = copy.copy(resources[curr_anon_resc])
                del resources[curr_anon_resc]
                # also records a freshly generated name in the global map,
                # so that other outputs use the same one
                val = self._get_anon_resc_name(curr_anon_resc)
                tmp_resc.set_name(val)
                resources[val] = tmp_resc
        return resources

    def __anonymize_fgc(self, d, attr, ar, val):
        """
        Anonymize an FGC limit value

        :param d: the batch status entry being anonymized
        :type d: dict
        :param attr: name of the limit attribute
        :type attr: str
        :param ar: map of attribute name to {value: anonymized value}
        :type ar: dict
        :param val: the limit value, possibly a comma separated list
        :type val: str
        """

        m = {"u": "euser", "g": "egroup", "p": "project"}

        nfgc = []
        for lim in val.split(","):
            _fgc = PbsTypeFGCLimit(attr, lim)
            ename = _fgc.entity_name
            if ename in ("PBS_GENERIC", "PBS_ALL"):
                nfgc.append(lim)
                continue

            obf_ename = ename
            nm = m.get(_fgc.entity_type)
            if nm is not None:
                entity_map = self.gmap_attr_val.setdefault(nm, {})
                obf_ename = entity_map.get(ename)
                if obf_ename is None:
                    # Not seen yet: also covers the case of an existing
                    # map without this entity, which used to leave the
                    # real name in place
                    known = ar.get(nm)
                    if isinstance(known, dict) and ename in known:
                        obf_ename = known[ename]
                    else:
                        obf_ename = self.utils.random_str(len(ename))
                    entity_map[ename] = obf_ename

            _fgc.entity_name = obf_ename
            nfgc.append(_fgc.__val__())

        d[attr] = ",".join(nfgc)

    def __anonymize_attr_val(self, d, attr, ar, name, val):
        """
        Obfuscate an attribute/resource values

        :param d: the batch status entry being anonymized, or None
        :type d: dict or None
        :param attr: full attribute name, "<type>.<resource>" for resources
        :type attr: str
        :param ar: map of name to {value: anonymized value}
        :type ar: dict
        :param name: attribute name, or resource name for a resource
        :type name: str
        :param val: the value, possibly a comma separated list
        :type val: str
        """

        # don't obfuscate default project
        if attr == "project" and val == "_pbs_project_default":
            return

        if "." in attr:
            m = self.gmap_resc_val
        else:
            m = self.gmap_attr_val

        known = ar[name]
        if not isinstance(known, dict):
            known = ar[name] = {}
        global_map = m.setdefault(name, {})
        local_map = self.lmap.get(name)
        if not isinstance(local_map, dict):
            local_map = self.lmap[name] = {}

        # A value known as a whole is used as such, else each member of the
        # comma separated list is obfuscated on its own
        if val in known:
            members = [val]
        else:
            members = val.split(",")

        nstr = []
        for v in members:
            if v in known:
                r = known[v]
            elif v in global_map:
                r = global_map[v]
            else:
                # Obfuscate by randomizing with a value of the same length
                r = self.utils.random_str(len(v))
                known[v] = r
            # add to, rather than overwrite, the mappings of this name
            local_map[v] = r
            global_map.setdefault(v, r)
            nstr.append(r)

        if d is not None:
            d[attr] = ",".join(nstr)

    def __anonymize_attr_key(self, d, attr, ar, name, res):
        """
        Obfuscate an attribute/resource key

        :param d: the batch status entry being anonymized, or None
        :type d: dict or None
        :param attr: full attribute name as found in d
        :type attr: str
        :param ar: map of name to anonymized name (or None if not set yet)
        :type ar: dict
        :param name: attribute name, or resource name for a resource
        :type name: str
        :param res: resource type prefix, None for a plain attribute
        :type res: str or None
        """

        if res is not None:
            m = self.gmap_resc_key
        else:
            m = self.gmap_attr_key

        if not ar[name]:
            if name in m:
                ar[name] = m[name]
            else:
                randstr = self.utils.random_str(len(name))
                ar[name] = randstr
                m[name] = randstr

        if d is not None:
            # move the value under the anonymized key
            tmp_val = d[attr]
            del d[attr]

            if res is not None:
                d[res + "." + ar[name]] = tmp_val
            else:
                d[ar[name]] = tmp_val

        if name not in self.lmap:
            self.lmap[name] = ar[name]

        if name not in m:
            m[name] = ar[name]

    def anonymize_batch_status(self, data=None):
        """
        Anonymize arbitrary batch_status data, in place

        :param data: Batch status data
        :type data: List or dictionary
        """
        if not isinstance(data, (list, dict)):
            self.logger.error("data expected to be dict or list")
            return None

        if isinstance(data, dict):
            dat = [data]
        else:
            dat = data

        # Local mapping data used to store obfuscation mapping data for this
        # specific item, d
        self.lmap = {}

        # loop over each "batch_status" entry to obfuscate
        for d in dat:

            if self.attr_delete is not None:
                for todel in self.attr_delete:
                    if todel in d:
                        del d[todel]

            if self.resc_delete is not None:
                for todel in self.resc_delete:
                    # iterate over a snapshot, entries are deleted from d
                    for tmpk in list(d):
                        if "." in tmpk and todel == tmpk.split(".")[1]:
                            del d[tmpk]

            # Loop over each object's attributes, this is where the special
            # cases are handled (e.g., FGC limits, formula, select spec...)
            # A snapshot of the keys is used because keys get renamed.
            for attr in list(d):
                val = d[attr]

                if "." in attr:
                    (res_type, res_name) = attr.split(".", 1)
                else:
                    res_type = None
                    res_name = attr

                is_limit = (attr.startswith("max_run") or
                            attr.startswith("max_queued"))

                if res_type is not None:
                    if self._entity and is_limit:
                        self.__anonymize_fgc(d, attr, self.attr_val, val)

                    if res_name in self.resc_val:
                        if is_limit:
                            self.__anonymize_fgc(d, attr, self.attr_val, val)
                        self.__anonymize_attr_val(d, attr, self.resc_val,
                                                  res_name, val)

                    if res_name in self.resc_key:
                        self.__anonymize_attr_key(d, attr, self.resc_key,
                                                  res_name, res_type)
                else:
                    if attr in self.attr_val:
                        self.__anonymize_attr_val(d, attr, self.attr_val,
                                                  attr, val)

                    if attr in self.attr_key:
                        self.__anonymize_attr_key(d, attr, self.attr_key,
                                                  attr, None)

                    if ((attr in ("job_sort_formula", "schedselect",
                                  "select")) and self.resc_key):
                        # resource names may appear inside these values
                        for r in self.resc_key:
                            if r in val:
                                val = val.replace(
                                    r, self._get_anon_resc_name(r))
                        setattr(self, attr, val)
                        d[attr] = val

    @staticmethod
    def __verify_key(line, key):
        """
        Verify that a given key is actually a key in the context of the line
        given.

        An occurrence of the key is accepted when it is not surrounded by
        alphanumeric characters and, once whitespace is removed from the
        line, it is followed by one of the operators "=", "+=", "-=" and
        by at least one more character.

        :param line: the line to check in
        :type line: String
        :param key: the key to find
        :type key: String

        :returns a tuple of (key index, 1st character of key's value)
        :returns None if the key is invalid
        """
        if not key:
            # an empty key would match everywhere, and never advance
            return None

        line_len = len(line)
        key_len = len(key)
        line_nospaces = "".join(line.split())
        len_nospaces = len(line_nospaces)
        key_index = line.find(key)
        key_idx_nospaces = line_nospaces.find(key)

        # Go through all instances of the string representing key in the
        # line until one which is a valid key is found
        while key_index >= 0 and key_idx_nospaces >= 0:
            char_before = line[key_index - 1] if key_index > 0 else None
            index_after = key_index + key_len
            # the key may be the very last thing on the line
            char_after = line[index_after] if index_after < line_len else None

            # Make sure that the characters before & after are not alphanum
            valid_key = not (char_before is not None and
                             char_before.isalnum())
            if valid_key and char_after is not None and char_after.isalnum():
                valid_key = False

            # if 'char_after' is not "=", then the characters before
            # and after should be the delimiter, and be equal
            if (valid_key and char_before is not None and
                    char_after is not None and char_after != "=" and
                    char_before != char_after):
                valid_key = False

            val_idx_nospaces = None
            if valid_key:
                # Now, let's look at the whitespace stripped line
                # Find a valid operator after the key: =, +=, -=, ==
                op_index = key_idx_nospaces + key_len
                if (op_index < len_nospaces and
                        line_nospaces[op_index] in ("+", "-")):
                    op_index += 1
                if (op_index >= len_nospaces or
                        line_nospaces[op_index] != "="):
                    valid_key = False
                else:
                    val_idx_nospaces = op_index + 1
                    if val_idx_nospaces >= len_nospaces:
                        # There's no value!, can't be a valid key
                        valid_key = False

            if valid_key:
                # Seems like a valid key!
                return (key_index, line_nospaces[val_idx_nospaces])

            # Find the next instance of the key
            key_index = line.find(key, key_index + key_len)
            key_idx_nospaces = line_nospaces.find(key,
                                                  key_idx_nospaces + key_len)

        return None

    def __get_value(self, line, key):
        """
        Get the 'value' of a kv pair for the key given, from the line given

        :param line: the line to search in
        :type line: String
        :param key: the key for the value
        :type key: String

        :returns: String containing the value or None
        """
        # Check if the line is of type:
        #     <key> = <value>
        words = line.split()
        if len(words) >= 2 and words[0] == key and words[1] == "=":
            # Ok, we are going to assume that this is enough to
            # determine that this is the correct type
            # return everything after the '=' as value
            return line[line.index("=") + 1:].strip()

        # Check that a valid instance of this key exists in the string
        kv = self.__verify_key(line, key)
        if kv is None:
            return None
        key_index, val_char = kv

        # Assumption: the character before the key is the delimiter
        # for the k-v pair
        if key_index > 0:
            delimiter = line[key_index - 1]
        else:
            # Hard luck, now there's no way to know, let's just assume
            # that space is the delimiter and hope for the best
            delimiter = " "

        # Determine the value's start index
        index_after_key = key_index + len(key)
        value_index = line[index_after_key:].find(val_char) + index_after_key

        # Get the value
        lexer = shlex.shlex(line[value_index:], posix=True)
        lexer.whitespace = delimiter
        lexer.whitespace_split = True
        try:
            value = lexer.get_token()
        except ValueError:
            # Sometimes, the data can be incoherent with things like
            # Unclosed quotes, which makes get_token() throw an exception
            # Just return None
            return None

        if value is None:
            # the lexer found nothing to return
            return None

        # Strip the value of any trailing whitespaces (like newlines)
        return value.rstrip()

    @staticmethod
    def __delete_kv(line, key, value):
        """
        Delete a key-value pair from a line
        If after deleting the k-v pair, the left over string has
        no alphanumeric characters, then delete the line

        :param line: the line in question
        :type line: String
        :param key: the key of the kv pair
        :type key: String
        :param value: the value of the kv pair
        :type value: String

        :returns: the line without the kv pair
        :returns: None if the line should be deleted
        """
        key_index = line.find(key)
        index_after_key = key_index + len(key)
        value_index = line[index_after_key:].find(value) + index_after_key

        # find the index of the last character of value
        end_index = value_index + len(value)

        # Find the start index of the kv pair
        # Also include the character before the key
        # This will remove an extra delimiter that would be
        # left after the kv pair is deleted
        start_index = max(key_index - 1, 0)

        # Remove the kv pair
        line = line[:start_index] + line[end_index:]

        # Check if there's any alphanumeric characters left in the line
        if re.search("[A-Za-z0-9]", line) is None:
            # Delete the whole line
            return None

        return line

    def __add_alias_attr(self, key, alias_key):
        """
        Some attributes have aliases. Add the alias of a given attribute
        to each of the maps in which the attribute is present

        :param key: the original attribute
        :type key: str
        :param alias_key: the alias
        :type alias_key: str
        """
        for attr_map in (self.attr_delete, self.attr_key, self.attr_val,
                         self.resc_delete, self.resc_key, self.resc_val):
            if key in attr_map:
                attr_map[alias_key] = attr_map[key]

    def _anonymize_table_row(self, line, spans, del_columns, anon_columns):
        """
        Anonymize/blank out the columns of one row of a tabular output

        :param line: the row
        :type line: str
        :param spans: (start, end) offsets of each column
        :type spans: list of tuples
        :param del_columns: indices of the columns to blank out
        :type del_columns: set
        :param anon_columns: column index -> title of columns to anonymize
        :type anon_columns: dict

        :returns: the processed row
        """
        pieces = []
        cursor = 0
        for col_index, (start, end) in enumerate(spans):
            pieces.append(line[cursor:start])
            field = line[start:end]
            if col_index in del_columns:
                # Just replace the value by blank spaces, but keep the
                # end of line if the row is shorter than the column
                text = field.rstrip("\r\n")
                field = " " * len(text) + field[len(text):]
            elif col_index in anon_columns:
                value_strip = field.strip()
                if value_strip:
                    anon_val = self.__get_anon_value(
                        anon_columns[col_index], value_strip,
                        self.gmap_attr_val)
                    # only this column is touched, not the whole row
                    field = field.replace(value_strip, anon_val)
            pieces.append(field)
            cursor = end
        pieces.append(line[cursor:])
        return "".join(pieces)

    def anonymize_file_tabular(self, filename, extension=".anon",
                               inplace=False):
        """
        Anonymize pbs short format outputs (tabular form)
        (e.g - qstat, pbsnodes -aS)
        The 'titles' of various columns are used to look up keys inside the
        global attribute maps and they are anonymized/removed accordingly.

        Warning: only works with PBS tabular outputs, not generic.

        :param filename: Name of the file to anonymize
        :type filename: str
        :param extension: Extension of the anonymized file
        :type extension: str
        :param inplace: If true returns the original file name for
                        which contents have been replaced
        :type inplace: bool

        :returns: a str object containing filename of the anonymized file;
                  the original filename if no table header was found
        """
        fn = self.du.create_temp_file()

        # qstat outputs sometimes have different names for some attributes
        self.__add_alias_attr(ATTR_euser, "User")
        self.__add_alias_attr(ATTR_euser, "Username")
        self.__add_alias_attr(ATTR_name, "Jobname")
        self.__add_alias_attr(ATTR_name, "Name")

        # pbsnodes -aS output has a 'host' field which should be anonymized
        self.__add_alias_attr(ATTR_NODE_Host, "host")

        header = None
        dash_line = None
        with open(filename) as f, open(fn, "w") as nf:
            # Get the header and the line with '-'s
            # Also write out the header and dash lines to the output file
            for line in f:
                nf.write(line)
                line_strip = line.strip()

                if len(line_strip) == 0:
                    continue

                if line_strip[0].isalpha():
                    header = line
                    continue
                # Dash line is the line after header
                if header is not None:
                    dash_line = line
                    break

            if dash_line is not None:
                # The dash line tells us the position and length of each
                # column, whatever the spacing between the columns is
                spans = [(mo.start(), mo.end())
                         for mo in re.finditer(r"\S+", dash_line)]

                # Find out the columns to anonymize/delete
                del_columns = set()
                anon_columns = {}
                for col_index, (start, end) in enumerate(spans):
                    # Get the column's title
                    title = header[start:end].strip()
                    if title in self.attr_delete:
                        # Need to delete this whole column
                        del_columns.add(col_index)
                    elif title in self.attr_val:
                        # Need to anonymize all values in the column
                        anon_columns[col_index] = title

                # Go through the rest of the file and anonymize/delete
                # columns
                for line in f:
                    nf.write(self._anonymize_table_row(
                        line, spans, del_columns, anon_columns))

        if dash_line is None:
            # Couldn't find the header and its dash line, there is nothing
            # to anonymize: don't leave the temporary file behind
            try:
                os.remove(fn)
            except OSError:
                pass
            return filename

        if inplace:
            out_filename = filename
        else:
            out_filename = filename + extension

        os.rename(fn, out_filename)

        return out_filename

    def _anonymize_ips(self, line):
        """
        Replace every IP address found in the line by its anonymized form

        :param line: the line to process
        :type line: str

        :returns: the line with the IP addresses replaced
        """
        return self._ipv4_pattern.sub(
            lambda mo: self.__get_anon_key(mo.group(0), self.gmap_attr_key),
            line)

    def anonymize_file_kv(self, filename, extension=".anon", inplace=False):
        """
        Anonymize a file which has data in the form of key-value pairs.
        Replace every occurrence of any entry in the global
        map for the given file by its anonymized values.

        :param filename: Name of the file to anonymize
        :type filename: str
        :param extension: Extension of the anonymized file
        :type extension: str
        :param inplace: If true returns the original file name for
                        which contents have been replaced
        :type inplace: bool

        :returns: a str object containing filename of the anonymized file
        """
        fn = self.du.create_temp_file()

        with open(filename) as f, open(fn, "w") as nf:
            delete_line = False
            for line in f:
                # Check if this is a line extension for an attr being deleted
                if delete_line is True and line[0] == "\t":
                    continue

                delete_line = False

                # Check if any of the attributes to delete are in the line
                for key in self.attr_delete:
                    if key in line:
                        value = self.__get_value(line, key)
                        if value is None:
                            continue
                        # Delete the key-value pair
                        line = self.__delete_kv(line, key, value)
                        if line is None:
                            delete_line = True
                            break

                if delete_line is True:
                    continue

                # Anonymize key-value pairs
                for keys, gmap in ((self.attr_key, self.gmap_attr_key),
                                   (self.resc_key, self.gmap_resc_key)):
                    for key in keys:
                        if key in line:
                            if self.__verify_key(line, key) is None:
                                continue
                            anon_key = self.__get_anon_key(key, gmap)
                            line = line.replace(key, anon_key)

                for keys, gmap in ((self.attr_val, self.gmap_attr_val),
                                   (self.resc_val, self.gmap_resc_val)):
                    for key in keys:
                        if key in line:
                            value = self.__get_value(line, key)
                            if not value:
                                # no value found; an empty one must not be
                                # handed to str.replace() either
                                continue
                            anon_value = self.__get_anon_value(key, value,
                                                               gmap)
                            line = line.replace(value, anon_value)

                # Anonymize IP addresses
                line = self._anonymize_ips(line)

                nf.write(line)

        if inplace:
            out_filename = filename
        else:
            out_filename = filename + extension

        os.rename(fn, out_filename)

        return out_filename

    def _note_bad_acct_record(self, data):
        """
        Account for an accounting record which could not be parsed

        :param data: the record
        :type data: str
        """
        self.num_bad_acct_records += 1
        self.logger.debug("Bad accounting record found:\n" + data)

    def anonymize_accounting_log(self, logfile):
        """
        Anonymize the accounting log

        :param logfile: Accounting log file
        :type logfile: str

        :returns: list of anonymized records, or None if the file could not
                  be opened. Records which can not be parsed are counted in
                  num_bad_acct_records and left out.
        """
        try:
            f = open(logfile)
        except IOError:
            self.logger.error("Error processing " + logfile)
            return None

        self.__add_alias_attr(ATTR_euser, "user")
        self.__add_alias_attr(ATTR_euser, "requestor")
        self.__add_alias_attr(ATTR_egroup, "group")
        self.__add_alias_attr(ATTR_A, "account")

        anon_data = []
        with f:
            for data in f:
                # accounting log format is
                # %Y/%m/%d %H:%M:%S;<type>;<id>;<key1=val1> <key2=val2> ...
                curr = data.split(";", 3)
                if len(curr) > 1 and curr[1] in ("A", "L"):
                    anon_data.append(data.strip())
                    continue

                if len(curr) < 4:
                    if data.strip():
                        self._note_bad_acct_record(data)
                    continue

                try:
                    buf = shlex.split(curr[3].strip())
                except ValueError:
                    # e.g. unclosed quotes
                    self._note_bad_acct_record(data)
                    continue

                # Split the attribute list into key value pairs. This has
                # to be a list: it is gone through once to anonymize and
                # once more to rebuild the record.
                kvl_list = [n.split("=", 1) for n in buf]
                if any(len(kvl) != 2 for kvl in kvl_list):
                    self._note_bad_acct_record(data)
                    continue

                for kvl in kvl_list:
                    k, v = kvl

                    if k in self.attr_val:
                        kvl[1] = self.__get_anon_value(k, v,
                                                       self.gmap_attr_val)

                    if k in self.attr_key:
                        kvl[0] = self.__get_anon_key(k, self.gmap_attr_key)

                    if "." in k:
                        restype, resname = k.split(".", 1)
                        if resname in self.resc_val:
                            # anonymize the value, not the resource name
                            kvl[1] = self.__get_anon_value(
                                resname, v, self.gmap_resc_val)

                        if resname in self.resc_key:
                            anon_rk = self.__get_anon_key(resname,
                                                          self.gmap_resc_key)
                            kvl[0] = restype + "." + anon_rk

                anon_data.append(";".join(curr[:3]) + ";" +
                                 " ".join("=".join(n) for n in kvl_list))

        return anon_data

    def anonymize_sched_config(self, scheduler):
        """
        Anonymize the scheduler config

        :param scheduler: PBS scheduler object
        """
        if len(self.resc_key) == 0:
            return

        # when anonymizing we get rid of the comments as they may contain
        # sensitive information
        scheduler._sched_config_comments = {}

        # If resources need to be anonymized then update the resources line
        # job_sort_key and node_sort_key
        sr = scheduler.get_resources()
        if sr:
            for i, sres in enumerate(sr):
                if sres in self.resc_key:
                    sr[i] = self._get_anon_resc_name(sres)

            scheduler.sched_config["resources"] = ",".join(sr)

        for k in ["job_sort_key", "node_sort_key"]:
            if k in scheduler.sched_config:
                sc_jsk = scheduler.sched_config[k]
                single = isinstance(sc_jsk, str)
                if single:
                    # a lone string is one entry, not a list of characters
                    entries = [sc_jsk]
                elif isinstance(sc_jsk, list):
                    entries = sc_jsk
                else:
                    entries = list(sc_jsk)

                for r in self.resc_key:
                    for i, key in enumerate(entries):
                        if r in key:
                            entries[i] = key.replace(
                                r, self._get_anon_resc_name(r))

                # store the result back, in the form it was found
                if single:
                    scheduler.sched_config[k] = entries[0]
                else:
                    scheduler.sched_config[k] = entries

    def __str__(self):
        return ("Attributes Values: " + str(self.gmap_attr_val) + "\n" +
                "Resources Values: " + str(self.gmap_resc_val) + "\n" +
                "Attributes Keys: " + str(self.gmap_attr_key) + "\n" +
                "Resources Keys: " + str(self.gmap_resc_key))