from PII_data_processor import column_has_sufficiently_sparse_strings, clean_column, import_file, export
from constant_strings import *
import restricted_words as restricted_words_list
import query_google_answer_boxes as google
import requests
from secret_keys import get_forebears_api_key
import json
from datetime import datetime
import spacy

def get_stopwords(languages=None):
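    '''
    Loads stopword lists from the files under ./stopwords/.
    If no languages are given, every file in the folder is read; otherwise only
    the files named after the requested languages (e.g. 'english', 'spanish')
    are read. Returns a deduplicated list of stopwords.
    '''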

    from os import listdir
    from os.path import isfile, join

    stopwords_path = './stopwords/'

    #If no language selected, get all stopwords
    if(languages == None):
        stopwords_files = [join(stopwords_path, f) for f in listdir(stopwords_path) if isfile(join(stopwords_path, f))]
    else: #Select only stopwords files for given languages
        stopwords_files = [join(stopwords_path, language) for language in languages if isfile(join(stopwords_path, language))]

    stopwords_list = []
    for file_path in stopwords_files:
        with open(file_path, 'r', encoding="utf-8") as reader:
            stopwords = reader.read().split('\n')
            stopwords_list.extend(stopwords)

    return list(set(stopwords_list))

def remove_stopwords(strings_list, languages=['english','spanish']):
    stop_words = get_stopwords(languages)
    strings_list = [s for s in list(strings_list) if s not in stop_words]
    return strings_list

def find_phone_numbers_in_list_strings(list_strings):
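    '''
    Returns the strings that look like phone numbers. The regex below covers
    common US-style formats (e.g. 555-123-4567, (555) 123 4567, 555-1234) and,
    since re.match is used, only matches at the start of each string.
    '''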

    import re
    #Raw string so the backslashes in the pattern are not treated as escape sequences
    phone_n_regex_str = r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})"
    phone_n_regex = re.compile(phone_n_regex_str)
    phone_numbers_found = list(filter(phone_n_regex.match, list_strings))

    return phone_numbers_found


def generate_names_parameter_for_api(list_names, option):
    #According to https://forebears.io/onograph/documentation/api/location/batch
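    #Example: generate_names_parameter_for_api(['Maria'], 'forename') returns
    #'[{"name":"Maria","type":"forename","limit":1}]'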

    list_of_names_json=[]
    for name in list_names:
        list_of_names_json.append('{"name":"'+name+'","type":"'+option+'","limit":1}')

    names_parameter = '['+','.join(list_of_names_json)+']'
    return names_parameter

def get_names_from_json_response(response):
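    #The batch endpoint returns JSON of the form {"results": [...]}. Results that
    #matched a known name include a 'jurisdictions' field, which is what we check for.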

    names_found = []

    json_response = json.loads(response)
    for result in json_response["results"]:
        #Names that exist come with the field 'jurisdictions'
        if('jurisdictions' in result):
            names_found.append(result['name'])
        # else:
        #     print(result['name']+" is not a name")

    return names_found

def filter_based_type_of_word(list_strings, language):
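    '''
    Keeps only the tokens whose spaCy tag appears in the accepted_types list
    below; words of other types are unlikely to be PIIs. Uses the Spanish or
    English spaCy model depending on the language of the dataset.
    '''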

    if language == SPANISH:
        nlp = spacy.load("es_core_news_sm")
    else:
        nlp = spacy.load("en_core_web_sm")

    #Accepted types of words
    #Reference https://spacy.io/api/annotation#pos-tagging
    accepted_types = ['PROPN', 'X','PER','LOC','ORG','MISC','']

    filtered_list = []
    for string in list_strings:
        doc = nlp(string)
        for token in doc:
            if token.pos_ in accepted_types:
                filtered_list.append(token.text)

    return list(set(filtered_list))

def find_names_in_list_string(list_potential_names):
    '''
    Uses https://forebears.io/onograph/documentation/api/location/batch to find names in list_potential_names

    If this approach turns out to be slow or inaccurate, an alternative is to use spacy:
        import spacy
        string = "my name is felipe"
        nlp = spacy.load("en_core_web_md")
        doc = nlp(string)
        for token in doc:
            if (token.ent_type_ == 'PERSON'):
                print(token.text + " is a name")
    '''
    API_KEY = get_forebears_api_key()

    all_names_found = set()

    #API calls must query at most 1,000 names, so split the list into chunks of 1,000
    n = 1000
    list_of_list_1000_potential_names = [list_potential_names[i:i + n] for i in range(0, len(list_potential_names), n)]

    for list_1000_potential_names in list_of_list_1000_potential_names:
        #Need to make two API calls: one checking forenames and one checking surnames
        for forename_or_surname in ['forename', 'surname']:
            api_url = 'https://ono.4b.rs/v1/jurs?key='+API_KEY

            names_parameter = generate_names_parameter_for_api(list_1000_potential_names, forename_or_surname)

            response = requests.post(api_url, data={'names':names_parameter})

            names_found = get_names_from_json_response(response.text)
            for name in names_found:
                all_names_found.add(name)

    return list(all_names_found)

def get_list_unique_strings_in_dataset(dataset, columns_to_check):
    #To make the list, we go over all columns that have sparse strings
    set_string_in_dataset = set()

    #For every column in the dataset
    for column_name in columns_to_check:
        #If the column contains sufficiently sparse strings
        if(column_has_sufficiently_sparse_strings(dataset, column_name)):

            #Clean column
            column = clean_column(dataset[column_name])

            for row in column:
                #If the row contains more than one word, add each word
                if (' ' in row):
                    #For every word in the row
                    for word in row.split(" "):
                        #Add word to strings to check
                        set_string_in_dataset.add(word)
                #If the row does not contain spaces, add the whole row (it's only one string)
                else:
                    set_string_in_dataset.add(row)

    return list(set_string_in_dataset)

def find_piis(dataset, label_dict, columns_to_check_not_filtered, language):
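    '''
    Main PII detection pipeline. Keeps only the columns with sufficiently
    sparse strings, builds the list of unique strings in them, and then flags
    as PIIs: phone numbers (regex), person names (Forebears API) and locations
    with low population (Google answer boxes). Stopwords, very short strings
    and words of non-accepted types are removed before the name and location
    lookups. Returns the list of PII strings found.
    '''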

    #Filter columns to those that have sparse entries
    columns_to_check = []
    for column_name in columns_to_check_not_filtered:
        if column_has_sufficiently_sparse_strings(dataset, column_name):
            columns_to_check.append(column_name)

    print("columns_to_check")
    print(columns_to_check)

    #Do not check surveyCTO columns
    #columns_to_check = [column for column in dataset.columns if column not in restricted_words_list.get_surveycto_restricted_vars()]

    #First we make a list of all the strings that need to be checked
    print("->Getting list of unique strings in dataset...")
    strings_to_check = get_list_unique_strings_in_dataset(dataset, columns_to_check)

    #Remove strings with less than 3 chars - PIIs should be longer than that
    print("->Removing strings with less than 3 characters")
    strings_to_check = [s for s in strings_to_check if len(s)>2]

    #Find all telephone numbers
    print("->Finding phone numbers")
    phone_numbers_found = find_phone_numbers_in_list_strings(strings_to_check)
    print("found "+str(len(phone_numbers_found)))

    #Update strings_to_check
    strings_to_check = [s for s in strings_to_check if s not in phone_numbers_found]

    #Clean the list of words, now that we have already found the numbers
    print("Length of list "+str(len(strings_to_check)))
    print("->Removing stopwords")
    strings_to_check = remove_stopwords(strings_to_check)
    print("->Filtering based on word type")
    strings_to_check = filter_based_type_of_word(strings_to_check, language)
    print("Length of list "+str(len(strings_to_check)))

    #Find all names
    print("->Finding names")
    names_found = find_names_in_list_string(strings_to_check)
    print("found "+str(len(names_found)))
    print(names_found)
    #Update strings_to_check
    strings_to_check = [s for s in strings_to_check if s not in names_found]

    #Find all locations with population less than 20,000
    print("->Finding locations with low population")
    locations_with_low_population_found = google.get_locations_with_low_population(strings_to_check)
    print("found "+str(len(locations_with_low_population_found)))
    print(locations_with_low_population_found)

    piis_found = list(set(phone_numbers_found + names_found + locations_with_low_population_found))

    # #Replace the PIIs found in the dataset
    # print("->Replacing PIIs in new dataset")
    # now = datetime.now()
    # current_time = now.strftime("%H:%M:%S")
    # print("Current Time =", current_time)
    # deidentified_dataset = dataset.replace(piis_found, 'XXXX', regex=True)

    # #Save the new dataframe
    # print("->Exporting new dataset")
    # now = datetime.now()
    # current_time = now.strftime("%H:%M:%S")
    # print("Current Time =", current_time)
    # new_file_path = export(deidentified_dataset, dataset_path)

    print("Task ready!")

    return piis_found


if __name__ == "__main__":

    #Raw string so the backslashes in the Windows path are not treated as escapes
    dataset_path = r'X:\Box Sync\GRDS_Resources\Data Science\Test data\Raw\RECOVR_MEX_r1_Raw.dta'

    reading_status, reading_content = import_file(dataset_path)

    if(reading_status is False):
        print("Problem importing file")
        exit()

    dataset = reading_content[DATASET]
    label_dict = reading_content[LABEL_DICT]

    columns_to_check = [c for c in dataset.columns if c not in restricted_words_list.get_surveycto_restricted_vars()]

    #RECOVR_MEX is a Spanish-language dataset, so pass the Spanish language constant
    find_piis(dataset, label_dict, columns_to_check, SPANISH)

    # print(find_names_in_list_string(['Felipe','nombrequenoexiste', 'George', 'Felipe', 'Enriqueta', 'dededede']))