
Commit e2c208d

Uploading new version and adding pending files
1 parent 4e29e43 commit e2c208d

30 files changed: +7194 -301 lines

README.md

Lines changed: 9 additions & 8 deletions
@@ -17,14 +17,17 @@ Once the PIIs are identified, users have the opportunity to say what they would
 
 ### Files included
 
+#### Main files
+* app_frontend.py: App GUI script using tkinter.
 * PII_data_processor.py: App backend, it reads data files, identifies PIIs and creates new de-identified data files.
+* find_piis_in_unstructured_text.py: Script used by PII_data_processor to detect PIIs in unstructured text.
+
+#### Other utility files
 * restricted_words.py: Script to get restricted words for PII identification
-* app_frontend.py: App frontend, using python tkinter.
+* constant_strings.py: Declares strings used across the app.
+* query_google_answer_boxes.py: Script to query locations and populations
 * dist folder: Contains .exe file for execution
-
-In addition, a small app to find PIIs in unstructured text is offered
-* find_piis_in_unstructured_text_backend.py
-* find_piis_in_unstructured_text_frontend.py
+* hook-spacy.py: Dependency file needed when creating the .exe
 
 ### Help and Support
 
@@ -47,6 +50,4 @@ J-PAL: PII-Scan. 2017. https://github.com/J-PAL/PII-Scan
 The PII script is [MIT Licensed](https://github.com/PovertyAction/PII_detection/blob/master/LICENSE).
 
 ### To create .exe from source file
-`pyinstaller --onefile --windowed --icon=app.ico --add-data="app.ico;." --add-data="ipa_logo.jpg;." app_frontend.py`
-
-
+`pyinstaller --onefile --windowed --icon=app.ico --add-data="app.ico;." --add-data="ipa_logo.jpg;." --additional-hooks-dir=. --hiddenimport srsly.msgpack.util app_frontend.py`
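Note on the new build flags: --additional-hooks-dir=. tells PyInstaller to pick up the hook-spacy.py file listed above. The hook itself is not shown in this commit view; a minimal sketch of what such a hook typically contains, assuming it only needs PyInstaller's standard collect_all helper to bundle spaCy's data files and hidden imports, would be:

# hook-spacy.py (illustrative sketch, not necessarily the repository's actual file):
# bundle spaCy's data files, submodules and binaries so the frozen
# app_frontend.exe can load its language models.
from PyInstaller.utils.hooks import collect_all

datas, binaries, hiddenimports = collect_all("spacy")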

app_frontend.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 
 intro_text = "This script is meant to assist in the detection of PII (personally identifiable information) and subsequent removal from a dataset. This is an alpha program, not fully tested yet."
 intro_text_p2 = "You will first load a dataset that might contain PII variables. The system will try to identify the PII candidates. Please indicate if you would like to Drop, Encode or Keep them to then generate a new de-identified dataset."#, built without access to datasets containing PII on which to test or train it. Please help improve the program by filling out the survey on your experience using it (Help -> Provide Feedback)."
-app_title = "IPA's PII Detector - v0.2.11"
+app_title = "IPA's PII Detector - v0.2.12"
 
 window_width = 1086
 window_height = 666
Binary file not shown.

find_piis_in_unstructured_text.py

Lines changed: 251 additions & 0 deletions
@@ -0,0 +1,251 @@
from PII_data_processor import column_has_sufficiently_sparse_strings, clean_column, import_file, export
from constant_strings import *
import restricted_words as restricted_words_list
import query_google_answer_boxes as google
import requests
from secret_keys import get_forebears_api_key
import json
import re
from datetime import datetime
import spacy


def get_stopwords(languages=None):

    from os import listdir
    from os.path import isfile, join

    stopwords_path = './stopwords/'

    #If no language is selected, get all stopwords
    if languages is None:
        stopwords_files = [join(stopwords_path, f) for f in listdir(stopwords_path) if isfile(join(stopwords_path, f))]
    else: #Select only stopwords files for the given languages
        stopwords_files = [join(stopwords_path, language) for language in languages if isfile(join(stopwords_path, language))]

    stopwords_list = []
    for file_path in stopwords_files:
        with open(file_path, 'r', encoding="utf-8") as reader:
            stopwords = reader.read().split('\n')
            stopwords_list.extend(stopwords)

    return list(set(stopwords_list))


def remove_stopwords(strings_list, languages=['english', 'spanish']):
    stop_words = get_stopwords(languages)
    strings_list = [s for s in list(strings_list) if s not in stop_words]
    return strings_list


def find_phone_numbers_in_list_strings(list_strings):

    phone_n_regex_str = r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})"
    phone_n_regex = re.compile(phone_n_regex_str)
    phone_numbers_found = list(filter(phone_n_regex.match, list_strings))

    return phone_numbers_found


def generate_names_parameter_for_api(list_names, option):
    #Batch format according to https://forebears.io/onograph/documentation/api/location/batch

    list_of_names_json = []
    for name in list_names:
        list_of_names_json.append('{"name":"'+name+'","type":"'+option+'","limit":1}')

    names_parameter = '['+','.join(list_of_names_json)+']'
    return names_parameter


def get_names_from_json_response(response):

    names_found = []

    json_response = json.loads(response)
    for result in json_response["results"]:
        #Names that exist come with the field 'jurisdictions'
        if 'jurisdictions' in result:
            names_found.append(result['name'])
        # else:
        #     print(result['name']+" is not a name")

    return names_found


def filter_based_type_of_word(list_strings, language):

    if language == SPANISH:
        nlp = spacy.load("es_core_news_sm")
    else:
        nlp = spacy.load("en_core_web_sm")

    #Accepted types of words
    #Reference https://spacy.io/api/annotation#pos-tagging
    accepted_types = ['PROPN', 'X', 'PER', 'LOC', 'ORG', 'MISC', '']

    filtered_list = []
    for string in list_strings:
        doc = nlp(string)
        for token in doc:
            if token.pos_ in accepted_types:
                filtered_list.append(token.text)

    return list(set(filtered_list))


def find_names_in_list_string(list_potential_names):
    '''
    Uses https://forebears.io/onograph/documentation/api/location/batch to find names in list_potential_names

    If this approach turns out to be slow or inaccurate, an alternative is to use spacy:
        import spacy
        string = "my name is felipe"
        nlp = spacy.load("en_core_web_md")
        doc = nlp(string)
        for token in doc:
            if token.ent_type_ == 'PERSON':
                print(token.text + " is a name")
    '''
    API_KEY = get_forebears_api_key()

    all_names_found = set()

    #API calls must query at most 1,000 names, so split the list into chunks
    n = 1000
    list_of_list_1000_potential_names = [list_potential_names[i:i + n] for i in range(0, len(list_potential_names), n)]

    for list_1000_potential_names in list_of_list_1000_potential_names:
        #Two API calls are needed per chunk: one checking forenames and one checking surnames
        for forename_or_surname in ['forename', 'surname']:
            api_url = 'https://ono.4b.rs/v1/jurs?key='+API_KEY

            names_parameter = generate_names_parameter_for_api(list_1000_potential_names, forename_or_surname)

            response = requests.post(api_url, data={'names': names_parameter})

            names_found = get_names_from_json_response(response.text)
            for name in names_found:
                all_names_found.add(name)

    return list(all_names_found)


def get_list_unique_strings_in_dataset(dataset, columns_to_check):
    #To make the list, go over all columns that have sufficiently sparse strings
    set_string_in_dataset = set()

    #For every column in the dataset
    for column_name in columns_to_check:
        #If the column contains strings
        if column_has_sufficiently_sparse_strings(dataset, column_name):

            #Clean column
            column = clean_column(dataset[column_name])

            for row in column:
                #If the row contains more than one word, add each word
                if ' ' in row:
                    for word in row.split(" "):
                        set_string_in_dataset.add(word)
                #If the row does not contain spaces, add the whole row (it is only one string)
                else:
                    set_string_in_dataset.add(row)

    return list(set_string_in_dataset)


def find_piis(dataset, label_dict, columns_to_check_not_filtered, language):

    #Filter columns to those that have sparse entries
    columns_to_check = []
    for column_name in columns_to_check_not_filtered:
        if column_has_sufficiently_sparse_strings(dataset, column_name):
            columns_to_check.append(column_name)

    print("columns_to_check")
    print(columns_to_check)

    #Do not check surveyCTO columns
    #columns_to_check = [column for column in dataset.columns if column not in restricted_words_list.get_surveycto_restricted_vars()]

    #First, make a list of all strings that need to be checked
    print("->Getting list of unique strings in dataset...")
    strings_to_check = get_list_unique_strings_in_dataset(dataset, columns_to_check)

    #Remove strings with fewer than 3 characters - PIIs should be longer than that
    print("->Removing strings with less than 3 characters")
    strings_to_check = [s for s in strings_to_check if len(s) > 2]

    #Find all telephone numbers
    print("-->Finding phone numbers")
    phone_numbers_found = find_phone_numbers_in_list_strings(strings_to_check)
    print("found "+str(len(phone_numbers_found)))

    #Update strings_to_check
    strings_to_check = [s for s in strings_to_check if s not in phone_numbers_found]

    #Clean the list of words, now that the numbers have been found
    print("Length of list "+str(len(strings_to_check)))
    print("->Removing stopwords")
    strings_to_check = remove_stopwords(strings_to_check)
    print("->Filtering based on word type")
    strings_to_check = filter_based_type_of_word(strings_to_check, language)
    print("Length of list "+str(len(strings_to_check)))

    #Find all names
    print("->Finding names")
    names_found = find_names_in_list_string(strings_to_check)
    print("found "+str(len(names_found)))
    print(names_found)
    #Update strings_to_check
    strings_to_check = [s for s in strings_to_check if s not in names_found]

    #Find all locations with a population below 20,000
    print("-->Finding locations with low population")
    locations_with_low_population_found = google.get_locations_with_low_population(strings_to_check)
    print("found "+str(len(locations_with_low_population_found)))
    print(locations_with_low_population_found)

    return list(set(phone_numbers_found + names_found + locations_with_low_population_found))

    #Unreachable leftover from an earlier version, kept commented out for reference:
    # print("->Finding PIIs")
    # piis_found = find_piis_in_list_strings(filtered_strings_to_check)

    # #Replace found piis in the dataset
    # print("->Replacing PIIs in new dataset")
    # now = datetime.now()
    # current_time = now.strftime("%H:%M:%S")
    # print("Current Time =", current_time)
    # deidentified_dataset = dataset.replace(piis_found, 'XXXX', regex=True)

    # #Save new dataframe
    # print("->Exporting new dataset")
    # now = datetime.now()
    # current_time = now.strftime("%H:%M:%S")
    # print("Current Time =", current_time)
    # new_file_path = export(deidentified_dataset, dataset_path)

    # print("Task ready!")
    # return piis_found


if __name__ == "__main__":

    dataset_path = r'X:\Box Sync\GRDS_Resources\Data Science\Test data\Raw\RECOVR_MEX_r1_Raw.dta'

    reading_status, reading_content = import_file(dataset_path)

    #Stop if the file could not be imported
    if reading_status is False:
        print("Problem importing file")
        raise SystemExit(1)

    dataset = reading_content[DATASET]
    label_dict = reading_content[LABEL_DICT]

    columns_to_check = [c for c in dataset.columns if c not in restricted_words_list.get_surveycto_restricted_vars()]

    #find_piis requires a language argument; the test dataset is from Mexico, so use SPANISH
    find_piis(dataset, label_dict, columns_to_check, SPANISH)

# print(find_names_in_list_string(['Felipe','nombrequenoexiste', 'George', 'Felipe', 'Enriqueta', 'dededede']))
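For quick experimentation, a hypothetical usage sketch of find_piis on a small in-memory pandas DataFrame instead of the hard-coded .dta path in __main__. The toy data, the ENGLISH constant, and the column setup are assumptions for illustration only; a real run still needs the Forebears API key from secret_keys, the ./stopwords/ files, and the en_core_web_sm spaCy model installed.

# Hypothetical usage sketch, not part of this commit.
import pandas as pd
import restricted_words as restricted_words_list
from constant_strings import ENGLISH  # assumed constant, mirroring SPANISH
from find_piis_in_unstructured_text import find_piis

toy = pd.DataFrame({
    "comments": ["call me at 555 123 4567", "spoke with George near the village"],
    "deviceid": ["A1", "A2"],  # SurveyCTO-style column, expected to be excluded
})
columns_to_check = [c for c in toy.columns
                    if c not in restricted_words_list.get_surveycto_restricted_vars()]
piis = find_piis(toy, label_dict={}, columns_to_check_not_filtered=columns_to_check, language=ENGLISH)
print(piis)  # phone numbers, names and low-population locations found in the text columns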

0 commit comments

Comments
 (0)