# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""
Utility functions for common text preprocessing tasks like converting to
lower case, removing stop words, convert to unicode, etc.
"""
import pandas as pd
import spacy
import nltk
from nltk.corpus import stopwords


def to_lowercase_all(df):
    """
    This function transforms all strings in the dataframe to lowercase.

    Args:
        df (pd.DataFrame): Raw dataframe with some text columns.

    Returns:
        pd.DataFrame: Dataframe with lowercase standardization.
    """
    return df.applymap(lambda s: s.lower() if type(s) == str else s)


def to_lowercase(df, column_names=[]):
    """
    This function transforms strings in the given columns of the dataframe to
    lowercase.

    Args:
        df (pd.DataFrame): Raw dataframe with some text columns.
        column_names (list, optional): Column names to be changed to lowercase.

    Returns:
        pd.DataFrame: Dataframe with columns with lowercase standardization.
    """
    if not column_names:
        return to_lowercase_all(df)
    else:
        df[column_names] = df[column_names].applymap(
            lambda s: s.lower() if type(s) == str else s
        )
        return df
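
# Illustrative usage (not part of the original module), assuming a hypothetical
# dataframe with text columns:
#
#   df = pd.DataFrame({"sentence1": ["Hello World"], "sentence2": ["FOO bar"]})
#   to_lowercase(df, ["sentence1"])   # lowercases only the "sentence1" column
#   to_lowercase(df)                  # falls back to to_lowercase_all and lowercases every string cell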


def to_spacy_tokens(
    df,
    sentence_cols=["sentence1", "sentence2"],
    token_cols=["sentence1_tokens", "sentence2_tokens"],
):
    """
    This function tokenizes the sentence pairs using spaCy, defaulting to the
    spaCy en_core_web_sm model.

    Args:
        df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
        sentence_cols (list, optional): Column names of the raw sentence pairs.
        token_cols (list, optional): Column names for the tokenized sentences.

    Returns:
        pd.DataFrame: Dataframe with new columns token_cols, each containing
            a list of tokens for their respective sentences.
    """
    nlp = spacy.load("en_core_web_sm")
    text_df = df[sentence_cols]
    nlp_df = text_df.applymap(lambda x: nlp(x))
    tok_df = nlp_df.applymap(lambda doc: [token.text for token in doc])
    tok_df.columns = token_cols
    tokenized = pd.concat([df, tok_df], axis=1)
    return tokenized


def rm_spacy_stopwords(
    df,
    sentence_cols=["sentence1", "sentence2"],
    stop_cols=["sentence1_tokens_rm_stopwords", "sentence2_tokens_rm_stopwords"],
    custom_stopwords=[],
):
    """
    This function tokenizes the sentence pairs using spaCy and removes
    stopwords, defaulting to the spaCy en_core_web_sm model.

    Args:
        df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
        sentence_cols (list, optional): Column names for the raw sentence
            pairs.
        stop_cols (list, optional): Column names for the tokenized sentences
            without stop words.
        custom_stopwords (list of str, optional): List of custom stopwords to
            register with the spaCy model.

    Returns:
        pd.DataFrame: Dataframe with new columns stop_cols, each containing a
            list of tokens for their respective sentences.
    """
    nlp = spacy.load("en_core_web_sm")
    if len(custom_stopwords) > 0:
        for csw in custom_stopwords:
            nlp.vocab[csw].is_stop = True
    text_df = df[sentence_cols]
    nlp_df = text_df.applymap(lambda x: nlp(x))
    stop_df = nlp_df.applymap(
        lambda doc: [token.text for token in doc if not token.is_stop]
    )
    stop_df.columns = stop_cols
    return pd.concat([df, stop_df], axis=1)
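
# Illustrative usage of the spaCy helpers (not part of the original module). It
# assumes the en_core_web_sm model has been installed, e.g. via
# `python -m spacy download en_core_web_sm`:
#
#   df = to_spacy_tokens(df)                               # adds "sentence1_tokens", "sentence2_tokens"
#   df = rm_spacy_stopwords(df, custom_stopwords=["foo"])  # adds "*_tokens_rm_stopwords", treating "foo" as a stop word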


def to_nltk_tokens(
    df,
    sentence_cols=["sentence1", "sentence2"],
    token_cols=["sentence1_tokens", "sentence2_tokens"],
):
    """
    This function converts a sentence to word tokens using nltk.

    Args:
        df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
        sentence_cols (list, optional): Column names for the raw sentences.
        token_cols (list, optional): Column names for the tokenized sentences.

    Returns:
        pd.DataFrame: Dataframe with new columns token_cols, each containing a
            list of tokens for their respective sentences.
    """
    # nltk.word_tokenize requires the "punkt" tokenizer models.
    nltk.download("punkt")
    text_df = df[sentence_cols]
    tok_df = text_df.applymap(lambda sentence: nltk.word_tokenize(sentence))
    tok_df.columns = token_cols
    tokenized = pd.concat([df, tok_df], axis=1)
    return tokenized


def rm_nltk_stopwords(
    df,
    sentence_cols=["sentence1", "sentence2"],
    stop_cols=["sentence1_tokens_rm_stopwords", "sentence2_tokens_rm_stopwords"],
):
    """
    This function removes stop words from a sentence using nltk.

    Args:
        df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
        sentence_cols (list, optional): Column names for the raw sentences.
        stop_cols (list, optional): Column names for the tokenized sentences
            without stop words.

    Returns:
        pd.DataFrame: Dataframe with new columns stop_cols, each containing a
            list of tokens for their respective sentences.
    """
    # Both the "punkt" tokenizer models and the stop word list are needed.
    nltk.download("punkt")
    nltk.download("stopwords")
    stop_words = tuple(stopwords.words("english"))
    text_df = df[sentence_cols]
    stop_df = text_df.applymap(lambda sentence: nltk.word_tokenize(sentence)).applymap(
        lambda l: [word for word in l if word not in stop_words]
    )
    stop_df.columns = stop_cols
    return pd.concat([df, stop_df], axis=1)
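
# Illustrative usage of the NLTK helpers (not part of the original module). The
# required "punkt" and "stopwords" resources are downloaded on first call:
#
#   df = to_nltk_tokens(df)     # adds "sentence1_tokens", "sentence2_tokens"
#   df = rm_nltk_stopwords(df)  # adds "sentence1_tokens_rm_stopwords", "sentence2_tokens_rm_stopwords"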


def convert_to_unicode(input_text, encoding="utf-8"):
    """Converts input_text to a Unicode string, decoding bytes with the given
    encoding (utf-8 by default)."""
    if isinstance(input_text, str):
        return input_text
    elif isinstance(input_text, bytes):
        return input_text.decode(encoding, "ignore")
    else:
        # Defensive guard: fail loudly instead of silently returning None.
        raise ValueError("Unsupported string type: %s" % type(input_text))
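

if __name__ == "__main__":
    # Minimal smoke-test sketch (an illustrative addition, not part of the
    # original module). It assumes pandas, spacy, and nltk are installed and
    # that the spaCy model is available via
    # `python -m spacy download en_core_web_sm`; NLTK data is fetched on demand.
    sample = pd.DataFrame(
        {
            "sentence1": ["The quick brown Fox jumps over the lazy dog."],
            "sentence2": ["A Stitch in Time saves nine."],
        }
    )
    sample = to_lowercase(sample)       # lowercase all string cells
    sample = to_spacy_tokens(sample)    # spaCy tokenization columns
    sample = rm_nltk_stopwords(sample)  # NLTK tokenization without stop words
    print(sample.T)
    print(convert_to_unicode(b"caf\xc3\xa9"))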