20
20
LABEL_COL = "score"
21
21
22
22
23
- def load_pandas_df (
24
- local_cache_path = None , file_split = Split .TRAIN , file_type = "txt" , nrows = None
25
- ):
23
+ def load_pandas_df (local_cache_path = None , file_split = Split .TRAIN , file_type = "txt" , nrows = None ):
26
24
"""
27
25
Loads the SNLI dataset as pd.DataFrame
28
26
Download the dataset from "https://nlp.stanford.edu/projects/snli/snli_1.0.zip", unzip, and load
29
27
30
28
Args:
31
29
local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
32
- If None, all the intermediate files will be stored in a temporary directory and removed
33
- after use.
30
+ If None, all the intermediate files will be stored in a temporary directory and removed
31
+ after use.
34
32
file_split (str): File split to load, defaults to "train"
35
33
file_type (str): File type to load, defaults to "txt"
36
34
nrows (int): Number of rows to load, defaults to None (in which all rows will be returned)
@@ -78,12 +76,8 @@ def _maybe_download_and_extract(zip_path, file_split, file_type):
78
76
extract_path = os .path .join (dir_path , file_name )
79
77
80
78
if not os .path .exists (extract_path ):
81
- dpath = download_snli (zip_path )
82
- extract_snli (
83
- zip_path ,
84
- source_path = SNLI_DIRNAME + "/" + file_name ,
85
- dest_path = extract_path ,
86
- )
79
+ _ = download_snli (zip_path )
80
+ extract_snli (zip_path , source_path = SNLI_DIRNAME + "/" + file_name , dest_path = extract_path )
87
81
88
82
return extract_path
89
83
@@ -143,24 +137,20 @@ def clean_cols(df):
143
137
)
144
138
145
139
snli_df = snli_df .rename (
146
- columns = {
147
- "sentence1" : S1_COL ,
148
- "sentence2" : S2_COL ,
149
- "gold_label" : LABEL_COL ,
150
- }
140
+ columns = {"sentence1" : S1_COL , "sentence2" : S2_COL , "gold_label" : LABEL_COL }
151
141
)
152
142
153
143
return snli_df
154
144
155
145
156
146
def clean_rows (df , label_col = LABEL_COL ):
157
147
"""Drop badly formatted rows from the input dataframe
158
-
148
+
159
149
Args:
160
150
df (pd.DataFrame): Input dataframe
161
151
label_col (str): Name of label column.
162
- Defaults to the standardized column name that is set after running the clean_col method.
163
-
152
+ Defaults to the standardized column name that is set after running the clean_col method.
153
+
164
154
Returns:
165
155
pd.DataFrame
166
156
"""
@@ -169,23 +159,23 @@ def clean_rows(df, label_col=LABEL_COL):
169
159
170
160
return snli_df
171
161
162
+
172
163
def clean_df (df , label_col = LABEL_COL ):
173
164
df = clean_cols (df )
174
165
df = clean_rows (df , label_col )
175
166
176
167
return df
177
168
178
- def load_azureml_df (
179
- local_cache_path = None , file_split = Split .TRAIN , file_type = "txt"
180
- ):
169
+
170
+ def load_azureml_df (local_cache_path = None , file_split = Split .TRAIN , file_type = "txt" ):
181
171
"""
182
172
Loads the SNLI dataset as AzureML dataflow object
183
173
Download the dataset from "https://nlp.stanford.edu/projects/snli/snli_1.0.zip", unzip, and load.
184
174
185
175
Args:
186
176
local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
187
- If None, all the intermediate files will be stored in a temporary directory and removed
188
- after use.
177
+ If None, all the intermediate files will be stored in a temporary directory and removed
178
+ after use.
189
179
file_split (str): File split to load. One of (dev, test, train)
190
180
file_type (str): File type to load. One of (txt, jsonl)
191
181
0 commit comments