# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""
Utility functions for downloading, extracting, and reading the
Multi-Genre NLI (MultiNLI) Corpus.
https://www.nyu.edu/projects/bowman/multinli/
"""

import logging
import os
from tempfile import TemporaryDirectory
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.dataset.data_loaders import DaskJSONLoader
from utils_nlp.dataset.url_utils import extract_zip, maybe_download
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
from utils_nlp.models.transformers.sequence_classification import Processor

URL = "http://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip"

# Source - https://github.com/nyu-mll/jiant/blob/master/scripts/download_glue_data.py
URL_JIANT_MNLI_TSV = "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce"

DATA_FILES = {
"train": "multinli_1.0/multinli_1.0_train.jsonl",
"dev_matched": "multinli_1.0/multinli_1.0_dev_matched.jsonl",
"dev_mismatched": "multinli_1.0/multinli_1.0_dev_mismatched.jsonl",
}


def download_file_and_extract(
    local_cache_path: str = ".", file_split: str = "train"
) -> None:
    """Download and extract the MultiNLI dataset files.

    Args:
        local_cache_path (str, optional): Directory to cache files to.
            Defaults to the current working directory.
        file_split (str, optional): The subset to download. One of
            {"train", "dev_matched", "dev_mismatched"}. Defaults to "train".

    Returns:
        None
    """
file_name = URL.split("/")[-1]
maybe_download(URL, file_name, local_cache_path)
if not os.path.exists(os.path.join(local_cache_path, DATA_FILES[file_split])):
extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)
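

# A minimal usage sketch (the cache directory and split below are illustrative):
#
#     download_file_and_extract(local_cache_path="./data", file_split="dev_matched")
#     # leaves ./data/multinli_1.0/multinli_1.0_dev_matched.jsonl on disk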


def download_tsv_files_and_extract(local_cache_path: str = ".") -> None:
    """Download and extract the MultiNLI dataset files in TSV format from NYU
    jiant. The downloaded archive contains both the original and TSV-formatted
    data; the zip file is removed after extraction.

    Args:
        local_cache_path (str, optional): Directory to cache files to.
            Defaults to the current working directory.

    Returns:
        None
    """
    folder_name = "MNLI"
    file_name = f"{folder_name}.zip"
    maybe_download(URL_JIANT_MNLI_TSV, file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, folder_name)):
        extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)

    # Clean up the downloaded zip file
    if os.path.exists(os.path.join(local_cache_path, file_name)):
        os.remove(os.path.join(local_cache_path, file_name))

    print("Downloaded file to: ", os.path.join(local_cache_path, folder_name))


def load_pandas_df(local_cache_path=".", file_split="train"):
    """Load the extracted dataset into a pandas DataFrame.

    Args:
        local_cache_path (str, optional): Directory to cache files to.
            Defaults to the current working directory.
        file_split (str, optional): The subset to load. One of
            {"train", "dev_matched", "dev_mismatched"}. Defaults to "train".

    Returns:
        pd.DataFrame: pandas DataFrame containing the specified MultiNLI subset.
    """
    download_file_and_extract(local_cache_path, file_split)

    return pd.read_json(
        os.path.join(local_cache_path, DATA_FILES[file_split]), lines=True
    )
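

# A minimal usage sketch (the path and the chosen split are illustrative):
#
#     df = load_pandas_df(local_cache_path="./data", file_split="dev_matched")
#     print(df[["gold_label", "sentence1", "sentence2"]].head())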


def get_generator(
    local_cache_path=".",
    file_split="train",
    block_size=10e6,
    batch_size=10e6,
    num_batches=None,
):
    """Return the extracted dataset as a batch generator that yields
    pandas DataFrames.

    Args:
        local_cache_path (str, optional): Directory to cache files to.
            Defaults to the current working directory.
        file_split (str, optional): The subset to load. One of
            {"train", "dev_matched", "dev_mismatched"}. Defaults to "train".
        block_size (int, optional): Size of each partition in bytes.
            Defaults to 10e6.
        batch_size (int, optional): Batch size. Defaults to 10e6.
        num_batches (int, optional): Number of batches to generate.
            Defaults to None.

    Returns:
        Generator[pd.DataFrame, None, None]: Batch generator that yields
            sequential batches of the dataset as pandas DataFrames.
    """
    download_file_and_extract(local_cache_path, file_split)

    loader = DaskJSONLoader(
        os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size
    )

    return loader.get_sequential_batches(
        batch_size=int(batch_size), num_batches=num_batches
    )
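

# A minimal usage sketch (batch and block sizes are illustrative):
#
#     batches = get_generator(
#         local_cache_path="./data", file_split="train", batch_size=10000, num_batches=2
#     )
#     for batch_df in batches:
#         print(batch_df.shape)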


def load_tc_dataset(
    local_path=TemporaryDirectory().name,
    test_fraction=0.25,
    random_seed=None,
    train_sample_ratio=1.0,
    test_sample_ratio=1.0,
    model_name="bert-base-uncased",
    to_lower=True,
    cache_dir=TemporaryDirectory().name,
    max_len=MAX_SEQ_LEN,
    batch_size=32,
    num_gpus=None,
):
    """
    Load the MultiNLI dataset and split it into training and testing datasets.
    The datasets are preprocessed and can be used to train a sequence
    classification model or evaluate on the testing dataset.

    Args:
        local_path (str, optional): The local file path to save the raw
            MultiNLI files. Defaults to TemporaryDirectory().name.
        test_fraction (float, optional): The fraction of the data used for the
            testing dataset when splitting. Defaults to 0.25.
        random_seed (float, optional): Random seed used to shuffle the data.
            Defaults to None.
        train_sample_ratio (float, optional): The ratio used to sub-sample the
            training data. Defaults to 1.0.
        test_sample_ratio (float, optional): The ratio used to sub-sample the
            testing data. Defaults to 1.0.
        model_name (str, optional): The pretrained model name.
            Defaults to "bert-base-uncased".
        to_lower (bool, optional): Whether to lower-case the text input.
            Defaults to True.
        cache_dir (str, optional): The default folder for saving cache files.
            Defaults to TemporaryDirectory().name.
        max_len (int, optional): Maximum length of the list of tokens. Lists
            longer than this are truncated and shorter ones are padded.
            Defaults to MAX_SEQ_LEN (512).
        batch_size (int, optional): The batch size for training and testing.
            Defaults to 32.
        num_gpus (int, optional): The number of GPUs.
            Defaults to None.

    Returns:
        tuple. The tuple contains four elements:
            train_dataloader (DataLoader): a PyTorch DataLoader instance for
                training.
            test_dataloader (DataLoader): a PyTorch DataLoader instance for
                testing.
            label_encoder (LabelEncoder): a fitted sklearn LabelEncoder
                instance. The raw label values can be retrieved by calling its
                `inverse_transform` function.
            test_labels (Numpy array): a Numpy array of testing labels (as
                label IDs). Raw label values can be transformed to label IDs
                with the `label_encoder.transform` function.
    """
# download and load the original dataset
all_df = load_pandas_df(local_cache_path=local_path, file_split="train")
# select the examples corresponding to one of the entailment labels (neutral
# in this case) to avoid duplicate rows, as the sentences are not unique,
# whereas the sentence pairs are.
all_df = all_df[all_df["gold_label"] == "neutral"]
text_col = "sentence1"
label_col = "genre"
# encode labels, use the "genre" column as the label column
label_encoder = LabelEncoder()
label_encoder.fit(all_df[label_col])

    if test_fraction <= 0 or test_fraction >= 1.0:
        logging.warning(
            "Invalid test fraction value: {}, changed to 0.25".format(test_fraction)
        )
        test_fraction = 0.25
train_df, test_df = train_test_split(
all_df, train_size=(1.0 - test_fraction), random_state=random_seed
)

    if train_sample_ratio > 1.0:
        train_sample_ratio = 1.0
        logging.warning("Setting the training sample ratio to 1.0")
    elif train_sample_ratio < 0:
        logging.error("Invalid training sample ratio: {}".format(train_sample_ratio))
        raise ValueError(
            "Invalid training sample ratio: {}".format(train_sample_ratio)
        )

    if test_sample_ratio > 1.0:
        test_sample_ratio = 1.0
        logging.warning("Setting the testing sample ratio to 1.0")
    elif test_sample_ratio < 0:
        logging.error("Invalid testing sample ratio: {}".format(test_sample_ratio))
        raise ValueError("Invalid testing sample ratio: {}".format(test_sample_ratio))
if train_sample_ratio < 1.0:
train_df = train_df.sample(frac=train_sample_ratio).reset_index(drop=True)
if test_sample_ratio < 1.0:
test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)
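
    # encode the genre labels as integer IDs for training and testing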
train_labels = label_encoder.transform(train_df[label_col])
train_df[label_col] = train_labels
test_labels = label_encoder.transform(test_df[label_col])
test_df[label_col] = test_labels
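
    # the Processor wraps the pretrained model's tokenizer and turns the
    # DataFrames into model-ready PyTorch datasets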
processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
train_dataset = processor.dataset_from_dataframe(
df=train_df, text_col=text_col, label_col=label_col, max_len=max_len,
)
train_dataloader = dataloader_from_dataset(
train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True
)
test_dataset = processor.dataset_from_dataframe(
df=test_df, text_col=text_col, label_col=label_col, max_len=max_len,
)
test_dataloader = dataloader_from_dataset(
test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False
)
return (train_dataloader, test_dataloader, label_encoder, test_labels)
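

# A minimal usage sketch (the split parameters are illustrative; model training
# itself is outside the scope of this module):
#
#     train_dataloader, test_dataloader, label_encoder, test_labels = load_tc_dataset(
#         local_path="./data", test_fraction=0.2, random_seed=42, batch_size=16
#     )
#     print(label_encoder.classes_)  # the genre labels used as classification targets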


def get_label_values(label_encoder, label_ids):
    """
    Get the label values from label IDs.

    Args:
        label_encoder (LabelEncoder): a fitted sklearn LabelEncoder instance.
        label_ids (Numpy array): a Numpy array of label IDs.

    Returns:
        Numpy array: a Numpy array of label values.
    """
    return label_encoder.inverse_transform(label_ids)
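

# A minimal usage sketch (assumes `label_encoder` came from load_tc_dataset and
# `predicted_ids` is a Numpy array of predicted label IDs):
#
#     genres = get_label_values(label_encoder, predicted_ids)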