# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""
Utility functions for downloading and reading the wikigold dataset for
Named Entity Recognition (NER).
https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold/CONLL-format/data
"""

import logging
import os
import random
from tempfile import TemporaryDirectory

import pandas as pd

from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.dataset.ner_utils import preprocess_conll
from utils_nlp.dataset.url_utils import maybe_download
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
from utils_nlp.models.transformers.named_entity_recognition import (
    TokenClassificationProcessor,
)


URL = (
    "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets"
    "/master/data/wikigold/CONLL-format/data/wikigold.conll.txt"
)


def load_train_test_dfs(local_cache_path="./", test_fraction=0.5, random_seed=None):
    """
    Get the training and testing data frames based on test_fraction.

    Args:
        local_cache_path (str): Path to store the data. If the data file
            doesn't exist in this path, it's downloaded.
        test_fraction (float, optional): Fraction of data to use for
            testing. Since this is a small dataset, the default testing
            fraction is set to 0.5.
        random_seed (float, optional): Random seed used to shuffle the data.

    Returns:
        tuple: (train_pandas_df, test_pandas_df), each data frame contains
            two columns:
            "sentence": sentences as strings.
            "labels": list of entity labels of the words in the sentence.
    """
    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    data_file = os.path.join(local_cache_path, file_name)

    with open(data_file, "r", encoding="utf8") as file:
        text = file.read()

    sentence_list, labels_list = preprocess_conll(text)

    if random_seed:
        random.seed(random_seed)
    sentence_and_labels = list(zip(sentence_list, labels_list))
    random.shuffle(sentence_and_labels)
    sentence_list[:], labels_list[:] = zip(*sentence_and_labels)

    sentence_count = len(sentence_list)
    test_sentence_count = round(sentence_count * test_fraction)
    test_sentence_list = sentence_list[:test_sentence_count]
    test_labels_list = labels_list[:test_sentence_count]
    train_sentence_list = sentence_list[test_sentence_count:]
    train_labels_list = labels_list[test_sentence_count:]

    train_df = pd.DataFrame(
        {"sentence": train_sentence_list, "labels": train_labels_list}
    )

    test_df = pd.DataFrame({"sentence": test_sentence_list, "labels": test_labels_list})

    return (train_df, test_df)
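
# A minimal usage sketch for load_train_test_dfs (illustrative, not part of the
# library): it assumes the wikigold file can be downloaded from the URL above and
# that pandas is installed; the seed value below is an arbitrary example.
#
#     train_df, test_df = load_train_test_dfs(test_fraction=0.5, random_seed=42)
#     print(train_df.shape, test_df.shape)   # roughly equal halves of the data
#     print(train_df.loc[0, "sentence"])     # a sentence string
#     print(train_df.loc[0, "labels"])       # the per-token entity labels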


def get_unique_labels():
    """Get the unique labels in the wikigold dataset."""
    return ["O", "I-LOC", "I-MISC", "I-PER", "I-ORG"]


def load_dataset(
    local_path=TemporaryDirectory().name,
    test_fraction=0.3,
    random_seed=None,
    train_sample_ratio=1.0,
    test_sample_ratio=1.0,
    model_name="bert-base-uncased",
    to_lower=True,
    cache_dir=TemporaryDirectory().name,
    max_len=MAX_SEQ_LEN,
    trailing_piece_tag="X",
    batch_size=32,
    num_gpus=None,
):
"""
Load the wikigold dataset and split into training and testing datasets.
The datasets are preprocessed and can be used to train a NER model or evaluate
on the testing dataset.
Args:
local_path (str, optional): The local file path to save the raw wikigold file.
Defautls to "~/.nlp_utils/datasets/".
test_fraction (float, optional): The fraction of testing dataset when splitting.
Defaults to 0.3.
random_seed (float, optional): Random seed used to shuffle the data.
Defaults to None.
train_sample_ratio (float, optional): The ratio that used to sub-sampling for training.
Defaults to 1.0.
test_sample_ratio (float, optional): The ratio that used to sub-sampling for testing.
Defaults to 1.0.
model_name (str, optional): The pretained model name.
Defaults to "bert-base-uncased".
to_lower (bool, optional): Lower case text input.
Defaults to True.
cache_dir (str, optional): The default folder for saving cache files.
Defaults to './temp'.
max_len (int, optional): Maximum length of the list of tokens. Lists longer
than this are truncated and shorter ones are padded with "O"s.
Default value is BERT_MAX_LEN=512.
trailing_piece_tag (str, optional): Tag used to label trailing word pieces.
For example, "criticize" is broken into "critic" and "##ize", "critic"
preserves its original label and "##ize" is labeled as trailing_piece_tag.
Default value is "X".
batch_size (int, optional): The batch size for training and testing.
Defaults to 32.
num_gpus (int, optional): The number of GPUs.
Defaults to None.
Returns:
tuple. The tuple contains four elements.
train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
label_map (dict): A dictionary object to map a label (str) to an ID (int).
test_dataset (TensorDataset): A TensorDataset containing the following four tensors.
1. input_ids_all: Tensor. Each sublist contains numerical values,
i.e. token ids, corresponding to the tokens in the input text data.
2. input_mask_all: Tensor. Each sublist contains the attention
mask of the input token id list, 1 for input tokens and 0 for
padded tokens, so that padded tokens are not attended to.
3. trailing_token_mask_all: Tensor. Each sublist is
a boolean list, True for the first word piece of each
original word, False for the trailing word pieces,
e.g. "##ize". This mask is useful for removing the
predictions on trailing word pieces, so that each
original word in the input text has a unique predicted
label.
4. label_ids_all: Tensor, each sublist contains token labels of
a input sentence/paragraph, if labels is provided. If the
`labels` argument is not provided, it will not return this tensor.
"""
    train_df, test_df = load_train_test_dfs(
        local_cache_path=local_path,
        test_fraction=test_fraction,
        random_seed=random_seed,
    )

    if train_sample_ratio > 1.0:
        train_sample_ratio = 1.0
        logging.warning("Setting the training sample ratio to 1.0")
    elif train_sample_ratio < 0:
        logging.error("Invalid training sample ratio: {}".format(train_sample_ratio))
        raise ValueError(
            "Invalid training sample ratio: {}".format(train_sample_ratio)
        )

    if test_sample_ratio > 1.0:
        test_sample_ratio = 1.0
        logging.warning("Setting the testing sample ratio to 1.0")
    elif test_sample_ratio < 0:
        logging.error("Invalid testing sample ratio: {}".format(test_sample_ratio))
        raise ValueError("Invalid testing sample ratio: {}".format(test_sample_ratio))

    if train_sample_ratio < 1.0:
        train_df = train_df.sample(frac=train_sample_ratio).reset_index(drop=True)
    if test_sample_ratio < 1.0:
        test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)

    processor = TokenClassificationProcessor(
        model_name=model_name, to_lower=to_lower, cache_dir=cache_dir
    )

    label_map = TokenClassificationProcessor.create_label_map(
        label_lists=train_df["labels"], trailing_piece_tag=trailing_piece_tag
    )

    train_dataset = processor.preprocess(
        text=train_df["sentence"],
        max_len=max_len,
        labels=train_df["labels"],
        label_map=label_map,
        trailing_piece_tag=trailing_piece_tag,
    )
    test_dataset = processor.preprocess(
        text=test_df["sentence"],
        max_len=max_len,
        labels=test_df["labels"],
        label_map=label_map,
        trailing_piece_tag=trailing_piece_tag,
    )

    train_dataloader = dataloader_from_dataset(
        train_dataset,
        batch_size=batch_size,
        num_gpus=num_gpus,
        shuffle=True,
        distributed=False,
    )
    test_dataloader = dataloader_from_dataset(
        test_dataset,
        batch_size=batch_size,
        num_gpus=num_gpus,
        shuffle=False,
        distributed=False,
    )

    return (train_dataloader, test_dataloader, label_map, test_dataset)
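

# Minimal smoke-test sketch (illustrative, not part of the library). Running it
# downloads the wikigold file and the "bert-base-uncased" tokenizer, so it needs
# network access and the utils_nlp package on the path; the seed and batch size
# below are arbitrary examples.
if __name__ == "__main__":
    train_dl, test_dl, label_map, test_ds = load_dataset(
        test_fraction=0.3,
        random_seed=42,
        model_name="bert-base-uncased",
        batch_size=16,
    )
    print("label map:", label_map)
    print("number of test batches:", len(test_dl))
    # The TensorDataset holds input ids, attention masks, trailing-piece masks
    # and label ids for each preprocessed test example.
    print("number of test examples:", len(test_ds))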