-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgennerdata.py
93 lines (74 loc) · 2.39 KB
/
gennerdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""
Generate spaCy NER training data from JSON named-entity (NE) sequence definitions.
"""
from dataclasses import dataclass
from numpy import random
from faker import Faker
import json
import sys
def load_sequences(filename):
    """Load JSON NE sequences.

    :param filename: Name of JSON file to load
    :type filename: String
    :return: Deserialized JSON object
    :rtype: Dictionary
    """
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's default encoding, which may mis-read non-ASCII values.
    with open(filename, "r", encoding="utf-8") as read_file:
        return json.load(read_file)
def count_sequences(data):
    """Count the number of JSON NE sequences.

    Only values that are lists contribute to the total; the count is the
    sum of their lengths. Any other kind of value is ignored.

    :param data: Dictionary of NE sequences
    :type data: Dictionary
    :return: Number of NE sequences; 0 when ``data`` is not a dictionary
    :rtype: Integer
    """
    try:
        # Look at the values, not items(): items() yields (key, value)
        # tuples, which are never lists, so nothing would ever be counted.
        values = data.values()
    except AttributeError:
        # Best effort: report input that is not a mapping and count nothing.
        print('Invalid NE sequences format')
        return 0
    return sum(len(value) for value in values if isinstance(value, list))
def gen_data(sequence, N):
    """Generate spaCy training data.

    Each of the ``N`` examples is built by resolving every element of
    ``sequence`` to a value, concatenating the values (as strings) into one
    text, and recording the character span of every element whose label is
    not ``None``.

    An element is resolved as follows: when its ``p`` is ``None`` (or
    absent), ``value`` is evaluated as a Python expression, falling back to
    the raw ``value`` if evaluation fails; otherwise one item is drawn from
    the ``value`` list using ``p`` as the probabilities.

    :param sequence: NE sequence; each element is a mapping read through the
        keys ``value``, ``label`` and ``p``
    :type sequence: List
    :param N: Number of training examples to generate
    :type N: Integer
    :return: spaCy training data, one ``(text, {'entities': [(start, end,
        label), ...]})`` tuple per example
    :rtype: List of tuples
    """
    # Deliberately kept in scope although never referenced directly: the
    # expressions evaluated below can see this function's locals
    # (presumably they call Faker providers through ``fake`` - confirm
    # against the sequence files).
    fake = Faker()  # noqa: F841
    train_data = []
    for _ in range(N):
        entities = []
        cursor = 0  # character offset at which the next value starts
        for s in sequence:
            probabilities = s.get('p')
            if probabilities is None:
                try:
                    # SECURITY: eval() executes arbitrary code taken from
                    # the sequence definition. Only use trusted files.
                    value = eval(s['value'])
                except Exception:
                    # Not an evaluable expression: use the value literally.
                    value = s['value']
            else:
                value = random.choice(s['value'], 1, p=probabilities).item(0)
            # Measure the text that is actually emitted. Values may be
            # non-strings (e.g. an expression evaluating to a number), for
            # which len(value) would fail or disagree with the joined text.
            text = str(value)
            start = cursor
            cursor = start + len(text)
            entities.append({'value': text, 'start': start, 'end': cursor,
                             'label': s.get('label')})
        spans = [(e['start'], e['end'], e['label'])
                 for e in entities if e['label'] is not None]
        train_data.append((''.join(e['value'] for e in entities),
                           {'entities': spans}))
    return train_data
def main():
    """Entry point.

    Reads the path of a JSON NE-sequence file from the first command-line
    argument, generates five training examples for every entry under
    ``'ne-sequences'`` and prints each entry's examples on its own line.
    Exits with a usage message when no file argument is given.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit("Usage: gennerdata.py <ne-sequences.json>")
    sequences = load_sequences(sys.argv[1])
    # Generate everything first, then print, so nothing is emitted if
    # generation fails part-way through.
    training_data = [gen_data(seq['ne-sequence'], 5)
                     for seq in sequences['ne-sequences']]
    for examples in training_data:
        print(examples)
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()