def get_lines():
    id2line = {}
    file_path = os.path.join(config.DATA_PATH, config.LINE_FILE)
+     print(config.LINE_FILE)
    with open(file_path, 'r', errors='ignore') as f:
-         lines = f.readlines()
-         for i, line in enumerate(lines):
-             parts = line.split(' +++$+++ ')
-             if len(parts) == 5:
-                 if parts[4][-1] == '\n':
-                     parts[4] = parts[4][:-1]
-                 id2line[parts[0]] = parts[4]
+         # lines = f.readlines()
+         # for line in lines:
+         i = 0
+         try:
+             for line in f:
+                 parts = line.split(' +++$+++ ')
+                 if len(parts) == 5:
+                     if parts[4][-1] == '\n':
+                         parts[4] = parts[4][:-1]
+                     id2line[parts[0]] = parts[4]
+                 i += 1
+         except UnicodeDecodeError:
+             print(i, line)
    return id2line

def get_convos():
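The rewritten get_lines() loop splits each record on the ' +++$+++ ' field separator and keeps the line id (field 0) and the utterance text (field 4), trimming a trailing newline. A minimal sketch of that parsing step, using a made-up record in the same five-field layout (not taken from the real data file):

record = 'L0 +++$+++ u0 +++$+++ m0 +++$+++ SPEAKER +++$+++ Hello there.\n'
parts = record.split(' +++$+++ ')
if len(parts) == 5:
    text = parts[4][:-1] if parts[4].endswith('\n') else parts[4]
    # get_lines() would store this as id2line['L0'] = 'Hello there.'

Since the file is opened with errors='ignore', decoding problems are suppressed at read time, so the UnicodeDecodeError handler is unlikely to fire in practice.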
@@ -54,64 +61,13 @@ def get_convos():
def question_answers(id2line, convos):
    """ Divide the dataset into two sets: questions and answers. """
    questions, answers = [], []
-     seen_questions, seen_answers = set(), set()
-     repeated = 0
    for convo in convos:
        for index, line in enumerate(convo[:-1]):
-             if not convo[index] in id2line or not convo[index + 1] in id2line:
-                 continue
-             q = id2line[convo[index]]
-             a = id2line[convo[index + 1]]
-             # if q in seen_questions or a in seen_answers:
-             if q in seen_questions:
-                 print('Q:', q)
-                 print('A:', a)
-                 repeated += 1
-                 continue
-             questions.append(q)
-             answers.append(a)
-             seen_questions.add(q)
-             # seen_answers.add(a)
+             questions.append(id2line[convo[index]])
+             answers.append(id2line[convo[index + 1]])
    assert len(questions) == len(answers)
-     print('Total repeated:', repeated)
    return questions, answers

- def tokenize_helper(line):
-     tokens = basic_tokenizer(line)
-     text = ' '.join(tokens)
-     for a, b in config.CONTRACTIONS:
-         text = text.replace(a, b)
-     return text
-
- def tokenize_data():
-     print('Tokenizing the data ...')
-     # filenames = ['test.enc', 'test.dec', 'train.enc', 'train.dec']
-     modes = ['train', 'test']
-     seen_questions = set()
-     for mode in modes:
-         q_file = os.path.join(config.PROCESSED_PATH, mode + '.enc')
-         a_file = os.path.join(config.PROCESSED_PATH, mode + '.dec')
-         q_out = open(os.path.join(config.PROCESSED_PATH, mode + '.enc.tok'), 'w')
-         a_out = open(os.path.join(config.PROCESSED_PATH, mode + '.dec.tok'), 'w')
-
-         q_lines = open(q_file, 'r').readlines()
-         a_lines = open(a_file, 'r').readlines()
-         n = len(q_lines)
-         repeated = 0
-
-         for i in range(n):
-             q, a = q_lines[i], a_lines[i]
-             q_clean = tokenize_helper(q)
-             if q_clean in seen_questions:
-                 print(q_clean)
-                 repeated += 1
-                 continue
-             seen_questions.add(q_clean)
-             q_out.write(q_clean + '\n')
-             a_clean = tokenize_helper(a)
-             a_out.write(a_clean + '\n')
-         print('Total repeated in', mode, ':', repeated)
-
def prepare_dataset(questions, answers):
    # create path to store all the train & test encoder & decoder
    make_dir(config.PROCESSED_PATH)
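The simplified question_answers() turns every pair of consecutive lines in a conversation into one (question, answer) example, without the duplicate filtering the removed code performed; it also assumes every line id in a conversation is present in id2line. A small sketch of the pairing, with made-up ids and text:

id2line = {'L1': 'Hi.', 'L2': 'Hello.', 'L3': 'How are you?'}
convo = ['L1', 'L2', 'L3']
pairs = [(id2line[convo[i]], id2line[convo[i + 1]]) for i in range(len(convo) - 1)]
# pairs == [('Hi.', 'Hello.'), ('Hello.', 'How are you?')]
# every line except the last serves as a question; the following line is its answer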
@@ -122,7 +78,7 @@ def prepare_dataset(questions, answers):
    filenames = ['train.enc', 'train.dec', 'test.enc', 'test.dec']
    files = []
    for filename in filenames:
-         files.append(open(os.path.join(config.PROCESSED_PATH, filename), 'w'))
+         files.append(open(os.path.join(config.PROCESSED_PATH, filename),'w'))

    for i in range(len(questions)):
        if i in test_ids:
@@ -142,14 +98,13 @@ def make_dir(path):
    except OSError:
        pass

- def basic_tokenizer(line, normalize_digits=False):
+ def basic_tokenizer(line, normalize_digits=True):
    """ A basic tokenizer to tokenize text into tokens.
    Feel free to change this to suit your need. """
    line = re.sub('<u>', '', line)
    line = re.sub('</u>', '', line)
    line = re.sub('\[', '', line)
    line = re.sub('\]', '', line)
-     line = line.replace('`', "'")
    words = []
    _WORD_SPLIT = re.compile("([.,!?\"'-<>:;)(])")
    _DIGIT_RE = re.compile(r"\d")
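Only the head of basic_tokenizer() is visible in this hunk: it strips the <u>, </u> and bracket markup and compiles a punctuation-splitting regex. A rough sketch of how that regex behaves on its own; the rest of the function (the whitespace split and the digit normalization implied by _DIGIT_RE and the new normalize_digits=True default) falls outside this hunk, so this is only an approximation:

import re

_WORD_SPLIT = re.compile("([.,!?\"'-<>:;)(])")

line = re.sub('</u>', '', re.sub('<u>', '', '<u>Where are you?</u>'))
pieces = [p for p in _WORD_SPLIT.split(line.strip().lower()) if p.strip()]
# pieces == ['where are you', '?']; the full tokenizer presumably also splits
# on whitespace, yielding individual word tokens plus the '?'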
@@ -162,15 +117,14 @@ def basic_tokenizer(line, normalize_digits=False):
            words.append(token)
    return words

- def build_vocab(filename, normalize_digits=False):
+ def build_vocab(filename, normalize_digits=True):
    in_path = os.path.join(config.PROCESSED_PATH, filename)
-     out_path = os.path.join(config.PROCESSED_PATH, 'vocab.{}'.format(filename[-7:-4]))
+     out_path = os.path.join(config.PROCESSED_PATH, 'vocab.{}'.format(filename[-3:]))

    vocab = {}
    with open(in_path, 'r') as f:
        for line in f.readlines():
-             tokens = line.split()
-             for token in tokens:
+             for token in basic_tokenizer(line):
                if not token in vocab:
                    vocab[token] = 0
                vocab[token] += 1
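With this change, build_vocab() counts token frequencies by running each raw line through basic_tokenizer() instead of a plain str.split(). The counting loop itself is the same tally a Counter update would give; a minimal sketch, where tokenize stands in for basic_tokenizer and lines for the contents of train.enc or train.dec:

from collections import Counter

def count_tokens(lines, tokenize):
    vocab = Counter()
    for line in lines:
        vocab.update(tokenize(line))   # same bookkeeping as the if/else above
    return vocab

# count_tokens(['hi there', 'hi again'], str.split)
# -> Counter({'hi': 2, 'there': 1, 'again': 1})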
@@ -184,29 +138,29 @@ def build_vocab(filename, normalize_digits=False):
        index = 4
        for word in sorted_vocab:
            if vocab[word] < config.THRESHOLD:
-                 with open('config.py', 'a') as cf:
-                     if 'enc' in filename:
-                         cf.write('ENC_VOCAB = ' + str(index) + '\n')
-                     else:
-                         cf.write('DEC_VOCAB = ' + str(index) + '\n')
                break
            f.write(word + '\n')
            index += 1
+         with open('config.py', 'a') as cf:
+             if filename[-3:] == 'enc':
+                 cf.write('ENC_VOCAB = ' + str(index) + '\n')
+             else:
+                 cf.write('DEC_VOCAB = ' + str(index) + '\n')

def load_vocab(vocab_path):
    with open(vocab_path, 'r') as f:
        words = f.read().splitlines()
    return words, {words[i]: i for i in range(len(words))}

def sentence2id(vocab, line):
-     return [vocab.get(token, vocab['<unk>']) for token in line]
+     return [vocab.get(token, vocab['<unk>']) for token in basic_tokenizer(line)]

def token2id(data, mode):
    """ Convert all the tokens in the data into their corresponding
    index in the vocabulary. """
    vocab_path = 'vocab.' + mode
-     in_path = data + '.' + mode + '.tok'
-     out_path = data + '.' + mode + '.ids'
+     in_path = data + '.' + mode
+     out_path = data + '_ids.' + mode

    _, vocab = load_vocab(os.path.join(config.PROCESSED_PATH, vocab_path))
    in_file = open(os.path.join(config.PROCESSED_PATH, in_path), 'r')
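After this hunk, sentence2id() tokenizes the raw line itself and falls back to the <unk> id for out-of-vocabulary tokens, and token2id() reads the untokenized train/test files directly. A small usage sketch of that lookup with a made-up vocabulary (the real one is written by build_vocab() and loaded by load_vocab()):

vocab = {'<pad>': 0, '<unk>': 1, 'hi': 4, 'there': 5}   # illustrative only
tokens = ['hi', 'there', 'stranger']                    # e.g. what basic_tokenizer() might return
ids = [vocab.get(t, vocab['<unk>']) for t in tokens]
# ids == [4, 5, 1] -- 'stranger' is out of vocabulary, so it maps to <unk>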
@@ -233,8 +187,8 @@ def prepare_raw_data():

def process_data():
    print('Preparing data to be model-ready ...')
-     build_vocab('train.enc.tok')
-     build_vocab('train.dec.tok')
+     build_vocab('train.enc')
+     build_vocab('train.dec')
    token2id('train', 'enc')
    token2id('train', 'dec')
    token2id('test', 'enc')
@@ -304,5 +258,4 @@ def get_batch(data_bucket, bucket_id, batch_size=1):

if __name__ == '__main__':
    prepare_raw_data()
-     tokenize_data()
    process_data()