"""
This Helper One script contains the functions to prepare the [IMDB] data
It contains data download, split data set and data decode. More specifically:
get_dataset, prepare_imdb, prepare_word_dic, index_word, decode_review
"""
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import numpy as np
import tensorflow as tf
'''
[DOWNLOADER] & [SLICER]
download the data and prepare it to be used
return train_data, train_labels, test_data and test_labels
'''
# downloader
def get_dataset(name):
    '''
    This function downloads the dataset.
    :param name: determines which dataset to download.
    :return: the dataset
    '''
    from tensorflow import keras
    if name in ('imdb', 'IMDB'):
        dataset = keras.datasets.imdb
    elif name in ('fashion', 'Fashion'):
        dataset = keras.datasets.fashion_mnist
    else:
        raise ValueError('There is no dataset named {}.'.format(name))
    print("The {} dataset has been downloaded successfully. \n".format(name))
    return dataset
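# A minimal usage sketch (illustration only, not part of the pipeline):
# get_dataset returns the Keras dataset *module*, so the arrays still have to
# be loaded from it explicitly, e.g.
#     fashion = get_dataset('fashion')
#     (train_images, train_lbls), (test_images, test_lbls) = fashion.load_data()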
# slicer
def prepare_imdb(dataset, cross_val=False, partial=False):
    '''
    This function prepares the dataset for training and testing.
    If cross_val is True it will divide the training set into a validation set and a partial training set.
    This is used to cross-validate the model without touching the testing set.
    It defaults to False.
    :param dataset: the dataset to be prepared.
    :param cross_val: decides whether to divide the training set.
    :param partial: decides whether to return all the splits or only the validation and partial ones
                    (only meaningful when cross_val is True).
    :return: the requested (data, labels) tuples.
    '''
    data = dataset
    # split the dataset and drop the rare words
    (train_data, train_labels), (test_data, test_labels) = data.load_data(num_words=10000)
    print("The dataset has been split into training-set and testing-set successfully. \n")
    # pad the reviews to the same length
    word_index = data.get_word_index()
    word_index = prepare_word_dic(word_index)
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    train_data = pad_sequences(train_data, value=word_index["<PAD>"], padding='post', maxlen=256)
    test_data = pad_sequences(test_data, value=word_index["<PAD>"], padding='post', maxlen=256)
    if cross_val:
        # create a validation set, instead of consuming the testing set
        x_val = train_data[:10000]
        partial_x_train = train_data[10000:]
        y_val = train_labels[:10000]
        partial_y_train = train_labels[10000:]
        print("The validation set has been created from the training-set. \n")
        if partial:
            return (x_val, partial_x_train), (y_val, partial_y_train)
        else:
            return (train_data, train_labels), (test_data, test_labels), (x_val, partial_x_train), (y_val, partial_y_train)
    else:
        return (train_data, train_labels), (test_data, test_labels)
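# A short sketch of the cross-validation path (variable names mirror the
# function above; split sizes follow from the 25,000 IMDB training reviews):
#     dataset = get_dataset('imdb')
#     (x_val, partial_x_train), (y_val, partial_y_train) = prepare_imdb(
#         dataset, cross_val=True, partial=True)
#     # x_val/y_val: first 10,000 reviews for validation;
#     # partial_x_train/partial_y_train: remaining 15,000 reviews for training.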
'''
[WORD INDEX]
get the reverse word index
and fill in the first four reserved keys
'''
# fill in the first four reserved keys
def prepare_word_dic(word_index):
    '''
    This function handles the reserved keys at the start of the index; without it a direct translation would look strange.
    :param word_index: the word_index dictionary of the dataset
    :return: the word index with the first few reserved keys filled with PAD, START, UNK and UNUSED.
    '''
    word_index = {key: (value + 3) for key, value in word_index.items()}
    word_index["<PAD>"] = 0  # padding
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3  # unused
    return word_index
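# A tiny illustration of the offset above (toy data; in the real Keras index
# 'the' happens to map to 1 as well, but nothing here depends on that):
#     shifted = prepare_word_dic({'the': 1, 'and': 2})
#     shifted['the']    # -> 4 (shifted by 3)
#     shifted['<PAD>']  # -> 0 (reserved slot)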
# get the word via its key
def index_word(dataset):
    '''
    This function reverses the word_index so it can be used to translate the integer data.
    It first gets the word_index dictionary,
    then handles the reserved keys,
    and finally reverses word_index into index_word.
    :param dataset: the dataset used here
    :return: the reversed word index dictionary
    '''
    data = dataset
    word_index = data.get_word_index()
    word_index = prepare_word_dic(word_index)
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    return reverse_word_index
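# A quick sanity sketch for the reversed dictionary: the reserved slots come
# back out as the special tokens defined in prepare_word_dic.
#     reverse = index_word(get_dataset('imdb'))
#     reverse[0]  # -> '<PAD>'
#     reverse[1]  # -> '<START>'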
'''
[DECODER]
decode the data into a readable sentence
'''
def decode_review(text, index_word, show=False):
    '''
    This function decodes the integer data into a readable sentence.
    :param text: the integer data to be decoded (translated)
    :param index_word: the index-to-word dictionary obtained above
    :param show: decides whether to print out the translated sentence
    :return: the readable sentence
    '''
    sentence = ' '.join([index_word.get(i, '?') for i in text])
    if show:
        print(sentence)
    return sentence
'''
test this script
'''
if __name__ == '__main__':
    dataset = get_dataset('imdb')
    (train_data, train_labels), (test_data, test_labels) = prepare_imdb(dataset)
    reverse_word_index = index_word(dataset)
    # show if things went well
    print('The first instance in the dataset is:')
    print(train_data[0])
    print()
    print('Decode the first instance into a same-length sentence:')
    decode_review(train_data[0], reverse_word_index, show=True)
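    # A hedged follow-up check (an addition for illustration, not in the original
    # quick test): exercise the cross-validation path and verify that the 25,000
    # IMDB training reviews split into 10,000 validation and 15,000 training rows.
    (x_val, partial_x_train), (y_val, partial_y_train) = prepare_imdb(
        dataset, cross_val=True, partial=True)
    print('Validation / partial-train sizes:', len(x_val), len(partial_x_train))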