From 1b60a53c7efb2dbdebf0777e69c84a289f9ab6f5 Mon Sep 17 00:00:00 2001
From: Xpitfire
Date: Wed, 20 Sep 2017 17:37:29 +0200
Subject: [PATCH 1/4] Fixed byte and str operations for the data preprocessing

---
 assignments/chatbot/data.py | 59 +++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 29 deletions(-)

diff --git a/assignments/chatbot/data.py b/assignments/chatbot/data.py
index 673087d7..9d4b3db7 100644
--- a/assignments/chatbot/data.py
+++ b/assignments/chatbot/data.py
@@ -1,7 +1,7 @@
 """ A neural chatbot using sequence to sequence model with
-attentional decoder. 
+attentional decoder.
 
-This is based on Google Translate Tensorflow model 
+This is based on Google Translate Tensorflow model
 https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/
 
 Sequence to sequence model by Cho et al.(2014)
@@ -31,7 +31,8 @@ def get_lines():
     with open(file_path, 'rb') as f:
         lines = f.readlines()
         for line in lines:
-            parts = line.split(' +++$+++ ')
+            print(line)
+            parts = line.split(b' +++$+++ ')
             if len(parts) == 5:
                 if parts[4][-1] == '\n':
                     parts[4] = parts[4][:-1]
@@ -44,10 +45,10 @@ def get_convos():
     convos = []
     with open(file_path, 'rb') as f:
         for line in f.readlines():
-            parts = line.split(' +++$+++ ')
+            parts = line.split(b' +++$+++ ')
             if len(parts) == 4:
                 convo = []
-                for line in parts[3][1:-2].split(', '):
+                for line in parts[3][1:-2].split(b', '):
                     convo.append(line[1:-1])
                 convos.append(convo)
 
@@ -66,10 +67,10 @@ def question_answers(id2line, convos):
 def prepare_dataset(questions, answers):
     # create path to store all the train & test encoder & decoder
     make_dir(config.PROCESSED_PATH)
-    
+
     # random convos to create the test set
     test_ids = random.sample([i for i in range(len(questions))],config.TESTSET_SIZE)
-    
+
     filenames = ['train.enc', 'train.dec', 'test.enc', 'test.dec']
     files = []
     for filename in filenames:
@@ -77,11 +78,11 @@ def prepare_dataset(questions, answers):
 
     for i in range(len(questions)):
         if i in test_ids:
-            files[2].write(questions[i] + '\n')
-            files[3].write(answers[i] + '\n')
+            files[2].write(questions[i] + b'\n')
+            files[3].write(answers[i] + b'\n')
         else:
-            files[0].write(questions[i] + '\n')
-            files[1].write(answers[i] + '\n')
+            files[0].write(questions[i] + b'\n')
+            files[1].write(answers[i] + b'\n')
 
     for file in files:
         file.close()
@@ -96,13 +97,13 @@ def make_dir(path):
 def basic_tokenizer(line, normalize_digits=True):
     """ A basic tokenizer to tokenize text into tokens. Feel free to change this to suit your need.
""" - line = re.sub('', '', line) - line = re.sub('', '', line) - line = re.sub('\[', '', line) - line = re.sub('\]', '', line) + line = re.sub(b'', b'', line) + line = re.sub(b'', b'', line) + line = re.sub(b'\[', b'', line) + line = re.sub(b'\]', b'', line) words = [] _WORD_SPLIT = re.compile(b"([.,!?\"'-<>:;)(])") - _DIGIT_RE = re.compile(r"\d") + _DIGIT_RE = re.compile(b"\d") for fragment in line.strip().lower().split(): for token in re.split(_WORD_SPLIT, fragment): if not token: @@ -126,20 +127,20 @@ def build_vocab(filename, normalize_digits=True): sorted_vocab = sorted(vocab, key=vocab.get, reverse=True) with open(out_path, 'wb') as f: - f.write('' + '\n') - f.write('' + '\n') - f.write('' + '\n') - f.write('<\s>' + '\n') + f.write(b'' + b'\n') + f.write(b'' + b'\n') + f.write(b'' + b'\n') + f.write(b'<\s>' + b'\n') index = 4 for word in sorted_vocab: if vocab[word] < config.THRESHOLD: with open('config.py', 'ab') as cf: if filename[-3:] == 'enc': - cf.write('ENC_VOCAB = ' + str(index) + '\n') + cf.write(b'ENC_VOCAB = ' + str.encode(str(index)) + b'\n') else: - cf.write('DEC_VOCAB = ' + str(index) + '\n') + cf.write(b'DEC_VOCAB = ' + str.encode(str(index)) + b'\n') break - f.write(word + '\n') + f.write(word + b'\n') index += 1 def load_vocab(vocab_path): @@ -148,7 +149,7 @@ def load_vocab(vocab_path): return words, {words[i]: i for i in range(len(words))} def sentence2id(vocab, line): - return [vocab.get(token, vocab['']) for token in basic_tokenizer(line)] + return [vocab.get(token, vocab[b'']) for token in basic_tokenizer(line)] def token2id(data, mode): """ Convert all the tokens in the data into their corresponding @@ -160,18 +161,18 @@ def token2id(data, mode): _, vocab = load_vocab(os.path.join(config.PROCESSED_PATH, vocab_path)) in_file = open(os.path.join(config.PROCESSED_PATH, in_path), 'rb') out_file = open(os.path.join(config.PROCESSED_PATH, out_path), 'wb') - + lines = in_file.read().splitlines() for line in lines: if mode == 'dec': # we only care about '' and in encoder - ids = [vocab['']] + ids = [vocab[b'']] else: ids = [] ids.extend(sentence2id(vocab, line)) # ids.extend([vocab.get(token, vocab['']) for token in basic_tokenizer(line)]) if mode == 'dec': - ids.append(vocab['<\s>']) - out_file.write(' '.join(str(id_) for id_ in ids) + '\n') + ids.append(vocab[b'<\s>']) + out_file.write(str.encode(' '.join(str(id_) for id_ in ids)) + b'\n') def prepare_raw_data(): print('Preparing raw data into train set and test set ...') @@ -253,4 +254,4 @@ def get_batch(data_bucket, bucket_id, batch_size=1): if __name__ == '__main__': prepare_raw_data() - process_data() \ No newline at end of file + process_data() From ca103f7aba2bf2b87e87b8c0268339a571d33445 Mon Sep 17 00:00:00 2001 From: Xpitfire Date: Wed, 20 Sep 2017 17:44:26 +0200 Subject: [PATCH 2/4] Removed print from script --- assignments/chatbot/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/assignments/chatbot/data.py b/assignments/chatbot/data.py index 9d4b3db7..0bd39a01 100644 --- a/assignments/chatbot/data.py +++ b/assignments/chatbot/data.py @@ -31,7 +31,6 @@ def get_lines(): with open(file_path, 'rb') as f: lines = f.readlines() for line in lines: - print(line) parts = line.split(b' +++$+++ ') if len(parts) == 5: if parts[4][-1] == '\n': From d00da4dcdb124c6bef0e803f17383e02837238b1 Mon Sep 17 00:00:00 2001 From: Xpitfire Date: Wed, 20 Sep 2017 18:07:08 +0200 Subject: [PATCH 3/4] Fixed model.py to use tensorflow version 1 or higher --- assignments/chatbot/model.py | 38 ++++++++++++++++++------------------ 1 
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/assignments/chatbot/model.py b/assignments/chatbot/model.py
index 369c0955..b5a9c4a1 100644
--- a/assignments/chatbot/model.py
+++ b/assignments/chatbot/model.py
@@ -1,7 +1,7 @@
 """ A neural chatbot using sequence to sequence model with
-attentional decoder. 
+attentional decoder.
 
-This is based on Google Translate Tensorflow model 
+This is based on Google Translate Tensorflow model
 https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/
 
 Sequence to sequence model by Cho et al.(2014)
@@ -30,7 +30,7 @@ def __init__(self, forward_only, batch_size):
         print('Initialize new model')
         self.fw_only = forward_only
         self.batch_size = batch_size
-    
+
     def _create_placeholders(self):
         # Feeds for inputs. It's a list of placeholders
         print('Create placeholders')
@@ -43,7 +43,7 @@ def _create_placeholders(self):
 
         # Our targets are decoder inputs shifted by one (to ignore <GO> symbol)
         self.targets = self.decoder_inputs[1:]
-    
+
     def _inference(self):
         print('Create inference')
         # If we use sampled softmax, we need an output projection.
@@ -53,9 +53,9 @@ def _inference(self):
             b = tf.get_variable('proj_b', [config.DEC_VOCAB])
             self.output_projection = (w, b)
 
-        def sampled_loss(inputs, labels):
+        def sampled_loss(labels, logits):
             labels = tf.reshape(labels, [-1, 1])
-            return tf.nn.sampled_softmax_loss(tf.transpose(w), b, inputs, labels, 
+            return tf.nn.sampled_softmax_loss(tf.transpose(w), b, labels, logits, 
                                               config.NUM_SAMPLES, config.DEC_VOCAB)
         self.softmax_loss_function = sampled_loss
 
@@ -66,7 +66,7 @@ def _create_loss(self):
         print('Creating loss... \nIt might take a couple of minutes depending on how many buckets you have.')
         start = time.time()
         def _seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
-            return tf.nn.seq2seq.embedding_attention_seq2seq(
+            return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                     encoder_inputs, decoder_inputs, self.cell,
                     num_encoder_symbols=config.ENC_VOCAB,
                     num_decoder_symbols=config.DEC_VOCAB,
@@ -75,24 +75,24 @@ def _seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                     feed_previous=do_decode)
 
         if self.fw_only:
-            self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
-                                        self.encoder_inputs, 
-                                        self.decoder_inputs, 
+            self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
+                                        self.encoder_inputs,
+                                        self.decoder_inputs,
                                         self.targets,
-                                        self.decoder_masks, 
-                                        config.BUCKETS, 
+                                        self.decoder_masks,
+                                        config.BUCKETS,
                                         lambda x, y: _seq2seq_f(x, y, True),
                                         softmax_loss_function=self.softmax_loss_function)
             # If we use output projection, we need to project outputs for decoding.
             if self.output_projection:
                 for bucket in range(len(config.BUCKETS)):
-                    self.outputs[bucket] = [tf.matmul(output, 
+                    self.outputs[bucket] = [tf.matmul(output,
                                             self.output_projection[0]) + self.output_projection[1]
                                             for output in self.outputs[bucket]]
         else:
-            self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
-                                        self.encoder_inputs, 
-                                        self.decoder_inputs, 
+            self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
+                                        self.encoder_inputs,
+                                        self.decoder_inputs,
                                         self.targets,
                                         self.decoder_masks,
                                         config.BUCKETS,
@@ -112,12 +112,12 @@ def _creat_optimizer(self):
             self.train_ops = []
             start = time.time()
             for bucket in range(len(config.BUCKETS)):
-                
-                clipped_grads, norm = tf.clip_by_global_norm(tf.gradients(self.losses[bucket], 
+
+                clipped_grads, norm = tf.clip_by_global_norm(tf.gradients(self.losses[bucket],
                                                              trainables),
                                                              config.MAX_GRAD_NORM)
                 self.gradient_norms.append(norm)
-                self.train_ops.append(self.optimizer.apply_gradients(zip(clipped_grads, trainables), 
+                self.train_ops.append(self.optimizer.apply_gradients(zip(clipped_grads, trainables),
                                                             global_step=self.global_step))
                 print('Creating opt for bucket {} took {} seconds'.format(bucket, time.time() - start))
                 start = time.time()

From e10ba37e794977606e7b0bcde7738d607091d66c Mon Sep 17 00:00:00 2001
From: Xpitfire
Date: Wed, 20 Sep 2017 22:53:44 +0200
Subject: [PATCH 4/4] Fixed inference for chatbot

---
 assignments/chatbot/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/assignments/chatbot/data.py b/assignments/chatbot/data.py
index 0bd39a01..0198fa19 100644
--- a/assignments/chatbot/data.py
+++ b/assignments/chatbot/data.py
@@ -148,7 +148,7 @@ def load_vocab(vocab_path):
     return words, {words[i]: i for i in range(len(words))}
 
 def sentence2id(vocab, line):
-    return [vocab.get(token, vocab[b'<unk>']) for token in basic_tokenizer(line)]
+    return [vocab.get(token, vocab[b'<unk>']) for token in basic_tokenizer(str.encode(line))]
 
 def token2id(data, mode):
     """ Convert all the tokens in the data into their corresponding
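
Background on the two themes behind this series (an illustrative sketch, not
part of the patches). data.py opens the corpus files in binary mode
('rb'/'wb'), and in Python 3 bytes and str no longer mix, so every separator,
regex pattern, and written literal must itself be bytes. A minimal example of
the failure mode, assuming a record in the ' +++$+++ '-delimited format the
script parses:

    # Reading from a file opened with 'rb' yields bytes, not str.
    line = b'L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n'
    # line.split(' +++$+++ ') raises:
    #   TypeError: a bytes-like object is required, not 'str'
    parts = line.split(b' +++$+++ ')  # OK: five bytes fields
    # Caveat the series leaves in place: indexing bytes yields an int, so
    # parts[4][-1] == '\n' is always False; a bytes slice comparison matches:
    if parts[4][-1:] == b'\n':
        parts[4] = parts[4][:-1]

On the model.py side, TensorFlow 1.0 moved the bucketed seq2seq helpers from
tf.nn.seq2seq to tf.contrib.legacy_seq2seq, reordered the arguments of
tf.nn.sampled_softmax_loss to (weights, biases, labels, inputs, ...), and
invokes the user-supplied softmax_loss_function with (labels, logits), which
is what the swapped sampled_loss(labels, logits) signature accounts for.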