diff --git a/assignments/chatbot/data.py b/assignments/chatbot/data.py
index 673087d7..0198fa19 100644
--- a/assignments/chatbot/data.py
+++ b/assignments/chatbot/data.py
@@ -1,7 +1,7 @@
""" A neural chatbot using sequence to sequence model with
-attentional decoder.
+attentional decoder.
-This is based on Google Translate Tensorflow model
+This is based on Google Translate Tensorflow model
https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/
Sequence to sequence model by Cho et al.(2014)
@@ -31,7 +31,7 @@ def get_lines():
with open(file_path, 'rb') as f:
lines = f.readlines()
for line in lines:
- parts = line.split(' +++$+++ ')
+ parts = line.split(b' +++$+++ ')
if len(parts) == 5:
if parts[4][-1] == '\n':
parts[4] = parts[4][:-1]
@@ -44,10 +44,10 @@ def get_convos():
convos = []
with open(file_path, 'rb') as f:
for line in f.readlines():
- parts = line.split(' +++$+++ ')
+ parts = line.split(b' +++$+++ ')
if len(parts) == 4:
convo = []
- for line in parts[3][1:-2].split(', '):
+ for line in parts[3][1:-2].split(b', '):
convo.append(line[1:-1])
convos.append(convo)
@@ -66,10 +66,10 @@ def question_answers(id2line, convos):
def prepare_dataset(questions, answers):
# create path to store all the train & test encoder & decoder
make_dir(config.PROCESSED_PATH)
-
+
# random convos to create the test set
test_ids = random.sample([i for i in range(len(questions))],config.TESTSET_SIZE)
-
+
filenames = ['train.enc', 'train.dec', 'test.enc', 'test.dec']
files = []
for filename in filenames:
@@ -77,11 +77,11 @@ def prepare_dataset(questions, answers):
for i in range(len(questions)):
if i in test_ids:
- files[2].write(questions[i] + '\n')
- files[3].write(answers[i] + '\n')
+ files[2].write(questions[i] + b'\n')
+ files[3].write(answers[i] + b'\n')
else:
- files[0].write(questions[i] + '\n')
- files[1].write(answers[i] + '\n')
+ files[0].write(questions[i] + b'\n')
+ files[1].write(answers[i] + b'\n')
for file in files:
file.close()
@@ -96,13 +96,13 @@ def make_dir(path):
def basic_tokenizer(line, normalize_digits=True):
""" A basic tokenizer to tokenize text into tokens.
Feel free to change this to suit your need. """
- line = re.sub('<u>', '', line)
- line = re.sub('</u>', '', line)
- line = re.sub('\[', '', line)
- line = re.sub('\]', '', line)
+ line = re.sub(b'<u>', b'', line)
+ line = re.sub(b'</u>', b'', line)
+ line = re.sub(b'\[', b'', line)
+ line = re.sub(b'\]', b'', line)
words = []
_WORD_SPLIT = re.compile(b"([.,!?\"'-<>:;)(])")
- _DIGIT_RE = re.compile(r"\d")
+ _DIGIT_RE = re.compile(b"\d")
for fragment in line.strip().lower().split():
for token in re.split(_WORD_SPLIT, fragment):
if not token:
@@ -126,20 +126,20 @@ def build_vocab(filename, normalize_digits=True):
sorted_vocab = sorted(vocab, key=vocab.get, reverse=True)
with open(out_path, 'wb') as f:
- f.write('<pad>' + '\n')
- f.write('<unk>' + '\n')
- f.write('<s>' + '\n')
- f.write('<\s>' + '\n')
+ f.write(b'<pad>' + b'\n')
+ f.write(b'<unk>' + b'\n')
+ f.write(b'<s>' + b'\n')
+ f.write(b'<\s>' + b'\n')
index = 4
for word in sorted_vocab:
if vocab[word] < config.THRESHOLD:
with open('config.py', 'ab') as cf:
if filename[-3:] == 'enc':
- cf.write('ENC_VOCAB = ' + str(index) + '\n')
+ cf.write(b'ENC_VOCAB = ' + str.encode(str(index)) + b'\n')
else:
- cf.write('DEC_VOCAB = ' + str(index) + '\n')
+ cf.write(b'DEC_VOCAB = ' + str.encode(str(index)) + b'\n')
break
- f.write(word + '\n')
+ f.write(word + b'\n')
index += 1
def load_vocab(vocab_path):
@@ -148,7 +148,7 @@ def load_vocab(vocab_path):
return words, {words[i]: i for i in range(len(words))}
def sentence2id(vocab, line):
- return [vocab.get(token, vocab['<unk>']) for token in basic_tokenizer(line)]
+ return [vocab.get(token, vocab[b'<unk>']) for token in basic_tokenizer(str.encode(line))]
def token2id(data, mode):
""" Convert all the tokens in the data into their corresponding
@@ -160,18 +160,18 @@ def token2id(data, mode):
_, vocab = load_vocab(os.path.join(config.PROCESSED_PATH, vocab_path))
in_file = open(os.path.join(config.PROCESSED_PATH, in_path), 'rb')
out_file = open(os.path.join(config.PROCESSED_PATH, out_path), 'wb')
-
+
lines = in_file.read().splitlines()
for line in lines:
if mode == 'dec': # we only care about '<s>' and </s> in encoder
- ids = [vocab['<s>']]
+ ids = [vocab[b'<s>']]
else:
ids = []
ids.extend(sentence2id(vocab, line))
# ids.extend([vocab.get(token, vocab['<unk>']) for token in basic_tokenizer(line)])
if mode == 'dec':
- ids.append(vocab['<\s>'])
- out_file.write(' '.join(str(id_) for id_ in ids) + '\n')
+ ids.append(vocab[b'<\s>'])
+ out_file.write(str.encode(' '.join(str(id_) for id_ in ids)) + b'\n')
def prepare_raw_data():
print('Preparing raw data into train set and test set ...')
@@ -253,4 +253,4 @@ def get_batch(data_bucket, bucket_id, batch_size=1):
if __name__ == '__main__':
prepare_raw_data()
- process_data()
\ No newline at end of file
+ process_data()
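
Note on the data.py changes above: they all follow one Python 3 porting rule. Files opened with 'rb'/'wb' yield and expect bytes, so every literal that touches those streams (the ' +++$+++ ' separator, regex patterns, newlines, vocabulary entries) must be a bytes literal, and any str built at runtime must be encoded before writing. A minimal sketch of that convention — the file name and field layout here are illustrative, not part of the assignment:

    SEPARATOR = b' +++$+++ '                      # bytes pattern, to split bytes lines

    with open('metadata.txt', 'rb') as f:         # binary read: each line is bytes
        for line in f.read().splitlines():
            fields = line.split(SEPARATOR)        # bytes.split(bytes) -> list of bytes

    with open('out.txt', 'wb') as out:            # binary write: expects bytes
        ids = [4, 8, 15]
        out.write(' '.join(str(i) for i in ids).encode() + b'\n')  # str -> bytes before writing

The same reasoning explains vocab[b'<unk>'] and vocab[b'<s>'] above: load_vocab reads the vocabulary file in binary mode, so its keys are bytes objects.
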
diff --git a/assignments/chatbot/model.py b/assignments/chatbot/model.py
index 369c0955..b5a9c4a1 100644
--- a/assignments/chatbot/model.py
+++ b/assignments/chatbot/model.py
@@ -1,7 +1,7 @@
""" A neural chatbot using sequence to sequence model with
-attentional decoder.
+attentional decoder.
-This is based on Google Translate Tensorflow model
+This is based on Google Translate Tensorflow model
https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/
Sequence to sequence model by Cho et al.(2014)
@@ -30,7 +30,7 @@ def __init__(self, forward_only, batch_size):
print('Initialize new model')
self.fw_only = forward_only
self.batch_size = batch_size
-
+
def _create_placeholders(self):
# Feeds for inputs. It's a list of placeholders
print('Create placeholders')
@@ -43,7 +43,7 @@ def _create_placeholders(self):
# Our targets are decoder inputs shifted by one (to ignore <s> symbol)
self.targets = self.decoder_inputs[1:]
-
+
def _inference(self):
print('Create inference')
# If we use sampled softmax, we need an output projection.
@@ -53,9 +53,9 @@ def _inference(self):
b = tf.get_variable('proj_b', [config.DEC_VOCAB])
self.output_projection = (w, b)
- def sampled_loss(inputs, labels):
+ def sampled_loss(labels, logits):
labels = tf.reshape(labels, [-1, 1])
- return tf.nn.sampled_softmax_loss(tf.transpose(w), b, inputs, labels,
+ return tf.nn.sampled_softmax_loss(tf.transpose(w), b, labels, logits,
config.NUM_SAMPLES, config.DEC_VOCAB)
self.softmax_loss_function = sampled_loss
@@ -66,7 +66,7 @@ def _create_loss(self):
print('Creating loss... \nIt might take a couple of minutes depending on how many buckets you have.')
start = time.time()
def _seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
- return tf.nn.seq2seq.embedding_attention_seq2seq(
+ return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
encoder_inputs, decoder_inputs, self.cell,
num_encoder_symbols=config.ENC_VOCAB,
num_decoder_symbols=config.DEC_VOCAB,
@@ -75,24 +75,24 @@ def _seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
feed_previous=do_decode)
if self.fw_only:
- self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
- self.encoder_inputs,
- self.decoder_inputs,
+ self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
+ self.encoder_inputs,
+ self.decoder_inputs,
self.targets,
- self.decoder_masks,
- config.BUCKETS,
+ self.decoder_masks,
+ config.BUCKETS,
lambda x, y: _seq2seq_f(x, y, True),
softmax_loss_function=self.softmax_loss_function)
# If we use output projection, we need to project outputs for decoding.
if self.output_projection:
for bucket in range(len(config.BUCKETS)):
- self.outputs[bucket] = [tf.matmul(output,
+ self.outputs[bucket] = [tf.matmul(output,
self.output_projection[0]) + self.output_projection[1]
for output in self.outputs[bucket]]
else:
- self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
- self.encoder_inputs,
- self.decoder_inputs,
+ self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
+ self.encoder_inputs,
+ self.decoder_inputs,
self.targets,
self.decoder_masks,
config.BUCKETS,
@@ -112,12 +112,12 @@ def _creat_optimizer(self):
self.train_ops = []
start = time.time()
for bucket in range(len(config.BUCKETS)):
-
- clipped_grads, norm = tf.clip_by_global_norm(tf.gradients(self.losses[bucket],
+
+ clipped_grads, norm = tf.clip_by_global_norm(tf.gradients(self.losses[bucket],
trainables),
config.MAX_GRAD_NORM)
self.gradient_norms.append(norm)
- self.train_ops.append(self.optimizer.apply_gradients(zip(clipped_grads, trainables),
+ self.train_ops.append(self.optimizer.apply_gradients(zip(clipped_grads, trainables),
global_step=self.global_step))
print('Creating opt for bucket {} took {} seconds'.format(bucket, time.time() - start))
start = time.time()
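
Note on the model.py changes: between the TF 0.x API this assignment was written against and TF 1.x, the seq2seq helpers moved from tf.nn.seq2seq to tf.contrib.legacy_seq2seq, and tf.nn.sampled_softmax_loss switched its positional signature to (weights, biases, labels, inputs, num_sampled, num_classes), with the loss callback now invoked as fn(labels, logits). The diff adapts to both. A condensed sketch of the updated projection/loss wiring — the sizes below are illustrative placeholders for the values in config.py:

    import tensorflow as tf

    hidden_size, vocab_size, num_samples = 256, 10000, 512
    w = tf.get_variable('proj_w', [hidden_size, vocab_size])
    b = tf.get_variable('proj_b', [vocab_size])

    def sampled_loss(labels, logits):
        # legacy_seq2seq hands the callback (labels, logits); sampled_softmax_loss
        # wants labels as a [batch_size, 1] int tensor and the pre-projection RNN
        # outputs as its 'inputs' argument, with weights shaped [num_classes, dim],
        # hence tf.transpose(w).
        labels = tf.reshape(labels, [-1, 1])
        return tf.nn.sampled_softmax_loss(tf.transpose(w), b, labels, logits,
                                          num_samples, vocab_size)
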