
Commit 552ebfe

Author: Sreehari
Move repo from private to public

0 parents  commit 552ebfe

33 files changed (+5669, -0 lines)

Code/data_helper.py

+129
@@ -0,0 +1,129 @@
import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils import data
from sklearn.model_selection import train_test_split
import json
import time
from torchtext import data, datasets, vocab
from argparse import ArgumentParser
from torch.utils.tensorboard import SummaryWriter

import random, math
from numpy.random import seed
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import random, tqdm, sys, math, gzip
import nlp

# Note: the torchtext import above shadows `from torch.utils import data`,
# so the intended base class is referenced explicitly here.
class Dataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        assert len(inputs) == len(targets), 'Length of inputs and targets should be the same.'
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        return self.inputs[index], self.targets[index]

def create_datasets(arg):
    if (arg.task == 'TOTTO'):
        return create_totto_datasets(arg)
    elif (arg.task == 'cnn_dailymail'):
        return create_cnn_dailymail_datasets(arg)
    else:
        assert False, f'Data fetching for {arg.task} not defined'

# For ToTTo, the reference sentences for the test set are not available.
# So we divide the training set into train and validation splits
# and use the actual validation set as the test set.
def read_file(arg, f):
    with open(f) as fp:
        lines = fp.readlines()
    inputs = []
    targets = []
    for line in lines:
        entry = json.loads(line)
        # print(entry)
        if (arg.input_string == 'raw_input'):
            target = entry['sentence_annotations'][0]['final_sentence']
            entry.pop('sentence_annotations')
            input = json.dumps(entry)
        elif (arg.input_string == 'subtable_str_plus_subtable_metadata_str'):
            target = entry['sentence_annotations'][0]['final_sentence']
            input = entry['subtable_metadata_str'] + ' ' + entry['subtable_str']
        inputs.append(input)
        targets.append(target)
    return inputs, targets

def create_totto_datasets(arg):
    # arg.train_input, arg.development_input
    start = time.time()
    if (arg.toy_dataset):
        print('Using toy dataset..')

    inputs, targets = read_file(arg, arg.train_input)
    if (arg.toy_dataset):
        inputs = inputs[:arg.toy_dataset]
        targets = targets[:arg.toy_dataset]

    num_datapoints = len(inputs)
    val_set_size = max(1, int(0.125 * num_datapoints))
    train, val = train_test_split(inputs, test_size=val_set_size, shuffle=False, random_state=0)
    train_t, val_t = train_test_split(targets, test_size=val_set_size, shuffle=False, random_state=0)

    training_set = Dataset(train, train_t)
    validation_set = Dataset(val, val_t)
    assert len(validation_set) == val_set_size, 'Validation size not matching'
    assert len(training_set) == (num_datapoints - val_set_size), 'Training size not matching'

    inputs, targets = read_file(arg, arg.development_input)
    if (arg.toy_dataset):
        inputs = inputs[:arg.toy_dataset]
        targets = targets[:arg.toy_dataset]
    test_set = Dataset(inputs, targets)
    end = time.time()
    print('Time taken to create datasets is %0.2f mins' % ((end - start) / 60))
    return training_set, validation_set, test_set

def create_cnn_dailymail_datasets(arg):
    def convert(dataset, arg):
        inputs = []
        targets = []
        for i in np.arange(len(dataset)):
            i = int(i)
            # print(dataset[i])
            inputs.append(dataset[i]['article'])
            targets.append(dataset[i]['highlights'])
        if (arg.toy_dataset):
            inputs = inputs[:arg.toy_dataset]
            targets = targets[:arg.toy_dataset]
        return Dataset(inputs, targets)

    train_dataset = nlp.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
    val_dataset = nlp.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]")
    test_dataset = nlp.load_dataset("cnn_dailymail", "3.0.0", split="test[:1%]")

    training_set = convert(train_dataset, arg)
    validation_set = convert(val_dataset, arg)
    test_set = convert(test_dataset, arg)
    return training_set, validation_set, test_set

def create_dataloaders(arg, training_set, validation_set, test_set):
    start = time.time()
    trainloader, valloader, testloader = None, None, None
    if (training_set is not None):
        trainloader = torch.utils.data.DataLoader(training_set, batch_size=arg.batch_size, shuffle=arg.shuffle_train, num_workers=2)
    if (validation_set is not None):
        valloader = torch.utils.data.DataLoader(validation_set, batch_size=arg.batch_size, shuffle=False, num_workers=2)
    if (test_set is not None):
        testloader = torch.utils.data.DataLoader(test_set, batch_size=arg.batch_size, shuffle=False, num_workers=2)
    end = time.time()
    return trainloader, valloader, testloader
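
These helpers are driven by an argparse-style namespace built in run.py (not part of this file). Below is a minimal, hypothetical usage sketch: the field names mirror the attributes accessed above, the file paths point at the processed ToTTo samples committed under Data/, and the import path assumes the Code/ directory is importable.

```
# Hypothetical illustration; the real argument parsing happens in run.py.
from types import SimpleNamespace
from data_helper import create_datasets, create_dataloaders

arg = SimpleNamespace(
    task='TOTTO',                      # or 'cnn_dailymail'
    input_string='subtable_str_plus_subtable_metadata_str',
    train_input='Data/totto_data_small/processed_totto_train_data_1000.jsonl',
    development_input='Data/totto_data_small/processed_totto_dev_data_500.jsonl',
    toy_dataset=100,                   # truncate to 100 examples; falsy value uses the full set
    batch_size=8,
    shuffle_train=True,
)

training_set, validation_set, test_set = create_datasets(arg)
trainloader, valloader, testloader = create_dataloaders(arg, training_set, validation_set, test_set)

for inputs, targets in trainloader:    # each is a batch of raw strings
    print(len(inputs), targets[0])
    break
```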

Code/metrics.py

+24
@@ -0,0 +1,24 @@
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
from datasets import load_metric

def compute_metric(arg, target_sentences, generated_sentences):
    avg_score = None
    if (arg.metric == 'BLEU'):
        avg_score = compute_bleu_scores(target_sentences, generated_sentences)
    elif (arg.metric == 'ROUGE'):
        avg_score = compute_rogue_scores(target_sentences, generated_sentences)
    else:
        assert False, f'{arg.metric} not defined'
    return avg_score

def compute_bleu_scores(target_sentences, generated_sentences):
    # Sentence-level BLEU against a single reference, averaged over the batch.
    bleu_scores = [sentence_bleu([target_sentences[i].split()], generated_sentences[i].split()) for i, sen in enumerate(generated_sentences)]
    return np.mean(bleu_scores)

def compute_rogue_scores(target_sentences, generated_sentences):
    # ROUGE-L mid F-measure from the `datasets` library's 'rouge' metric.
    metric = load_metric('rouge')
    metric.add_batch(predictions=generated_sentences, references=target_sentences)
    score = metric.compute()
    rougeL_f = score['rougeL'].mid.fmeasure
    return rougeL_f
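
A quick sanity check of both metric paths, assuming nltk, datasets, and rouge_score are installed; the sentences are made up for illustration:

```
from types import SimpleNamespace
from metrics import compute_metric

targets = ['the cat sat on the mat near the front door', 'totto is a controlled table to text generation dataset']
preds   = ['the cat sat on the mat by the front door', 'totto is a controlled table to text generation dataset']

bleu = compute_metric(SimpleNamespace(metric='BLEU'), targets, preds)
rouge_l = compute_metric(SimpleNamespace(metric='ROUGE'), targets, preds)
print('BLEU: %.3f, ROUGE-L F1: %.3f' % (bleu, rouge_l))
```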

Data/totto_data_small/processed_totto_dev_data_500.jsonl

+500
Large diffs are not rendered by default.

Data/totto_data_small/processed_totto_train_data_1000.jsonl

+1,000
Large diffs are not rendered by default.

Data/totto_data_small/totto_dev_data_500.jsonl

+500
Large diffs are not rendered by default.

Data/totto_data_small/totto_train_data_1000.jsonl

+1,000
Large diffs are not rendered by default.

Models/Baselines.py

+118
@@ -0,0 +1,118 @@
import torch
from torch import nn
import torch.nn.functional as F

class Bert2Bert(nn.Module):
    def __init__(self):
        super().__init__()
        from transformers import EncoderDecoderModel
        from transformers import BertTokenizer
        # Initialize Bert2Bert from pre-trained bert-base-uncased checkpoints.
        self.seq2seq_model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def get_model_inputs(self, inputs, targets, device):
        tokenizer_op = self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
        encoder_input_ids = tokenizer_op['input_ids'].to(device)
        encoder_attention_mask = tokenizer_op['attention_mask'].to(device)

        tokenizer_op = self.tokenizer(targets, padding=True, truncation=True, return_tensors='pt')
        decoder_inputs = tokenizer_op['input_ids'].to(device)
        # Shift targets: labels drop the leading [CLS], decoder inputs drop the final token.
        labels = decoder_inputs[:, 1:].clone()
        labels[decoder_inputs[:, 1:] == self.tokenizer.pad_token_id] = -100  # ignore padding in the loss
        decoder_input_ids = decoder_inputs[:, :-1].clone()
        decoder_attention_mask = tokenizer_op['attention_mask'].to(device)
        decoder_attention_mask = decoder_attention_mask[:, :-1]
        model_inputs = {
            'input_ids': encoder_input_ids,
            'attention_mask': encoder_attention_mask,
            'decoder_input_ids': decoder_input_ids,
            'labels': labels,
            'decoder_attention_mask': decoder_attention_mask,
            'return_dict': True
        }
        return model_inputs

    def get_loss(self, outputs):
        loss, logits = outputs.loss, outputs.logits
        return loss

    def generate(self, inputs, device):
        tokenizer_op = self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
        encoder_input_ids = tokenizer_op['input_ids'].to(device)
        encoder_attention_mask = tokenizer_op['attention_mask'].to(device)
        model_inputs = {
            'input_ids': encoder_input_ids,
            'attention_mask': encoder_attention_mask,
            'bos_token_id': self.tokenizer.cls_token_id,
            'eos_token_id': self.tokenizer.sep_token_id,
            'pad_token_id': self.tokenizer.sep_token_id,
            'num_beams': 2,
            'early_stopping': True,
            # 'no_repeat_ngram_size': 3,
            'repetition_penalty': 2.5,
            # 'length_penalty': 1.0,
            'max_length': 50
        }
        pred_ids = self.seq2seq_model.generate(**model_inputs)
        return self.tokenizer.batch_decode(pred_ids)

    def forward(self, **model_inputs):
        # print(model_inputs)
        return self.seq2seq_model(**model_inputs)

class T5(nn.Module):
    def __init__(self):
        super().__init__()
        PRETRAINED_MODEL_NAME = 't5-small'
        from transformers import T5Tokenizer, T5ForConditionalGeneration

        self.t5_tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
        self.t5_model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL_NAME, return_dict=True)

    def get_model_inputs(self, inputs, targets, device):
        # Inputs
        tokenizer_op = self.t5_tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
        encoder_input_ids = tokenizer_op['input_ids'].to(device)
        encoder_attention_mask = tokenizer_op['attention_mask'].to(device)

        # Labels (T5 shifts them internally to build the decoder inputs)
        tokenizer_op = self.t5_tokenizer(targets, padding=True, truncation=True, return_tensors='pt')
        decoder_input_ids = tokenizer_op['input_ids'].to(device)

        # Model inputs
        model_inputs = {
            'input_ids': encoder_input_ids,
            'attention_mask': encoder_attention_mask,
            'labels': decoder_input_ids,
            'return_dict': True
        }

        return model_inputs

    def get_loss(self, outputs):
        return outputs.loss

    def generate(self, inputs, device):
        tokenizer_op = self.t5_tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
        encoder_input_ids = tokenizer_op['input_ids'].to(device)
        encoder_attention_mask = tokenizer_op['attention_mask'].to(device)
        model_inputs = {
            'input_ids': encoder_input_ids,
            'attention_mask': encoder_attention_mask,
            # The T5 tokenizer has no cls/sep tokens, so these resolve to None and
            # generate() falls back to the special token ids in the model config.
            'bos_token_id': self.t5_tokenizer.cls_token_id,
            'eos_token_id': self.t5_tokenizer.sep_token_id,
            'pad_token_id': self.t5_tokenizer.sep_token_id,
            'num_beams': 2,
            'early_stopping': True,
            # 'no_repeat_ngram_size': 3,
            'repetition_penalty': 2.5,
            # 'length_penalty': 1.0,
            'max_length': 50
        }
        pred_ids = self.t5_model.generate(**model_inputs)
        return self.t5_tokenizer.batch_decode(pred_ids)

    def forward(self, **model_inputs):
        return self.t5_model(**model_inputs)
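
Both wrappers expose the same four-method interface (get_model_inputs, forward, get_loss, generate), so a training step can be written model-agnostically. The sketch below is a hypothetical single step; the actual loop lives in the trainer script referenced in the README, and the optimizer, learning rate, and example strings here are illustrative assumptions.

```
import torch
from Baselines import T5  # or Bert2Bert; both expose the same interface

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = T5().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # illustrative hyperparameters

# One batch of raw strings, e.g. straight out of the DataLoaders built in data_helper.py.
inputs = ['<page_title> Example </page_title> <cell> 42 </cell>']
targets = ['The example value is 42.']

model.train()
model_inputs = model.get_model_inputs(inputs, targets, device)
outputs = model(**model_inputs)
loss = model.get_loss(outputs)
optimizer.zero_grad()
loss.backward()
optimizer.step()

# Beam-search decoding through the wrapper's generate() helper.
model.eval()
with torch.no_grad():
    print(model.generate(inputs, device))
```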

README.md

+36
@@ -0,0 +1,36 @@
# table-to-text-generation

Use the following command to run the training script:

```
python run.py --mode train \
  --model Bert2Bert \
  --num_epochs 2 \
  --cuda \
  --batch_size 8 \
  --train_input '/content/drive/My Drive/DLT/table-to-text-generation/Data/totto_data_small/totto_train_data_1000.jsonl' \
  --development_input '/content/drive/My Drive/DLT/table-to-text-generation/Data/totto_data_small/totto_dev_data_500.jsonl'
```

To run the T5 model, simply run:

```
bash train.sh
```

This invokes the trainer.py script with T5 as the model and the processed data files (change the data file locations to match your setup).

Code:

Preprocessing and evaluation scripts are borrowed from the [ToTTo repository](https://github.com/google-research/language/tree/master/language/totto).

References:

```
@article{parikh2020totto,
  title={ToTTo: A Controlled Table-To-Text Generation Dataset},
  author={Parikh, Ankur P and Wang, Xuezhi and Gehrmann, Sebastian and Faruqui, Manaal and Dhingra, Bhuwan and Yang, Diyi and Das, Dipanjan},
  journal={arXiv preprint arXiv:2004.14373},
  year={2020}
}
```

__init__.py

+15
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

baseline_preprocessing/__init__.py

+15
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
