
Commit 552ebfe

Author: Sreehari
Move repo from private to public

0 parents  commit 552ebfe

33 files changed (+5669, -0 lines)

Code/data_helper.py

+129
@@ -0,0 +1,129 @@
import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils import data
from sklearn.model_selection import train_test_split
import json
import time
from torchtext import data, datasets, vocab
from argparse import ArgumentParser
from torch.utils.tensorboard import SummaryWriter

import random, math
from numpy.random import seed
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import random, tqdm, sys, math, gzip
import nlp

# Note: the torchtext import above shadows `from torch.utils import data`,
# so the intended base class is referenced explicitly here.
class Dataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        assert len(inputs) == len(targets), 'Length of inputs and targets should be the same.'
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        return self.inputs[index], self.targets[index]

def create_datasets(arg):
    if (arg.task == 'TOTTO'):
        return create_totto_datasets(arg)
    elif (arg.task == 'cnn_dailymail'):
        return create_cnn_dailymail_datasets(arg)
    else:
        assert False, f'Data fetching for {arg.task} not defined'

# For ToTTo, the reference sentences for the test set are not available.
# So we divide the training set into train and validation splits
# and use the actual validation set as the test set.
def read_file(arg, f):
    with open(f) as fp:
        lines = fp.readlines()
    inputs = []
    targets = []
    for line in lines:
        entry = json.loads(line)
        # print(entry)
        if (arg.input_string == 'raw_input'):
            target = entry['sentence_annotations'][0]['final_sentence']
            entry.pop('sentence_annotations')
            input = json.dumps(entry)
        elif (arg.input_string == 'subtable_str_plus_subtable_metadata_str'):
            target = entry['sentence_annotations'][0]['final_sentence']
            input = entry['subtable_metadata_str'] + ' ' + entry['subtable_str']
        inputs.append(input)
        targets.append(target)
    return inputs, targets

def create_totto_datasets(arg):
    # arg.train_input, arg.development_input
    start = time.time()
    if (arg.toy_dataset):
        print('Using toy dataset..')

    inputs, targets = read_file(arg, arg.train_input)
    if (arg.toy_dataset):
        inputs = inputs[:arg.toy_dataset]
        targets = targets[:arg.toy_dataset]

    num_datapoints = len(inputs)
    val_set_size = max(1, int(0.125 * num_datapoints))
    train, val = train_test_split(inputs, test_size=val_set_size, shuffle=False, random_state=0)
    train_t, val_t = train_test_split(targets, test_size=val_set_size, shuffle=False, random_state=0)

    training_set = Dataset(train, train_t)
    validation_set = Dataset(val, val_t)
    assert len(validation_set) == val_set_size, 'Validation size not matching'
    assert len(training_set) == (num_datapoints - val_set_size), 'Training size not matching'

    inputs, targets = read_file(arg, arg.development_input)
    if (arg.toy_dataset):
        inputs = inputs[:arg.toy_dataset]
        targets = targets[:arg.toy_dataset]
    test_set = Dataset(inputs, targets)
    end = time.time()
    print('Time taken to create datasets is %0.2f mins' % ((end - start) / 60))
    return training_set, validation_set, test_set

def create_cnn_dailymail_datasets(arg):
    def convert(dataset, arg):
        inputs = []
        targets = []
        for i in np.arange(len(dataset)):
            i = int(i)
            # print(dataset[i])
            inputs.append(dataset[i]['article'])
            targets.append(dataset[i]['highlights'])
        if (arg.toy_dataset):
            inputs = inputs[:arg.toy_dataset]
            targets = targets[:arg.toy_dataset]
        return Dataset(inputs, targets)

    train_dataset = nlp.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
    val_dataset = nlp.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]")
    test_dataset = nlp.load_dataset("cnn_dailymail", "3.0.0", split="test[:1%]")

    training_set = convert(train_dataset, arg)
    validation_set = convert(val_dataset, arg)
    test_set = convert(test_dataset, arg)
    return training_set, validation_set, test_set

def create_dataloaders(arg, training_set, validation_set, test_set):
    start = time.time()
    trainloader, valloader, testloader = None, None, None
    if (training_set is not None):
        trainloader = torch.utils.data.DataLoader(training_set, batch_size=arg.batch_size, shuffle=arg.shuffle_train, num_workers=2)
    if (validation_set is not None):
        valloader = torch.utils.data.DataLoader(validation_set, batch_size=arg.batch_size, shuffle=False, num_workers=2)
    if (test_set is not None):
        testloader = torch.utils.data.DataLoader(test_set, batch_size=arg.batch_size, shuffle=False, num_workers=2)
    end = time.time()
    return trainloader, valloader, testloader
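
These helpers are driven by an argparse-style namespace built in run.py (not part of this file). Below is a minimal, hypothetical usage sketch: the field names mirror the attributes accessed above, the file paths point at the processed ToTTo samples committed under Data/, and the import path assumes the Code/ directory is importable.

```
# Hypothetical illustration; the real argument parsing happens in run.py.
from types import SimpleNamespace
from data_helper import create_datasets, create_dataloaders

arg = SimpleNamespace(
    task='TOTTO',                      # or 'cnn_dailymail'
    input_string='subtable_str_plus_subtable_metadata_str',
    train_input='Data/totto_data_small/processed_totto_train_data_1000.jsonl',
    development_input='Data/totto_data_small/processed_totto_dev_data_500.jsonl',
    toy_dataset=100,                   # truncate to 100 examples; falsy value uses the full set
    batch_size=8,
    shuffle_train=True,
)

training_set, validation_set, test_set = create_datasets(arg)
trainloader, valloader, testloader = create_dataloaders(arg, training_set, validation_set, test_set)

for inputs, targets in trainloader:    # each is a batch of raw strings
    print(len(inputs), targets[0])
    break
```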

Code/metrics.py

+24
@@ -0,0 +1,24 @@
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
from datasets import load_metric

def compute_metric(arg, target_sentences, generated_sentences):
    avg_score = None
    if (arg.metric == 'BLEU'):
        avg_score = compute_bleu_scores(target_sentences, generated_sentences)
    elif (arg.metric == 'ROUGE'):
        avg_score = compute_rogue_scores(target_sentences, generated_sentences)
    else:
        assert False, f'{arg.metric} not defined'
    return avg_score

def compute_bleu_scores(target_sentences, generated_sentences):
    # Sentence-level BLEU against a single reference, averaged over the batch.
    bleu_scores = [sentence_bleu([target_sentences[i].split()], generated_sentences[i].split()) for i, sen in enumerate(generated_sentences)]
    return np.mean(bleu_scores)

def compute_rogue_scores(target_sentences, generated_sentences):
    # ROUGE-L mid F-measure from the `datasets` library's 'rouge' metric.
    metric = load_metric('rouge')
    metric.add_batch(predictions=generated_sentences, references=target_sentences)
    score = metric.compute()
    rougeL_f = score['rougeL'].mid.fmeasure
    return rougeL_f
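
A quick sanity check of both metric paths, assuming nltk, datasets, and rouge_score are installed; the sentences are made up for illustration:

```
from types import SimpleNamespace
from metrics import compute_metric

targets = ['the cat sat on the mat near the front door', 'totto is a controlled table to text generation dataset']
preds   = ['the cat sat on the mat by the front door', 'totto is a controlled table to text generation dataset']

bleu = compute_metric(SimpleNamespace(metric='BLEU'), targets, preds)
rouge_l = compute_metric(SimpleNamespace(metric='ROUGE'), targets, preds)
print('BLEU: %.3f, ROUGE-L F1: %.3f' % (bleu, rouge_l))
```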

Data/totto_data_small/processed_totto_dev_data_500.jsonl

+500
Large diffs are not rendered by default.

Data/totto_data_small/processed_totto_train_data_1000.jsonl

+1,000
Large diffs are not rendered by default.

Data/totto_data_small/totto_dev_data_500.jsonl

+500
Large diffs are not rendered by default.

Data/totto_data_small/totto_train_data_1000.jsonl

+1,000
Large diffs are not rendered by default.

Models/Baselines.py

+118
@@ -0,0 +1,118 @@
import torch
from torch import nn
import torch.nn.functional as F

class Bert2Bert(nn.Module):
    def __init__(self):
        super().__init__()
        from transformers import EncoderDecoderModel
        from transformers import BertTokenizer
        # Initialize Bert2Bert from pre-trained bert-base-uncased checkpoints.
        self.seq2seq_model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def get_model_inputs(self, inputs, targets, device):
        tokenizer_op = self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
        encoder_input_ids = tokenizer_op['input_ids'].to(device)
        encoder_attention_mask = tokenizer_op['attention_mask'].to(device)

        tokenizer_op = self.tokenizer(targets, padding=True, truncation=True, return_tensors='pt')
        decoder_inputs = tokenizer_op['input_ids'].to(device)
        # Shift targets: labels drop the leading [CLS], decoder inputs drop the final token.
        labels = decoder_inputs[:, 1:].clone()
        labels[decoder_inputs[:, 1:] == self.tokenizer.pad_token_id] = -100  # ignore padding in the loss
        decoder_input_ids = decoder_inputs[:, :-1].clone()
        decoder_attention_mask = tokenizer_op['attention_mask'].to(device)
        decoder_attention_mask = decoder_attention_mask[:, :-1]
        model_inputs = {
            'input_ids': encoder_input_ids,
            'attention_mask': encoder_attention_mask,
            'decoder_input_ids': decoder_input_ids,
            'labels': labels,
            'decoder_attention_mask': decoder_attention_mask,
            'return_dict': True
        }
        return model_inputs

    def get_loss(self, outputs):
        loss, logits = outputs.loss, outputs.logits
        return loss

    def generate(self, inputs, device):
        tokenizer_op = self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
        encoder_input_ids = tokenizer_op['input_ids'].to(device)
        encoder_attention_mask = tokenizer_op['attention_mask'].to(device)
        model_inputs = {
            'input_ids': encoder_input_ids,
            'attention_mask': encoder_attention_mask,
            'bos_token_id': self.tokenizer.cls_token_id,
            'eos_token_id': self.tokenizer.sep_token_id,
            'pad_token_id': self.tokenizer.sep_token_id,
            'num_beams': 2,
            'early_stopping': True,
            # 'no_repeat_ngram_size': 3,
            'repetition_penalty': 2.5,
            # 'length_penalty': 1.0,
            'max_length': 50
        }
        pred_ids = self.seq2seq_model.generate(**model_inputs)
        return self.tokenizer.batch_decode(pred_ids)

    def forward(self, **model_inputs):
        # print(model_inputs)
        return self.seq2seq_model(**model_inputs)

class T5(nn.Module):
    def __init__(self):
        super().__init__()
        PRETRAINED_MODEL_NAME = 't5-small'
        from transformers import T5Tokenizer, T5ForConditionalGeneration

        self.t5_tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
        self.t5_model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL_NAME, return_dict=True)

    def get_model_inputs(self, inputs, targets, device):
        # Inputs
        tokenizer_op = self.t5_tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
        encoder_input_ids = tokenizer_op['input_ids'].to(device)
        encoder_attention_mask = tokenizer_op['attention_mask'].to(device)

        # Labels (T5 shifts them internally to build the decoder inputs)
        tokenizer_op = self.t5_tokenizer(targets, padding=True, truncation=True, return_tensors='pt')
        decoder_input_ids = tokenizer_op['input_ids'].to(device)

        # Model inputs
        model_inputs = {
            'input_ids': encoder_input_ids,
            'attention_mask': encoder_attention_mask,
            'labels': decoder_input_ids,
            'return_dict': True
        }

        return model_inputs

    def get_loss(self, outputs):
        return outputs.loss

    def generate(self, inputs, device):
        tokenizer_op = self.t5_tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
        encoder_input_ids = tokenizer_op['input_ids'].to(device)
        encoder_attention_mask = tokenizer_op['attention_mask'].to(device)
        model_inputs = {
            'input_ids': encoder_input_ids,
            'attention_mask': encoder_attention_mask,
            # The T5 tokenizer has no cls/sep tokens, so these resolve to None and
            # generate() falls back to the special token ids in the model config.
            'bos_token_id': self.t5_tokenizer.cls_token_id,
            'eos_token_id': self.t5_tokenizer.sep_token_id,
            'pad_token_id': self.t5_tokenizer.sep_token_id,
            'num_beams': 2,
            'early_stopping': True,
            # 'no_repeat_ngram_size': 3,
            'repetition_penalty': 2.5,
            # 'length_penalty': 1.0,
            'max_length': 50
        }
        pred_ids = self.t5_model.generate(**model_inputs)
        return self.t5_tokenizer.batch_decode(pred_ids)

    def forward(self, **model_inputs):
        return self.t5_model(**model_inputs)
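
Both wrappers expose the same four-method interface (get_model_inputs, forward, get_loss, generate), so a training step can be written model-agnostically. The sketch below is a hypothetical single step; the actual loop lives in the trainer script referenced in the README, and the optimizer, learning rate, and example strings here are illustrative assumptions.

```
import torch
from Baselines import T5  # or Bert2Bert; both expose the same interface

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = T5().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # illustrative hyperparameters

# One batch of raw strings, e.g. straight out of the DataLoaders built in data_helper.py.
inputs = ['<page_title> Example </page_title> <cell> 42 </cell>']
targets = ['The example value is 42.']

model.train()
model_inputs = model.get_model_inputs(inputs, targets, device)
outputs = model(**model_inputs)
loss = model.get_loss(outputs)
optimizer.zero_grad()
loss.backward()
optimizer.step()

# Beam-search decoding through the wrapper's generate() helper.
model.eval()
with torch.no_grad():
    print(model.generate(inputs, device))
```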

README.md

+36
@@ -0,0 +1,36 @@
# table-to-text-generation

Use the following command to run the training script:

```
python run.py --mode train \
  --model Bert2Bert \
  --num_epochs 2 \
  --cuda \
  --batch_size 8 \
  --train_input '/content/drive/My Drive/DLT/table-to-text-generation/Data/totto_data_small/totto_train_data_1000.jsonl' \
  --development_input '/content/drive/My Drive/DLT/table-to-text-generation/Data/totto_data_small/totto_dev_data_500.jsonl'
```

To run the T5 model, simply run:

```
bash train.sh
```

This invokes the trainer.py script with T5 as the model and the processed data files (change the data file locations to match your setup).

Code:

Preprocessing and evaluation scripts are borrowed from the [ToTTo repository](https://github.com/google-research/language/tree/master/language/totto).

References:

```
@article{parikh2020totto,
  title={ToTTo: A Controlled Table-To-Text Generation Dataset},
  author={Parikh, Ankur P and Wang, Xuezhi and Gehrmann, Sebastian and Faruqui, Manaal and Dhingra, Bhuwan and Yang, Diyi and Das, Dipanjan},
  journal={arXiv preprint arXiv:2004.14373},
  year={2020}
}
```

__init__.py

+15
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

baseline_preprocessing/__init__.py

+15
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
