Commit 479dead: Add eval method
1 parent be244d0

14 files changed: +387, -98 lines

README.md

+14
@@ -85,6 +85,8 @@ Code-Prediction-Transformer is CC-BY-NC 4.0 (Attr Non-Commercial Inter.) (e.g.,
 ## Vanilla
 
+### Preprocessing
+
 1. generate_new_trees (nodes only have type/value): `python generate_new_trees.py -i PY150 -o NEW_TREES.json`
 2. generate_data (splitting and preorder traversal): `python models/trav_trans/generate_data.py -a NEW_TREES.json -o DPS.TXT`
 3. generate_vocab (generate vocab files): `python generate_vocab.py -i NEW_TREES.json -o VOCAB.pkl -t ast`
@@ -93,6 +95,18 @@ Code-Prediction-Transformer is CC-BY-NC 4.0 (Attr Non-Commercial Inter.) (e.g.,
 5. Use torch.utils.data.DataLoader to pull batches from the Dataset, using its collate function: `dataloader = torch.utils.data.DataLoader(dataset, batch_size=X, collate_fn=lambda b: dataset.collate(b, setup.vocab.pad_idx))`
 6. Iterate through the batches and feed them to the model (see the sketch below)
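A minimal sketch of steps 5-6 above, mirroring how `evaluate.py` (added in this commit) wires up `Setup`, `Dataset.collate`, and the batch keys; the file paths and the loop body are illustrative, not code from the repo:

```python
import torch
from models.trav_trans import dataset

# Paths are hypothetical, following the output/ naming used by preprocess.py;
# evaluate.py passes mode="test", the default mode is assumed to load training data.
setup = dataset.Setup("output", "output/train_dps.txt", "output/train_ids.txt")

dataloader = torch.utils.data.DataLoader(
    setup.dataset,
    batch_size=4,
    collate_fn=lambda b: dataset.Dataset.collate(b, setup.vocab.pad_idx),
)

for batch in dataloader:
    x = batch["input_seq"]   # batch keys as read in evaluate.py
    y = batch["target_seq"]
    # ... feed x and y to the model, e.g. loss = model(x, y, ..., return_loss=True)
```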

+### Evaluation
+
+1. Iterate through the test dataset
+2. For each batch, get the leaf_ids from "ids.txt"/"leaf_ids", which lists every type node that has a value leaf node
+3. Make a model prediction at id-1 to predict the type, then at id to predict the value
+4. Check for "special" nodes, e.g. type "attr" belongs to the special type "attribute access" rather than plain leaf-node prediction
+   - Attribute access: `attr`
+   - Numeric constant: an arithmetic expression (`expr`) that is a numeric constant, `Const`
+   - Name (variable, module): `NameLoad`/`NameStore`
+   - Function parameter name: `NameParam`
+5. Calculate the MRR for all predictions, overall and broken down by the four special types
 
 ## HuggingFace
 
 1. generate_new_trees (nodes only have type/value)
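For step 5, a minimal sketch of the reciprocal-rank bookkeeping (hypothetical helpers, not code from this commit): each prediction contributes 1/rank of the true token within the top-k list, or 0 if it is absent, and the MRR is the mean of those values.

```python
def reciprocal_rank(true_idx, top_k_indices):
    # 1-based rank of true_idx within the top-k predictions, as 1/rank
    for rank, idx in enumerate(top_k_indices, start=1):
        if idx == true_idx:
            return 1.0 / rank
    return 0.0  # true token not in the top k

def mrr(reciprocal_ranks):
    # mean reciprocal rank over a list of per-prediction values
    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0.0
```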

__pycache__/model.cpython-38.pyc

0 Bytes
Binary file not shown.

evaluate.py

+77
@@ -0,0 +1,77 @@

import argparse

import torch
from tqdm import tqdm

import model
from models.trav_trans import dataset


def generate_test(model, context, device, depth=2, top_k=10):
    # Feed the context through the model and return the top-k
    # (values, indices) of the next-token distribution.
    model.eval()
    with torch.no_grad():
        context = torch.tensor(context).to(device)
        output = model(context, None)[-1]
        top_k_values, top_k_indices = torch.topk(output, top_k)
    return top_k_values, top_k_indices


def main():
    parser = argparse.ArgumentParser(description="Evaluate GPT2 Model")
    parser.add_argument("--model", help="Specify the model file")
    parser.add_argument("--dps", help="Specify the data file (dps) on which the model should be tested")
    parser.add_argument("--ids", help="Specify the data file (ids) on which the model should be tested")
    parser.add_argument("--vocab", help="Specify the vocab file")
    parser.add_argument("--batch_size", default=1, type=int, help="Specify the batch size")

    args = parser.parse_args()

    setup = dataset.Setup("output", args.dps, args.ids, mode="test")

    m = model.from_file(args.model, setup.vocab)

    dataloader = torch.utils.data.DataLoader(
        setup.dataset,
        batch_size=args.batch_size,
        collate_fn=lambda b: dataset.Dataset.collate(b, setup.vocab.pad_idx),
    )

    eval(m, dataloader)


def eval(model, dataloader):
    print("Evaluating {} batches".format(len(dataloader)))
    # Reciprocal ranks, overall and per special type; only the overall
    # bucket is filled so far, the per-type buckets are still unused.
    reciprocal_rank = {
        "all_leaf_tokens": [],
        "attribute_access": [],
        "numeric_constant": [],
        "variable_name": [],
        "function_parameter_name": [],
    }

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()
    for i, batch in tqdm(enumerate(dataloader)):
        if i % 100 == 0:
            print("Batch {}".format(i))
        x = batch["input_seq"][0]
        y = batch["target_seq"][0]
        ids = batch["ids"]["leaf_ids"]

        for id in ids:
            if id > 0:
                y_type = x[id].item()
                y_value = y[id].item()

                with torch.no_grad():
                    # The context up to id-1 predicts the type token at
                    # position id; the context up to id predicts the value.
                    y_type_pred = generate_test(model, x[:id].tolist(), device)
                    y_value_pred = generate_test(model, x[: id + 1].tolist(), device)

                # Reciprocal rank: 1 / (1-based position of the true token
                # in the top-k predictions), 0 if it is not in the top k.
                type_rank = 0
                value_rank = 0
                if y_type in y_type_pred[1]:
                    type_rank = 1 / ((y_type_pred[1] == y_type).nonzero(as_tuple=True)[0].item() + 1)
                if y_value in y_value_pred[1]:
                    value_rank = 1 / ((y_value_pred[1] == y_value).nonzero(as_tuple=True)[0].item() + 1)
                reciprocal_rank["all_leaf_tokens"].append((type_rank + value_rank) / 2)


if __name__ == "__main__":
    main()
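A plausible invocation of the script (file names hypothetical, following the output/ naming used by the preprocessing scripts; model-8.pt appears in the original code):

```
python evaluate.py --model output/model-8.pt --dps output/test_dps.txt --ids output/test_ids.txt --batch_size 1
```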

generate.py

-13
This file was deleted.

generate_graph.py

+2-2
@@ -10,9 +10,9 @@ def addChildren(i, data, graph):
         if "value" in data[c] and "type" in data[c]:
             graph.add_node(pydot.Node(c, label=data[c]["type"] + "\n{}".format(data[c]["value"])))
         elif "value" in data[c] and not "type" in data[c]:
-            graph.add_node(pydot.Node(c, label=data[c]["value"]))
+            graph.add_node(pydot.Node(c, label="{}\n".format(c) + data[c]["value"]))
         elif "value" not in data[c] and "type" in data[c]:
-            graph.add_node(pydot.Node(c, label=data[c]["type"]))
+            graph.add_node(pydot.Node(c, label="{}\n".format(c) + data[c]["type"]))
         graph.add_edge(pydot.Edge(i, c, color="blue"))
         addChildren(c, data, graph)
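With this change, value-only and type-only nodes are labeled with their node index in addition to their text, so identical labels no longer collapse visually. A small sketch of how `addChildren` might be driven (the three-node AST is made up, and it assumes `addChildren` walks `data[i]["children"]`; `pydot.Dot`/`write_png` are the standard pydot calls):

```python
import pydot
from generate_graph import addChildren  # assumed importable

# Hypothetical py150-style AST: a list of node dicts with type/value/children.
data = [
    {"type": "Module", "children": [1]},
    {"type": "NameStore", "children": [2]},
    {"value": "x", "children": []},
]

graph = pydot.Dot(graph_type="digraph")
graph.add_node(pydot.Node(0, label=data[0]["type"]))
addChildren(0, data, graph)
graph.write_png("ast.png")
```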

generate_new_trees.py

+11
@@ -47,6 +47,17 @@ def convert(ast):
     assert len(children) == len(set(children))
     return new_dp
 
+def external(file_path, suffix):
+    outfile = "output/{}_new_trees.json".format(suffix)
+    if os.path.exists(outfile):
+        os.remove(outfile)
+    logging.info("Loading asts from: {}".format(file_path))
+    with open(file_path, "r") as f, open(outfile, "w") as fout:
+        for line in file_tqdm(f):
+            dp = json.loads(line.strip())
+            print(json.dumps(convert(dp)), file=fout)
+    logging.info("Wrote dps to: {}".format(outfile))
+
 
 def main():
     parser = argparse.ArgumentParser(description="Generate datapoints from AST")

generate_vocab.py

+30
@@ -29,6 +29,36 @@ def get_value(line, input_type):
     elif input_type == "source_code":
         return line[0]
 
+def external(file_path, n_vocab):
+    outfile = "output/vocab.pkl"
+    logging.info("Reading from: {}".format(file_path))
+    vocab = Counter()
+    with open(file_path, "r") as f:
+        for line in file_tqdm(f):
+            vocab.update(get_value(json.loads(line.strip()), "ast"))
+    vocab_to_keep = [i[0] for i in vocab.most_common(n_vocab)]
+    top_total = sum(i[1] for i in vocab.most_common(n_vocab))
+    total = sum(vocab.values())
+
+    logging.info("Total # of vocab: {}".format(len(vocab)))
+    logging.info(
+        "Using {} top vocab covers: {:.2f}% of the entire dataset".format(
+            n_vocab, 100 * top_total / total
+        )
+    )
+    logging.info("Top 10 most common vocab:")
+    for v, i in vocab.most_common(10):
+        print(v, i)
+
+    # add unk and pad tokens
+    vocab_to_keep.append(UNK)
+    vocab_to_keep.append(PAD)
+    logging.info("Added {} and {}".format(UNK, PAD))
+
+    # dump vocab to file
+    with open(outfile, "wb") as fout:
+        pickle.dump(vocab_to_keep, fout)
+    logging.info("Wrote {} vocab to: {}".format(len(vocab_to_keep), outfile))
 
 
 def main():
     parser = argparse.ArgumentParser(description="Create vocab for py150 dataset")

generator.py

-30
This file was deleted.
Binary file not shown.

models/trav_trans/generate_ast_ids.py

+26
@@ -70,6 +70,32 @@ def get_type_ids(ast):
     return ids
 
 
+def external(file_path, suffix, n_ctx, id_type="leaf"):
+    outfile = "output/{}_ids.txt".format(suffix)
+
+    if os.path.exists(outfile):
+        os.remove(outfile)
+    logging.info("Type of id to get: {}".format(id_type))
+
+    logging.info("Loading dps from: {}".format(file_path))
+    with open(file_path, "r") as f, open(outfile, "w") as fout:
+        for line in file_tqdm(f):
+            dp = json.loads(line.strip())
+            asts = separate_dps(dp, n_ctx)
+            for ast, _ in asts:
+                ids = {}
+                if len(ast) > 1:
+                    if id_type in {"leaf", "all"}:
+                        ids.update(get_leaf_ids(ast))
+                    if id_type in {"value", "all"}:
+                        ids.update(get_value_ids(ast))
+                    if id_type in {"type", "all"}:
+                        ids.update(get_type_ids(ast))
+
+                json.dump(ids, fp=fout)
+                fout.write("\n")
+    logging.info("Wrote to: {}".format(outfile))
+
 def main():
     parser = argparse.ArgumentParser(
         description="Generate ids (leaf, values, types) from AST"
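Each line of the resulting `{suffix}_ids.txt` holds one JSON object per (sliced) AST. Given that `evaluate.py` reads `batch["ids"]["leaf_ids"]`, `get_leaf_ids` presumably returns a dict shaped like `{"leaf_ids": [...]}`, listing the positions of type nodes that carry a value leaf; that is an inference from the calling code, which this diff does not show.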

models/trav_trans/generate_data.py

+19
@@ -15,6 +15,25 @@
 
 logging.basicConfig(level=logging.INFO)
 
+def external(file_path, suffix, context_size):
+    outfile = "output/{}_dps.txt".format(suffix)
+    if os.path.exists(outfile):
+        os.remove(outfile)
+    logging.info("Context size: {}".format(context_size))
+
+    num_dps = 0
+    logging.info("Loading asts from: {}".format(file_path))
+    with open(file_path, "r") as f, open(outfile, "w") as fout:
+        for line in file_tqdm(f):
+            dp = json.loads(line.strip())
+            asts = separate_dps(dp, context_size)
+            for ast, extended in asts:
+                if len(ast) > 1:
+                    json.dump([get_dfs(ast), extended], fp=fout)
+                    fout.write("\n")
+                    num_dps += 1
+
+    logging.info("Wrote {} datapoints to {}".format(num_dps, outfile))
 
 
 def main():
     parser = argparse.ArgumentParser(description="Generate datapoints from AST")

notebook.ipynb

+176-53
Large diffs are not rendered by default.

preprocess.py

+29
@@ -0,0 +1,29 @@

import os
import argparse

import generate_new_trees
import generate_vocab
from models.trav_trans import generate_data, generate_ast_ids


def main():
    parser = argparse.ArgumentParser(description="Preprocess py150 train and eval files")
    parser.add_argument("--file_path", help="Specify py150 file path")
    parser.add_argument("--suffix", help="Specify suffix to distinguish between train/val/test files")
    parser.add_argument("--context_size", default=1000, type=int, help="Specify context size for slicing larger ASTs")
    parser.add_argument("--generate_vocab", action="store_true", help="Specify whether or not to generate a vocab file")
    parser.add_argument("--n_vocab", default=100000, type=int, help="Specify the vocab size")

    args = parser.parse_args()

    # Generate new trees
    generate_new_trees.external(args.file_path, args.suffix)
    # Generate DPS
    generate_data.external("output/{}_new_trees.json".format(args.suffix), args.suffix, args.context_size)
    # Generate vocab
    if args.generate_vocab:
        generate_vocab.external("output/{}_new_trees.json".format(args.suffix), args.n_vocab)
    # Generate AST ids
    generate_ast_ids.external("output/{}_new_trees.json".format(args.suffix), args.suffix, args.context_size)


if __name__ == "__main__":
    main()
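Run once per split; a plausible invocation for the py150 training file (the input path is hypothetical, and `--generate_vocab` is a boolean flag):

```
python preprocess.py --file_path py150/python100k_train.json --suffix train --generate_vocab --n_vocab 100000
```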

trainer.py

+3
@@ -48,11 +48,14 @@ def train(self):
             loss = self.model(x, y, ext, return_loss = True)
             loss.backward()
             if batch_counter % 8 == 0:
+                # Gradients accumulated over the past 8 iterations; step and reset
                 self.optimizer.step()
                 self.optimizer.zero_grad()
                 self.model.zero_grad()
+            # Every 100 batches, record the loss
             if batch_counter % 100 == 0:
                 losses.append([epoch, i, loss.item()])
+            # Every 1,000 batches, print current metrics and save the losses file
             if batch_counter % 1000 == 0:
                 print("Epoch {}, It. {}/{}, Loss {}".format(epoch, i, self.dataset.__len__() / self.batch_size, loss))
                 with open(os.path.join(self.output_dir, "losses.pickle"), "wb") as fout:
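The first added comment documents a standard gradient-accumulation pattern. In outline (a generic PyTorch sketch, not the repo's trainer; note that trainer.py accumulates the raw loss, whereas dividing by the accumulation count, as below, is the common way to keep gradient magnitudes comparable to one large batch):

```python
def train_accumulated(model, dataloader, optimizer, accum_steps=8):
    # Step the optimizer only every `accum_steps` batches; gradients from
    # the intermediate backward() calls sum in place on the parameters.
    optimizer.zero_grad()
    for i, (x, y) in enumerate(dataloader):
        # Simplified from trainer.py's model(x, y, ext, return_loss=True)
        loss = model(x, y, return_loss=True)
        (loss / accum_steps).backward()  # normalize toward one large batch
        if (i + 1) % accum_steps == 0:
            optimizer.step()             # apply the accumulated gradient
            optimizer.zero_grad()
```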
