From a857c570d8e9460ad8bf0b60c3bdbf50ec6d1a13 Mon Sep 17 00:00:00 2001 From: Zacharias Fisches Date: Thu, 27 Aug 2020 10:36:35 +0200 Subject: [PATCH 1/5] copy from github.com/Zacharias030/ProGraML github.com/ChrisCummins/ProGraML/issues/81 --- .../graph_level_classification/configs.py | 281 ++++ .../graph_level_classification/dataloader.py | 105 ++ .../graph_level_classification/dataset.py | 1238 +++++++++++++++ .../graph_level_classification/modeling.py | 1382 +++++++++++++++++ .../task/graph_level_classification/run.py | 831 ++++++++++ 5 files changed, 3837 insertions(+) create mode 100644 programl/task/graph_level_classification/configs.py create mode 100644 programl/task/graph_level_classification/dataloader.py create mode 100644 programl/task/graph_level_classification/dataset.py create mode 100644 programl/task/graph_level_classification/modeling.py create mode 100644 programl/task/graph_level_classification/run.py diff --git a/programl/task/graph_level_classification/configs.py b/programl/task/graph_level_classification/configs.py new file mode 100644 index 000000000..f22b540c8 --- /dev/null +++ b/programl/task/graph_level_classification/configs.py @@ -0,0 +1,281 @@ +# Copyright 2019 the ProGraML authors. +# +# Contact Chris Cummins . +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""C.""" +from typing import List +from deeplearning.ml4pl.poj104.dataset import AblationVocab + + +class ProGraMLBaseConfig(object): + def __init__(self): + self.name = self.__class__.__name__ + + # Training Hyperparameters + self.num_epochs = 25 + self.batch_size = 128 + # limit the number of nodes per batch to a sensible maximum + # by possibly discarding certain samples from the batch. + self.max_num_nodes = 200000 + self.lr: float = 0.00025 + self.patience = 10000 + self.clip_grad_norm: float = 0.0 + self.train_subset = [0, 100] + self.random_seed: int = 42 + + # Readout + self.output_dropout: float = 0.0 + + # Model Hyperparameters + self.emb_size: int = 200 + self.edge_type_count: int = 3 + + self.vocab_size: int = 8568 + self.cdfg_vocab: bool = False + + # ABLATION OPTIONS + # NONE = 0 No ablation - use the full vocabulary (default). + # NO_VOCAB = 1 Ignore the vocabulary - every node has an x value of 0. + # NODE_TYPE_ONLY = 2 Use a 3-element vocabulary based on the node type: + # 0 - Instruction node + # 1 - Variable node + # 2 - Constant node + self.ablation_vocab: AblationVocab = 0 # 0 NONE, 1 NO_VOCAB, 2 NODE_TYPE_ONLY + + # inst2vec_embeddings can now be 'none' as well! 
+ # this reduces the tokens that the network sees to only + # !IDENTIFIERs and !UNK statements + # One of {zero, constant, random, random_const, finetune, none} + self.inst2vec_embeddings = 'random' + + self.ablate_structure = None # one of {control,data,call} + + @classmethod + def from_dict(cls, params): + """instantiate Config from params dict that overrides default values where given.""" + config = cls() + if params is None: + return config + + for key in params: + if hasattr(config, key): + setattr(config, key, params[key]) + else: + print(f"(*CONFIG FROM DICT* Default {config.name} doesn't have a key {key}. Will add key to config anyway!") + setattr(config, key, params[key]) + return config + + def to_dict(self): + config_dict = {a: getattr(self, a) for a in dir(self) if not a.startswith('__') and not callable(getattr(self, a))} + return config_dict + + def check_equal(self, other): + # take either config object or config_dict + other_dict = other if isinstance(other, dict) else other.to_dict() + if not self.to_dict() == other_dict: + print(f"WARNING: GGNNConfig.check_equal() FAILED:\nself and other are unequal: " + f"The difference is {set(self.to_dict()) ^ set(other.to_dict())}.\n self={self.to_dict()}\n other={other_dict}") + +class GGNN_POJ104_Config(ProGraMLBaseConfig): + def __init__(self): + super().__init__() + ############### + # Model Hyperparameters + self.gnn_layers: int = 8 + self.message_weight_sharing: int = 2 + self.update_weight_sharing: int = 2 + #self.message_timesteps: List[int] = [2, 2, 2, 2] + #self.update_timesteps: List[int] = [2, 2, 2, 2] + + # currently only admits node types 0 and 1 for statements and identifiers. + self.use_node_types = True + self.use_edge_bias: bool = True + self.position_embeddings: bool = True + + # Aggregate by mean or by sum + self.msg_mean_aggregation: bool = True + self.backward_edges: bool = True + + ############### + # Regularization + self.edge_weight_dropout: float = 0.0 + self.graph_state_dropout: float = 0.2 + + ############### + # Dataset inherent, don't change! + self.num_classes: int = 104 + self.has_graph_labels: bool = True + self.has_aux_input: bool = False + + # self.use_selector_embeddings: bool = False + # self.selector_size: int = 2 if getattr(self, 'use_selector_embeddings', False) else 0 + # TODO(Zach) Maybe refactor non-rectangular edge passing matrices for independent hidden size. + # hidden size of the whole model + self.hidden_size: int = self.emb_size + getattr(self, 'selector_size', 0) + +class GGNN_Devmap_Config(GGNN_POJ104_Config): + def __init__(self): + super().__init__() + # change default + self.batch_size = 64 + self.lr = 2.5e-4 + self.num_epochs = 150 + self.graph_state_dropout = 0.0 + + # Auxiliary Readout + self.aux_use_better = False + self.intermediate_loss_weight = 0.2 + self.aux_in_size = 2 + self.aux_in_layer_size = 32 + self.aux_in_log1p = True + + # Dataset inherent, don't change! + self.num_classes: int = 2 + self.has_graph_labels: bool = True + self.has_aux_input: bool = True + + +class GGNN_Threadcoarsening_Config(GGNN_POJ104_Config): + def __init__(self): + super().__init__() + # Dataset inherent, don't change! 
+ self.num_classes: int = 6 + self.has_graph_labels: bool = True + # self.has_aux_input: bool = False + +class GGNN_ForPretraining_Config(GGNN_POJ104_Config): + def __init__(self): + super().__init__() + # Pretraining Parameters + self.mlm_probability = 0.15 + self.mlm_statements_only = True + self.mlm_exclude_unk_tokens = True + self.mlm_mask_token_id = 8568 + self.unk_token_id = 8564 + + # set for pretraining to vocab_size + 1 [MASK] + self.vocab_size = self.vocab_size + 1 + self.num_classes = self.vocab_size + self.has_graph_labels: bool = False + + +class GraphTransformer_POJ104_Config(ProGraMLBaseConfig): + def __init__(self): + super().__init__() + ###### borrowed for debugging ########## + + # GGNNMessage Layer + #self.msg_mean_aggregation: bool = True + #self.use_edge_bias: bool = True + + ############### + self.backward_edges: bool = True + self.gnn_layers: int = 8 + self.message_weight_sharing: int = 2 + self.update_weight_sharing: int = 2 + #self.layer_timesteps: List[int] = [1, 1, 1, 1, 1, 1, 1, 1] #[2, 2, 2, 2] + self.use_node_types: bool = False + + # Dataset Specific, don't change! + self.num_classes: int = 104 + self.has_graph_labels: bool = True + self.hidden_size: int = self.emb_size + getattr(self, 'selector_size', 0) + + # Message: + self.position_embeddings: bool = True + # Self-Attn Layer + self.attn_bias = True + self.attn_num_heads = 5 #8 # choose among 4,5,8,10 for emb_sz 200 + self.attn_dropout = 0.1 + self.attn_v_pos = False + + # Update: + + # Transformer Update Layer + self.update_layer: str = 'ff' # or 'gru' + self.tfmr_act = 'gelu' # relu or gelu, default relu + self.tfmr_dropout = 0.2 # default 0.1 + self.tfmr_ff_sz = 512 #512 # ~ 2.5 model_dim (Bert: 768 - 2048, Trfm: base 512 - 2048, big 1024 - 4096) + + # Optionally: GGNN Update Layer + #self.update_layer: str = 'gru' # or 'ff' + #self.edge_weight_dropout: float = 0.0 + #self.graph_state_dropout: float = 0.2 + +class GraphTransformer_Devmap_Config(GraphTransformer_POJ104_Config): + def __init__(self): + super().__init__() + # change default + self.batch_size = 64 + self.lr = 2.5e-4 + self.num_epochs = 600 + #self.graph_state_dropout = 0.0 #GGNN only + + #self.output_dropout # <- applies to Readout func! + + # Auxiliary Readout + self.aux_use_better = False + self.intermediate_loss_weight = 0.2 + self.aux_in_size = 2 + self.aux_in_layer_size = 32 + self.aux_in_log1p = True + + # Dataset inherent, don't change! + self.num_classes: int = 2 + self.has_graph_labels: bool = True + self.has_aux_input: bool = True + +class GraphTransformer_Threadcoarsening_Config(GraphTransformer_POJ104_Config): + def __init__(self): + super().__init__() + self.lr = 5e-5 #2.5-4? + self.num_epochs = 600 + # Dataset inherent, don't change! + self.num_classes: int = 6 + self.has_graph_labels: bool = True + # self.has_aux_input: bool = False + +class GraphTransformer_ForPretraining_Config(GraphTransformer_POJ104_Config): + def __init__(self): + super().__init__() + self.num_of_splits = 2 + # Pretraining Parameters + self.mlm_probability = 0.15 + self.mlm_statements_only = True + self.mlm_exclude_unk_tokens = True + self.mlm_mask_token_id = 8568 + self.unk_token_id = 8564 + + # set for pretraining to vocab_size + 1 [MASK] + self.vocab_size = self.vocab_size + 1 + self.num_classes = self.vocab_size + self.has_graph_labels: bool = False + + +class GGNN_BranchPrediction_Config(GGNN_POJ104_Config): + def __init__(self): + super().__init__() + self.batch_size = 4 + #self.use_tanh_readout = False ! 
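+        # num_classes == 1 with has_graph_labels == False (set just below)
+        # makes the model emit one scalar per node (a per-branch prediction)
+        # instead of a single label per graph.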
+ self.num_classes = 1 + self.has_graph_labels = False + + +class GraphTransformer_BranchPrediction_Config(GraphTransformer_POJ104_Config): + def __init__(self): + super().__init__() + self.batch_size = 4 + #self.use_tanh_readout = False ! + self.num_classes = 1 + self.has_graph_labels = False diff --git a/programl/task/graph_level_classification/dataloader.py b/programl/task/graph_level_classification/dataloader.py new file mode 100644 index 000000000..0684b1b80 --- /dev/null +++ b/programl/task/graph_level_classification/dataloader.py @@ -0,0 +1,105 @@ +import torch.utils.data +from torch.utils.data.dataloader import default_collate + +from torch_geometric.data import Data, Batch +from torch._six import container_abcs, string_classes, int_classes + + +class DataLoader(torch.utils.data.DataLoader): + r"""Data loader which merges data objects from a + :class:`torch_geometric.data.dataset` to a mini-batch. + + Args: + dataset (Dataset): The dataset from which to load the data. + batch_size (int, optional): How many samples per batch to load. + (default: :obj:`1`) + shuffle (bool, optional): If set to :obj:`True`, the data will be + reshuffled at every epoch. (default: :obj:`False`) + follow_batch (list or tuple, optional): Creates assignment batch + vectors for each key in the list. (default: :obj:`[]`) + """ + def __init__(self, dataset, batch_size=1, shuffle=False, follow_batch=[], + **kwargs): + def collate(batch): + elem = batch[0] + if isinstance(elem, Data): + return Batch.from_data_list(batch, follow_batch) + elif isinstance(elem, float): + return torch.tensor(batch, dtype=torch.float) + elif isinstance(elem, int_classes): + return torch.tensor(batch) + elif isinstance(elem, string_classes): + return batch + elif isinstance(elem, container_abcs.Mapping): + return {key: collate([d[key] for d in batch]) for key in elem} + elif isinstance(elem, tuple) and hasattr(elem, '_fields'): + return type(elem)(*(collate(s) for s in zip(*batch))) + elif isinstance(elem, container_abcs.Sequence): + return [collate(s) for s in zip(*batch)] + + raise TypeError('DataLoader found invalid type: {}'.format( + type(elem))) + + super(DataLoader, + self).__init__(dataset, batch_size, shuffle, + collate_fn=lambda batch: collate(batch), **kwargs) + + + +class NodeLimitedDataLoader(torch.utils.data.DataLoader): + r"""Data loader which merges data objects from a + :class:`torch_geometric.data.dataset` to a mini-batch. + + Args: + dataset (Dataset): The dataset from which to load the data. + batch_size (int, optional): How many samples per batch to load. + (default: :obj:`1`) + shuffle (bool, optional): If set to :obj:`True`, the data will be + reshuffled at every epoch. (default: :obj:`False`) + follow_batch (list or tuple, optional): Creates assignment batch + vectors for each key in the list. 
(default: :obj:`[]`) + """ + def __init__(self, dataset, batch_size=1, shuffle=False, follow_batch=[], + max_num_nodes=None, warn_on_limit=False, **kwargs): + self.max_num_nodes = max_num_nodes + + def collate(batch): + elem = batch[0] + if isinstance(elem, Data): + # greedily add all samples that fit within self.max_num_nodes + # and silently discard all others + if max_num_nodes is not None: + num_nodes = 0 + limited_batch = [] + for elem in batch: + if num_nodes + elem.num_nodes <= self.max_num_nodes: + limited_batch.append(elem) + num_nodes += elem.num_nodes + else: # for debugging + pass + if len(limited_batch) < len(batch): + if warn_on_limit: + print(f"dropped {len(batch) - len(limited_batch)} graphs from batch!") + assert limited_batch != [], f'limited batch is empty! original batch was {batch}' + return Batch.from_data_list(limited_batch, follow_batch) + else: + return Batch.from_data_list(batch, follow_batch) + elif isinstance(elem, float): + return torch.tensor(batch, dtype=torch.float) + elif isinstance(elem, int_classes): + return torch.tensor(batch) + elif isinstance(elem, string_classes): + return batch + elif isinstance(elem, container_abcs.Mapping): + return {key: collate([d[key] for d in batch]) for key in elem} + elif isinstance(elem, tuple) and hasattr(elem, '_fields'): + return type(elem)(*(collate(s) for s in zip(*batch))) + elif isinstance(elem, container_abcs.Sequence): + return [collate(s) for s in zip(*batch)] + + raise TypeError('DataLoader found invalid type: {}'.format( + type(elem))) + + super(NodeLimitedDataLoader, + self).__init__(dataset, batch_size, shuffle, + collate_fn=lambda batch: collate(batch), **kwargs) diff --git a/programl/task/graph_level_classification/dataset.py b/programl/task/graph_level_classification/dataset.py new file mode 100644 index 000000000..ed645735d --- /dev/null +++ b/programl/task/graph_level_classification/dataset.py @@ -0,0 +1,1238 @@ +# better dataloader +import csv +import enum +import math +import os +import pickle +import subprocess +import sys +from pathlib import Path +from typing import Dict, Optional + +import numpy as np +import pandas as pd +import torch +import tqdm +from sklearn.model_selection import KFold, StratifiedKFold +from torch_geometric.data import Data, InMemoryDataset + +from programl.proto.program_graph_pb2 import ProgramGraph + +# make this file executable from anywhere + +full_path = os.path.realpath(__file__) +#print(full_path) +REPO_ROOT = full_path.rsplit('ProGraML', maxsplit=1)[0] + 'ProGraML' +#print(REPO_ROOT) +#insert at 1, 0 is the script path (or '' in REPL) +sys.path.insert(1, REPO_ROOT) +REPO_ROOT = Path(REPO_ROOT) + + +# The vocabulary files used in the dataflow experiments. +PROGRAML_VOCABULARY = REPO_ROOT / "deeplearning/ml4pl/poj104/programl_vocabulary.csv" +CDFG_VOCABULARY = REPO_ROOT / "deeplearning/ml4pl/poj104/cdfg_vocabulary.csv" +assert PROGRAML_VOCABULARY.is_file(), f"File not found: {PROGRAML_VOCABULARY}" +assert CDFG_VOCABULARY.is_file(), f"File not found: {CDFG_VOCABULARY}" + +# The path of the graph2cdfg binary which converts ProGraML graphs to the CDFG +# representation. +# +# To build this file, clone the ProGraML repo and build +# //programl/cmd:graph2cdfg: +# +# 1. git clone https://github.com/ChrisCummins/ProGraML.git +# 2. cd ProGraML +# 3. git checkout 2d93e5e14bf321336f1928d3364e9d7196cee995 +# 4. bazel build -c opt //programl/cmd:graph2cdfg +# 5. 
cp -v bazel-bin/programl/cmd/graph2cdfg ${THIS_DIR} +# +GRAPH2CDFG = REPO_ROOT / "deeplearning/ml4pl/poj104/graph2cdfg" +assert GRAPH2CDFG.is_file(), f"File not found: {GRAPH2CDFG}" + + +def load(file: str, cdfg: bool = False) -> ProgramGraph: + """Read a ProgramGraph protocol buffer from file. + + Args: + file: The path of the ProgramGraph protocol buffer to load. + cdfg: If true, convert the graph to CDFG during load. + Returns: + graph: the proto of the programl / CDFG graph + orig_graph: the original programl proto (that contains graph level labels) + """ + graph = ProgramGraph() + with open(file, 'rb') as f: + proto = f.read() + + + if cdfg: + # hotfix missing graph labels in cdfg proto + orig_graph = ProgramGraph() + orig_graph.ParseFromString(proto) + + graph2cdfg = subprocess.Popen( + [str(GRAPH2CDFG), '--stdin_fmt=pb', '--stdout_fmt=pb'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE + ) + proto, _ = graph2cdfg.communicate(proto) + assert not graph2cdfg.returncode, f"CDFG conversion failed: {file}" + + graph.ParseFromString(proto) + + if not cdfg: + orig_graph = graph + return graph, orig_graph + + +def load_vocabulary(path: Path): + """Read the vocabulary file used in the dataflow experiments.""" + vocab = {} + with open(path) as f: + vocab_file = csv.reader(f.readlines(), delimiter="\t") + for i, row in enumerate(vocab_file, start=-1): + if i == -1: # Skip the header. + continue + (_, _, _, text) = row + vocab[text] = i + + return vocab + + +class AblationVocab(enum.IntEnum): + # No ablation - use the full vocabulary (default). + NONE = 0 + # Ignore the vocabulary - every node has an x value of 0. + NO_VOCAB = 1 + # Use a 3-element vocabulary based on the node type: + # 0 - Instruction node + # 1 - Variable node + # 2 - Constant node + NODE_TYPE_ONLY = 2 + + +def filename( + split: str, + cdfg: bool = False, + ablation_vocab: AblationVocab = AblationVocab.NONE + ) -> str: + """Generate the name for a data file. + + Args: + split: The name of the split. + cdfg: Whether using CDFG representation. + ablate_vocab: The ablation vocab type. + + Returns: + A file name which uniquely identifies this combination of + split/cdfg/ablation. + """ + name = str(split) + if cdfg: + name = f"{name}_cdfg" + if ablation_vocab != AblationVocab.NONE: + # transform if ablation_vocab was passed as int. + if type(ablation_vocab) == int: + ablation_vocab = AblationVocab(ablation_vocab) + name = f"{name}_{ablation_vocab.name.lower()}" + return f"{name}_data.pt" + + +def nx2data(graph: ProgramGraph, + vocabulary: Dict[str, int], + y_feature_name: Optional[str] = None, + ignore_profile_info=True, + ablate_vocab = AblationVocab.NONE, + orig_graph: ProgramGraph = None): + r"""Converts a program graph protocol buffer to a + :class:`torch_geometric.data.Data` instance. + + Args: + graph A program graph protocol buffer. + vocabulary A map from node text to vocabulary indices. + y_feature_name The name of the graph-level feature to use as class label. + ablate_vocab Whether to use an ablation vocabulary. + orig_graph A program graph protocol buffer that has graph level labels. 
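+
+    Illustrative usage (a sketch; the input path is hypothetical):
+
+      vocab = load_vocabulary(PROGRAML_VOCABULARY)
+      graph, orig = load("example.ProgramGraph.pb")
+      data = nx2data(graph, vocab, y_feature_name="poj104_label",
+                     orig_graph=orig)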
+ """ + + # collect edge_index + edge_tuples = [(edge.source, edge.target) for edge in graph.edge] + edge_index = torch.tensor(edge_tuples).t().contiguous() + + # collect edge_attr + positions = torch.tensor([edge.position for edge in graph.edge]) + flows = torch.tensor([int(edge.flow) for edge in graph.edge]) + + edge_attr = torch.cat([flows, positions]).view(2, -1).t().contiguous() + + # collect x + if ablate_vocab == AblationVocab.NONE: + vocabulary_indices = vocab_ids = [ + vocabulary.get(node.text, len(vocabulary)) + for node in graph.node + ] + elif ablate_vocab == AblationVocab.NO_VOCAB: + vocabulary_indices = [0] * len(graph.node) + elif ablate_vocab == AblationVocab.NODE_TYPE_ONLY: + vocabulary_indices = [int(node.type) for node in graph.node] + else: + raise NotImplementedError("unreachable") + + xs = torch.tensor(vocabulary_indices) + types = torch.tensor([int(node.type) for node in graph.node]) + + x = torch.cat([xs, types]).view(2, -1).t().contiguous() + + assert edge_attr.size()[0] == edge_index.size()[1], f'edge_attr={edge_attr.size()} size mismatch with edge_index={edge_index.size()}' + + data_dict = { + 'x': x, + 'edge_index': edge_index, + 'edge_attr': edge_attr, + } + + # maybe collect these data too + if y_feature_name is not None: + assert orig_graph is not None, "need orig_graph to retrieve graph level labels!" + y = torch.tensor(orig_graph.features.feature[y_feature_name].int64_list.value[0]).view(1) # <1> + if y_feature_name == "poj104_label": + y -= 1 + data_dict['y'] = y + + # branch prediction / profile info specific + if not ignore_profile_info: + raise NotImplementedError("profile info is not supported with the new nx2data (from programgraph) adaptation.") + profile_info = [] + for i, node_data in nx_graph.nodes(data=True): + # default to -1, -1, -1 if not all profile info is given. + if not (node_data.get("llvm_profile_true_weight") is not None and \ + node_data.get("llvm_profile_false_weight") is not None and \ + node_data.get("llvm_profile_total_weight") is not None): + mask = 0 + true_weight = -1 + false_weight = -1 + total_weight = -1 + else: + mask = 1 + true_weight = node_data["llvm_profile_true_weight"] + false_weight = node_data["llvm_profile_false_weight"] + total_weight = node_data["llvm_profile_total_weight"] + + profile_info.append([mask, true_weight, false_weight, total_weight]) + + data_dict['profile_info'] = torch.tensor(profile_info) + + + # make Data + data = Data(**data_dict) + + return data + + +class BranchPredictionDataset(InMemoryDataset): + def __init__(self, root='deeplearning/ml4pl/poj104/branch_prediction_data', + split='train', + transform=None, pre_transform=None, + train_subset=[0, 100], + train_subset_seed=0): + """ + Args: + train_subset: [start_percentile, stop_percentile) default [0,100). + sample a random (but fixed) train set of data in slice by percent, with given seed. + train_subset_seed: seed for the train_subset fixed random permutation. + """ + self.split = split + self.train_subset = train_subset + self.train_subset_seed = train_subset_seed + super().__init__(root, transform, pre_transform) + + assert split in ['train'], "The BranchPrediction dataset only has a 'train' split. use train_subset=[0,x] and [x, 100] for training and testing." 
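+        # For example (sketch): train_subset=[0, 80] with train_subset_seed=0
+        # trains on a fixed random 80% of the graphs; the complementary slice
+        # [80, 100] with the same seed gives a disjoint held-out set.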
+ self.data, self.slices = torch.load(self.processed_paths[0]) + pass + + @property + def raw_file_names(self): + """A list of files that need to be found in the raw_dir in order to skip the download""" + return [] # not implemented here + + @property + def processed_file_names(self): + """A list of files in the processed_dir which needs to be found in order to skip the processing.""" + base = f'{self.split}_data.pt' + + if tuple(self.train_subset) == (0, 100) or self.split in ['val', 'test']: + return [base] + else: + assert self.split == 'train' + return [f'{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt'] + + def download(self): + """Download raw data to `self.raw_dir`""" + pass # not implemented + + def _save_train_subset(self): + """saves a train_subset of self to file. + Percentile slice is taken according to self.train_subset + with a fixed random permutation with self.train_subset_seed. + """ + import numpy as np + perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) + + # take slice of perm according to self.train_subset + start = np.math.floor(len(self) / 100 * self.train_subset[0]) + stop = np.math.floor(len(self) / 100 * self.train_subset[1]) + perm = perm[start:stop] + print(f'Fixed permutation starts with: {perm[:min(30, len(perm))]}') + + dataset = self.__indexing__(perm) + + data, slices = dataset.data, dataset.slices + torch.save((data, slices), self.processed_paths[0]) + return + + def return_cross_validation_splits(self, split): + assert self.train_subset == [0, 100], "Do cross-validation on the whole dataset!" + #num_samples = len(self) + #perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) + + # 10-fold cross-validation + n_splits = 10 + kf = KFold(n_splits=n_splits, shuffle=True, random_state=42) + (train_index, test_index) = list(kf.split(range(len(self))))[split] + train_data = self.__indexing__(train_index) + test_data = self.__indexing__(test_index) + return train_data, test_data + + def filter_max_num_nodes(self, max_num_nodes): + idx = [] + for i, d in enumerate(self): + if d.num_nodes <= max_num_nodes: + idx.append(i) + dataset = self.__indexing__(idx) + print(f"Filtering out graphs larger than {max_num_nodes} yields a dataset with {len(dataset)}/{len(self)} samples remaining.") + return dataset + + def process(self): + """Processes raw data and saves it into the `processed_dir`. + New implementation: + Here specifically it will collect all '*.ll.pickle' files recursively from subdirectories of `root` + and process the loaded nx graphs to Data. + Old implementation: + Instead of looking for .ll.pickle (nx graphs), we directly look for '*.data.p' files. + """ + # check if we need to create the full dataset: + full_dataset = Path(self.processed_dir) / f'{self.split}_data.pt' + if full_dataset.is_file(): + assert self.split == 'train', 'here shouldnt be reachable.' + print(f"Full dataset found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}") + # just get the split and save it + self.data, self.slices = torch.load(full_dataset) + self._save_train_subset() + print(f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk.") + return + + # ~~~~~ we need to create the full dataset ~~~~~~~~~~~ + assert not full_dataset.is_file(), 'shouldnt be' + processed_path = str(full_dataset) + + # read data into huge `Data` list. 
+ data_list = [] + + ds_base = Path(self.root) + print(f'Creating {self.split} dataset at {str(ds_base)}') + # TODO change this line to go to the new format + #out_base = ds_base / ('ir_' + self.split + '_programl') + #assert out_base.exists(), f"{out_base} doesn't exist!" + # TODO collect .ll.pickle instead and call nx2data on the fly! + print(f"=== DATASET {str(ds_base)}: Collecting .data.p files into dataset") + + #files = list(ds_base.rglob('*.data.p')) + #files = list(ds_base.rglob('*.ll.pickle')) + files = list(ds_base.rglob('*.ll.p')) + + for file in tqdm.tqdm(files): + if not file.is_file(): + continue + try: + nx_graph = load(file) + except EOFError: + print(f"Failing to unpickle bc. EOFError on {file}! Skipping ...") + continue + try: + data = nx2data(nx_graph, ignore_profile_info=False) + data_list.append(data) + except IndexError: + print(f"Failing nx2data bc IndexError (prob. empty graph) on {file}! Skipping ...") + continue + + print(f" * COMPLETED * === DATASET {ds_base}: now pre-filtering...") + + if self.pre_filter is not None: + data_list = [d for d in data_list if self.pre_filter(d)] + print(f" * COMPLETED * === DATASET {ds_base}: Completed filtering, now pre_transforming...") + + if self.pre_transform is not None: + data_list = [self.pre_transform(d) for d in data_list] + + print(f" * COMPLETED * === DATASET {ds_base}: saving to disk...") + self.data, self.slices = self.collate(data_list) + torch.save((self.data, self.slices), processed_path) + + # maybe save train_subset as well + if not tuple(self.train_subset) == (0, 100) and self.split not in ['val', 'test']: + self._save_train_subset() + + + +class NCCDataset(InMemoryDataset): + def __init__(self, root=REPO_ROOT / 'deeplearning/ml4pl/poj104/ncc_data', + split='train', + transform=None, pre_transform=None, + train_subset=[0, 100], + train_subset_seed=0): + """ + NCC dataset + + Args: + train_subset: [start_percentile, stop_percentile) default [0,100). + sample a random (but fixed) train set of data in slice by percent, with given seed. + train_subset_seed: seed for the train_subset fixed random permutation. + + """ + self.split = split + self.train_subset = train_subset + self.train_subset_seed = train_subset_seed + super().__init__(root, transform, pre_transform) + + assert split in ['train'], "The NCC dataset only has a 'train' split. use train_subset=[0,x] and [x, 100] for training and testing." + self.data, self.slices = torch.load(self.processed_paths[0]) + + @property + def raw_file_names(self): + """A list of files that need to be found in the raw_dir in order to skip the download""" + return [] # not implemented here + + @property + def processed_file_names(self): + """A list of files in the processed_dir which needs to be found in order to skip the processing.""" + base = f'{self.split}_data.pt' + + if tuple(self.train_subset) == (0, 100) or self.split in ['val', 'test']: + return [base] + else: + assert self.split == 'train' + return [f'{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt'] + + def download(self): + """Download raw data to `self.raw_dir`""" + pass # not implemented + + def _save_train_subset(self): + """saves a train_subset of self to file. + Percentile slice is taken according to self.train_subset + with a fixed random permutation with self.train_subset_seed. 
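+        Worked example (sketch): with len(self) == 1000 and
+        train_subset == [0, 80], start == floor(1000 / 100 * 0) == 0 and
+        stop == floor(1000 / 100 * 80) == 800, so perm[0:800] is kept.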
+ """ + import numpy as np + perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) + + # take slice of perm according to self.train_subset + start = np.math.floor(len(self) / 100 * self.train_subset[0]) + stop = np.math.floor(len(self) / 100 * self.train_subset[1]) + perm = perm[start:stop] + print(f'Fixed permutation starts with: {perm[:min(30, len(perm))]}') + + dataset = self.__indexing__(perm) + + data, slices = dataset.data, dataset.slices + torch.save((data, slices), self.processed_paths[0]) + return + + def filter_max_num_nodes(self, max_num_nodes): + idx = [] + for i, d in enumerate(self): + if d.num_nodes <= max_num_nodes: + idx.append(i) + dataset = self.__indexing__(idx) + print(f"Filtering out graphs larger than {max_num_nodes} yields a dataset with {len(dataset)}/{len(self)} samples remaining.") + return dataset + + def process(self): + """Processes raw data and saves it into the `processed_dir`. + New implementation: + Here specifically it will collect all '*.ll.pickle' files recursively from subdirectories of `root` + and process the loaded nx graphs to Data. + Old implementation: + Instead of looking for .ll.pickle (nx graphs), we directly look for '*.data.p' files. + """ + # check if we need to create the full dataset: + full_dataset = Path(self.processed_dir) / f'{self.split}_data.pt' + if full_dataset.is_file(): + assert self.split == 'train', 'here shouldnt be reachable.' + print(f"Full dataset found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}") + # just get the split and save it + self.data, self.slices = torch.load(full_dataset) + self._save_train_subset() + print(f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk.") + return + + # ~~~~~ we need to create the full dataset ~~~~~~~~~~~ + assert not full_dataset.is_file(), 'shouldnt be' + processed_path = str(full_dataset) + + # read data into huge `Data` list. + data_list = [] + + ds_base = Path(self.root) + print(f'Creating {self.split} dataset at {str(ds_base)}') + # TODO change this line to go to the new format + #out_base = ds_base / ('ir_' + self.split + '_programl') + #assert out_base.exists(), f"{out_base} doesn't exist!" + # TODO collect .ll.pickle instead and call nx2data on the fly! + print(f"=== DATASET {str(ds_base)}: Collecting .data.p files into dataset") + + #files = list(ds_base.rglob('*.data.p')) + #files = list(ds_base.rglob('*.ll.pickle')) + files = list(ds_base.rglob('*.ll.p')) + + for file in tqdm.tqdm(files): + if not file.is_file(): + continue + try: + nx_graph = load(file) + except EOFError: + print(f"Failing to unpickle bc. EOFError on {file}! Skipping ...") + continue + try: + data = nx2data(nx_graph) + data_list.append(data) + except IndexError: + print(f"Failing nx2data bc IndexError (prob. empty graph) on {file}! 
Skipping ...") + continue + + print(f" * COMPLETED * === DATASET {ds_base}: now pre-filtering...") + + if self.pre_filter is not None: + data_list = [d for d in data_list if self.pre_filter(d)] + print(f" * COMPLETED * === DATASET {ds_base}: Completed filtering, now pre_transforming...") + + if self.pre_transform is not None: + data_list = [self.pre_transform(d) for d in data_list] + + print(f" * COMPLETED * === DATASET {ds_base}: saving to disk...") + self.data, self.slices = self.collate(data_list) + torch.save((self.data, self.slices), processed_path) + + # maybe save train_subset as well + if not tuple(self.train_subset) == (0, 100) and self.split not in ['val', 'test']: + self._save_train_subset() + + + + +class LegacyNCCDataset(InMemoryDataset): + def __init__(self, root='deeplearning/ml4pl/poj104/unsupervised_ncc_data', + split='train', + transform=None, pre_transform=None, + train_subset=[0, 100], + train_subset_seed=0): + """ + Args: + train_subset: [start_percentile, stop_percentile) default [0,100). + sample a random (but fixed) train set of data in slice by percent, with given seed. + train_subset_seed: seed for the train_subset fixed random permutation. + + """ + self.split = split + self.train_subset = train_subset + self.train_subset_seed = train_subset_seed + super().__init__(root, transform, pre_transform) + + assert split in ['train'], "The NCC dataset only has a 'train' split. use train_subset=[0,x] and [x, 100] for training and testing." + self.data, self.slices = torch.load(self.processed_paths[0]) + + @property + def raw_file_names(self): + """A list of files that need to be found in the raw_dir in order to skip the download""" + return [] # not implemented here + + @property + def processed_file_names(self): + """A list of files in the processed_dir which needs to be found in order to skip the processing.""" + base = f'{self.split}_data.pt' + + if tuple(self.train_subset) == (0, 100) or self.split in ['val', 'test']: + return [base] + else: + assert self.split == 'train' + return [f'{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt'] + + def download(self): + """Download raw data to `self.raw_dir`""" + pass # not implemented + + def _save_train_subset(self): + """saves a train_subset of self to file. + Percentile slice is taken according to self.train_subset + with a fixed random permutation with self.train_subset_seed. + """ + import numpy as np + perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) + + # take slice of perm according to self.train_subset + start = np.math.floor(len(self) / 100 * self.train_subset[0]) + stop = np.math.floor(len(self) / 100 * self.train_subset[1]) + perm = perm[start:stop] + print(f'Fixed permutation starts with: {perm[:min(30, len(perm))]}') + + dataset = self.__indexing__(perm) + + data, slices = dataset.data, dataset.slices + torch.save((data, slices), self.processed_paths[0]) + return + + def filter_max_num_nodes(self, max_num_nodes): + idx = [] + for i, d in enumerate(self): + if d.num_nodes <= max_num_nodes: + idx.append(i) + dataset = self.__indexing__(idx) + print(f"Filtering out graphs larger than {max_num_nodes} yields a dataset with {len(dataset)}/{len(self)} samples remaining.") + return dataset + + def process(self): + """Processes raw data and saves it into the `processed_dir`. + New implementation: + Here specifically it will collect all '*.ll.pickle' files recursively from subdirectories of `root` + and process the loaded nx graphs to Data. 
+        Old implementation:
+        Instead of looking for .ll.pickle (nx graphs), we directly look for '*.data.p' files.
+        """
+        # check if we need to create the full dataset:
+        full_dataset = Path(self.processed_dir) / f'{self.split}_data.pt'
+        if full_dataset.is_file():
+            assert self.split == 'train', 'this branch should not be reachable.'
+            print(f"Full dataset found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}")
+            # just get the split and save it
+            self.data, self.slices = torch.load(full_dataset)
+            self._save_train_subset()
+            print(f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk.")
+            return
+
+        # ~~~~~ we need to create the full dataset ~~~~~~~~~~~
+        assert not full_dataset.is_file(), 'the full dataset should not exist at this point'
+        processed_path = str(full_dataset)
+
+        # read data into huge `Data` list.
+        data_list = []
+
+        ds_base = Path(self.root)
+        print(f'Creating {self.split} dataset at {str(ds_base)}')
+        # TODO change this line to go to the new format
+        #out_base = ds_base / ('ir_' + self.split + '_programl')
+        #assert out_base.exists(), f"{out_base} doesn't exist!"
+        # TODO collect .ll.pickle instead and call nx2data on the fly!
+        print(f"=== DATASET {str(ds_base)}: Collecting .data.p files into dataset")
+
+        files = list(ds_base.rglob('*.data.p'))
+        for file in tqdm.tqdm(files):
+            if not file.is_file():
+                continue
+            # .data.p files hold pickled Data objects (cf. LegacyPOJ104Dataset),
+            # so unpickle them rather than parsing them as ProgramGraph protos.
+            with open(file, 'rb') as f:
+                data = pickle.load(f)
+            data_list.append(data)
+
+        print(f" * COMPLETED * === DATASET {ds_base}: now pre-filtering...")
+
+        if self.pre_filter is not None:
+            data_list = [d for d in data_list if self.pre_filter(d)]
+            print(f" * COMPLETED * === DATASET {ds_base}: Completed filtering, now pre_transforming...")
+
+        if self.pre_transform is not None:
+            data_list = [self.pre_transform(d) for d in data_list]
+
+        print(f" * COMPLETED * === DATASET {ds_base}: saving to disk...")
+        self.data, self.slices = self.collate(data_list)
+        torch.save((self.data, self.slices), processed_path)
+
+        # maybe save train_subset as well
+        if not tuple(self.train_subset) == (0, 100) and self.split not in ['val', 'test']:
+            self._save_train_subset()
+
+
+
+class ThreadcoarseningDataset(InMemoryDataset):
+    def __init__(self, root='deeplearning/ml4pl/poj104/threadcoarsening_data',
+                 split='fail_fast',
+                 transform=None, pre_transform=None,
+                 train_subset=[0, 100], train_subset_seed=0):
+        """
+        Args:
+            train_subset: [start_percentile, stop_percentile) default [0,100).
+                sample a random (but fixed) train set of data in slice by percent, with given seed.
+            train_subset_seed: seed for the train_subset fixed random permutation.
+            split: one of 'Cypress', 'Tahiti', 'Fermi', or 'Kepler'
+        """
+        assert split in ["Cypress", "Tahiti", "Fermi", "Kepler"], f"Split is {split}, but has to be 'Cypress', 'Tahiti', 'Fermi', or 'Kepler'"
+        self.split = split
+        self.train_subset = train_subset
+        self.train_subset_seed = train_subset_seed
+        super().__init__(root, transform, pre_transform)
+
+        self.data, self.slices = torch.load(self.processed_paths[0])
+
+    @property
+    def raw_file_names(self):
+        return 'threadcoarsening_data.zip'
+
+    @property
+    def processed_file_names(self):
+        base = f'{self.split}_data.pt'
+
+        if tuple(self.train_subset) == (0, 100):
+            return [base]
+        else:
+            return [f'{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt']
+
+    def download(self):
+        # download to self.raw_dir
+        pass
+
+    def return_cross_validation_splits(self, split):
+        assert self.train_subset == [0, 100], "Do cross-validation on the whole dataset!"
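+        # Leave-one-out over the 17 kernels: fold `split` trains on the
+        # other 16 graphs and tests on the single held-out kernel.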
+        assert 0 <= split <= 16, f"This dataset uses 17-fold (leave-one-out) cross-validation, so split must be in [0, 16], but split={split}."
+        # leave one out
+        n_splits = 17
+        train_idx = list(range(n_splits))
+        train_idx.remove(split)
+        train_data = self.__indexing__(train_idx)
+        test_data = self.__indexing__([split])
+        return train_data, test_data
+
+
+    def _save_train_subset(self):
+        """Saves a train_subset of self to file.
+        Percentile slice is taken according to self.train_subset
+        with a fixed random permutation with self.train_subset_seed.
+        """
+        import numpy as np
+        perm = np.random.RandomState(self.train_subset_seed).permutation(len(self))
+
+        # take slice of perm according to self.train_subset
+        start = np.math.floor(len(self) / 100 * self.train_subset[0])
+        stop = np.math.floor(len(self) / 100 * self.train_subset[1])
+        perm = perm[start:stop]
+        print(f'Fixed permutation starts with: {perm[:min(100, len(perm))]}')
+
+        dataset = self.__indexing__(perm)
+
+        data, slices = dataset.data, dataset.slices
+        torch.save((data, slices), self.processed_paths[0])
+        return
+
+    def platform2str(self, platform):
+        if platform == "Fermi":
+            return "NVIDIA GTX 480"
+        elif platform == "Kepler":
+            return "NVIDIA Tesla K20c"
+        elif platform == "Cypress":
+            return "AMD Radeon HD 5900"
+        elif platform == "Tahiti":
+            return "AMD Tahiti 7970"
+        else:
+            raise LookupError(f"Unknown platform: {platform}")
+
+    def _get_all_runtimes(self, platform, df, oracles):
+        all_runtimes = {}
+        for kernel in oracles['kernel']:
+            kernel_r = []
+            for cf in [1, 2, 4, 8, 16, 32]:
+                row = df[(df['kernel'] == kernel) & (df['cf'] == cf)]
+                if len(row) == 1:
+                    value = float(row[f'runtime_{platform}'].values)
+                    if math.isnan(value):
+                        print(f"WARNING: Dataset contains a NaN value (most likely a missing runtime entry). kernel={kernel}, cf={cf}, value={row}. Replacing with infinity!")
+                        value = float('inf')
+                    kernel_r.append(value)
+                elif len(row) == 0:
+                    print(f'   kernel={kernel:>20} is missing cf={cf}. Ad-hoc inserting the result from the last existing coarsening factor.')
+                    kernel_r.append(kernel_r[-1])
+                else:
+                    raise ValueError(f"Expected at most one runtime row for kernel={kernel}, cf={cf}, but found {len(row)}.")
+            all_runtimes[kernel] = kernel_r
+        return all_runtimes
+
+    def process(self):
+        # check if we need to create the full dataset:
+        full_dataset = Path(self.processed_dir) / f'{self.split}_data.pt'
+        if full_dataset.is_file():
+            print(f"Full dataset {full_dataset.name} found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}")
+            # just get the split and save it
+            self.data, self.slices = torch.load(full_dataset)
+            self._save_train_subset()
+            print(f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk.")
+            return
+
+        # ~~~~~ we need to create the full dataset ~~~~~~~~~~~
+        assert not full_dataset.is_file(), 'the full dataset should not exist at this point'
+        processed_path = str(full_dataset)
+
+        root = Path(self.root)
+        # Load runtime data
+        oracle_file = root / "pact-2014-oracles.csv"
+        oracles = pd.read_csv(oracle_file)
+
+        runtimes_file = root / "pact-2014-runtimes.csv"
+        df = pd.read_csv(runtimes_file)
+
+        print('\tReading data from', oracle_file, '\n\tand', runtimes_file)
+
+        # get all runtime info per kernel
+        runtimes = self._get_all_runtimes(self.split, df=df, oracles=oracles)
+
+        # get oracle labels
+        cfs = [1, 2, 4, 8, 16, 32]
+        y = np.array([cfs.index(int(x)) for x in oracles["cf_" + self.split]], dtype=np.int64)
+
+        # sanity check oracles against min runtimes
+        for i, (k, v) in enumerate(runtimes.items()):
+            assert int(y[i]) == np.argmin(v), f"{i}: {k} {v}, argmin(v): {np.argmin(v)} vs. oracles data {int(y[i])}."
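+        # e.g. an oracle coarsening factor of 8 maps to class index
+        # cfs.index(8) == 3, so y holds indices into cfs rather than raw
+        # coarsening factors.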
+
+        # Add attributes to graphs
+        data_list = []
+
+        kernels = oracles["kernel"].values  # list of strings of kernel names
+
+        # nx2data requires a vocabulary; assume the full ProGraML vocabulary
+        # here, since ThreadcoarseningDataset has no CDFG variant.
+        vocab = load_vocabulary(PROGRAML_VOCABULARY)
+
+        for kernel in kernels:
+            #legacy
+            #file = root / 'kernels_ir_programl' / (kernel + '.data.p')
+            file = root / 'kernels_ir' / (kernel + '.ll.p')
+            assert file.exists(), f'input file not found: {file}'
+            #with open(file, 'rb') as f:
+            #    data = pickle.load(f)
+            g, _ = load(file)
+            data = nx2data(g, vocabulary=vocab)
+            # add attributes
+            data['y'] = torch.tensor([np.argmin(runtimes[kernel])], dtype=torch.long)
+            data['runtimes'] = torch.tensor([runtimes[kernel]])
+            data_list.append(data)
+
+        ##################################
+
+        print(f" * COMPLETED * === DATASET Threadcoarsening-{self.split}: now pre-filtering...")
+
+        if self.pre_filter is not None:
+            data_list = [d for d in data_list if self.pre_filter(d)]
+            print(f" * COMPLETED * === DATASET Threadcoarsening-{self.split}: Completed filtering, now pre_transforming...")
+
+        if self.pre_transform is not None:
+            data_list = [self.pre_transform(d) for d in data_list]
+
+        self.data, self.slices = self.collate(data_list)
+        torch.save((self.data, self.slices), processed_path)
+
+        # maybe save train_subset as well
+        if not tuple(self.train_subset) == (0, 100) and self.split not in ['val', 'test']:
+            self._save_train_subset()
+
+
+
+class DevmapDataset(InMemoryDataset):
+    def __init__(self, root='deeplearning/ml4pl/poj104/devmap_data',
+                 split='fail', transform=None, pre_transform=None,
+                 train_subset=[0, 100], train_subset_seed=0, cdfg: bool = False,
+                 ablation_vocab: AblationVocab = AblationVocab.NONE):
+        """
+        Args:
+            train_subset: [start_percentile, stop_percentile) default [0,100).
+                sample a random (but fixed) train set of data in slice by percent, with given seed.
+            train_subset_seed: seed for the train_subset fixed random permutation.
+            split: 'amd' or 'nvidia'
+            cdfg: Use CDFG graph representation.
+        """
+        assert split in ['amd', 'nvidia'], f"Split is {split}, but has to be 'amd' or 'nvidia'"
+        self.split = split
+        self.train_subset = train_subset
+        self.train_subset_seed = train_subset_seed
+        self.cdfg = cdfg
+        self.ablation_vocab = ablation_vocab
+        super().__init__(root, transform, pre_transform)
+
+        self.data, self.slices = torch.load(self.processed_paths[0])
+
+    @property
+    def raw_file_names(self):
+        return 'devmap_data.zip'
+
+    @property
+    def processed_file_names(self):
+        base = filename(self.split, self.cdfg, self.ablation_vocab)
+
+        if tuple(self.train_subset) == (0, 100):
+            return [base]
+        else:
+            # keep the cdfg/ablation-aware prefix of `base` (which always
+            # ends in '_data.pt') for the subset file name.
+            name = base[:-len('_data.pt')]
+            return [f'{name}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt']
+
+    def download(self):
+        # download to self.raw_dir
+        pass
+
+    def return_cross_validation_splits(self, split):
+        assert self.train_subset == [0, 100], "Do cross-validation on the whole dataset!"
+        #num_samples = len(self)
+        #perm = np.random.RandomState(self.train_subset_seed).permutation(len(self))
+
+        # 10-fold cross-validation
+        n_splits = 10
+        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
+        (train_index, test_index) = list(kf.split(self.data.y, self.data.y))[split]
+        train_data = self.__indexing__(train_index)
+        test_data = self.__indexing__(test_index)
+        return train_data, test_data
+
+    def _save_train_subset(self):
+        """Saves a train_subset of self to file.
+        Percentile slice is taken according to self.train_subset
+        with a fixed random permutation with self.train_subset_seed.
+ """ + perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) + + # take slice of perm according to self.train_subset + start = np.math.floor(len(self) / 100 * self.train_subset[0]) + stop = np.math.floor(len(self) / 100 * self.train_subset[1]) + perm = perm[start:stop] + print(f'Fixed permutation starts with: {perm[:min(100, len(perm))]}') + + dataset = self.__indexing__(perm) + + data, slices = dataset.data, dataset.slices + torch.save((data, slices), self.processed_paths[0]) + return + + def process(self): + # check if we need to create the full dataset: + name = filename(self.split, self.cdfg, self.ablation_vocab) + full_dataset = Path(self.processed_dir) / name + if full_dataset.is_file(): + print(f"Full dataset {full_dataset.name} found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}") + # just get the split and save it + self.data, self.slices = torch.load(full_dataset) + self._save_train_subset() + print(f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk.") + return + + # ~~~~~ we need to create the full dataset ~~~~~~~~~~~ + assert not full_dataset.is_file(), 'shouldnt be' + processed_path = str(full_dataset) + + + vocab = load_vocabulary(CDFG_VOCABULARY if self.cdfg else PROGRAML_VOCABULARY) + assert len(vocab) > 0, "vocab is empty :|" + + root = Path(self.root) + + # Get list of source file names and attributes + input_files = list((root / f"graphs_{self.split}").iterdir()) + + num_files = len(input_files) + print('\n--- Preparing to read', num_files, 'input files') + + # read data into huge `Data` list. + + data_list = [] + for i in tqdm.tqdm(range(num_files)): + filename = input_files[i] + + proto, _ = load(filename, cdfg=self.cdfg) + data = nx2data(proto, vocabulary=vocab, ablate_vocab=self.ablation_vocab) + + # graph2cdfg conversion drops the graph features, so we may have to + # reload the graph. + if self.cdfg: + proto = load(filename) + + # Add the features and label. + proto_features = proto.features.feature + data['y'] = torch.tensor(proto_features["devmap_label"].int64_list.value[0]).view(1) + data['aux_in'] = torch.tensor([ + proto_features["transfer_bytes"].int64_list.value[0], + proto_features["wgsize"].int64_list.value[0], + ]) + + data_list.append(data) + + ################################## + + print(f" * COMPLETED * === DATASET Devmap-{name}: now pre-filtering...") + + if self.pre_filter is not None: + data_list = [d for d in data_list if self.pre_filter(d)] + print(f" * COMPLETED * === DATASET Devmap-{name}: Completed filtering, now pre_transforming...") + + if self.pre_transform is not None: + data_list = [self.pre_transform(d) for d in data_list] + + self.data, self.slices = self.collate(data_list) + torch.save((self.data, self.slices), processed_path) + + # maybe save train_subset as well + if not tuple(self.train_subset) == (0, 100): + self._save_train_subset() + + + + +class POJ104Dataset(InMemoryDataset): + def __init__(self, root='deeplearning/ml4pl/poj104/classifyapp_data', + split='fail', + transform=None, pre_transform=None, + train_subset=[0, 100], train_subset_seed=0, + cdfg: bool = False, + ablation_vocab: AblationVocab = AblationVocab.NONE): + """ + Args: + train_subset: [start_percentile, stop_percentile) default [0,100). + sample a random (but fixed) train set of data in slice by percent, with given seed. + train_subset_seed: seed for the train_subset fixed random permutation. + cdfg: Use the CDFG graph format and vocabulary. 
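+
+        Illustrative construction (a sketch; the root path is hypothetical):
+
+            train = POJ104Dataset(root='classifyapp_data', split='train',
+                                  train_subset=[0, 100])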
+ """ + self.split = split + self.train_subset = train_subset + self.train_subset_seed = train_subset_seed + self.cdfg = cdfg + self.ablation_vocab = ablation_vocab + super().__init__(root, transform, pre_transform) + + assert split in ['train', 'val', 'test'] + self.data, self.slices = torch.load(self.processed_paths[0]) + + @property + def raw_file_names(self): + return 'classifyapp_data.zip' #['ir_val', 'ir_val_programl'] + + @property + def processed_file_names(self): + base = filename(self.split, self.cdfg, self.ablation_vocab) + + if tuple(self.train_subset) == (0, 100) or self.split in ['val', 'test']: + return [base] + else: + assert self.split == 'train' + return [f'{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt'] + + def download(self): + # download to self.raw_dir + pass + + def _save_train_subset(self): + """saves a train_subset of self to file. + Percentile slice is taken according to self.train_subset + with a fixed random permutation with self.train_subset_seed. + """ + import numpy as np + perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) + + # take slice of perm according to self.train_subset + start = np.math.floor(len(self) / 100 * self.train_subset[0]) + stop = np.math.floor(len(self) / 100 * self.train_subset[1]) + perm = perm[start:stop] + print(f'Fixed permutation starts with: {perm[:min(100, len(perm))]}') + + dataset = self.__indexing__(perm) + + data, slices = dataset.data, dataset.slices + torch.save((data, slices), self.processed_paths[0]) + return + + def process(self): + # hardcoded + num_classes = 104 + + # check if we need to create the full dataset: + full_dataset = Path(self.processed_dir) / filename(self.split, self.cdfg, self.ablation_vocab) + if full_dataset.is_file(): + assert self.split == 'train', 'here shouldnt be reachable.' + print(f"Full dataset found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}") + # just get the split and save it + self.data, self.slices = torch.load(full_dataset) + self._save_train_subset() + print(f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk.") + return + + # ~~~~~ we need to create the full dataset ~~~~~~~~~~~ + assert not full_dataset.is_file(), 'shouldnt be' + processed_path = str(full_dataset) + + # get vocab first + vocab = load_vocabulary(CDFG_VOCABULARY if self.cdfg else PROGRAML_VOCABULARY) + assert len(vocab) > 0, "vocab is empty :|" + # read data into huge `Data` list. + data_list = [] + + ds_base = Path(self.root) + print(f'Creating {self.split} dataset at {str(ds_base)}') + + split_folder = ds_base / (self.split) + assert split_folder.exists(), f"{split_folder} doesn't exist!" + + # collect .pb and call nx2data on the fly! + print(f"=== DATASET {split_folder}: Collecting ProgramGraph.pb files into dataset") + + # only take files from subfolders (with class names!) recursively + files = [x for x in split_folder.rglob("*ProgramGraph.pb")] + assert len(files) > 0, "no files collected. error." + for file in tqdm.tqdm(files): + # skip classes that are larger than what config says to enable debugging with less data + #class_label = int(file.parent.name) - 1 # let classes start from 0. 
+ #if class_label >= num_classes: + # continue + + g, orig_graph = load(file, cdfg=self.cdfg) + data = nx2data(graph=g, + vocabulary=vocab, + ablate_vocab=self.ablation_vocab, + y_feature_name="poj104_label", + orig_graph=orig_graph) + data_list.append(data) + + print(f" * COMPLETED * === DATASET {split_folder}: now pre-filtering...") + + if self.pre_filter is not None: + data_list = [d for d in data_list if self.pre_filter(d)] + print(f" * COMPLETED * === DATASET {split_folder}: Completed filtering, now pre_transforming...") + + if self.pre_transform is not None: + data_list = [self.pre_transform(d) for d in data_list] + + self.data, self.slices = self.collate(data_list) + torch.save((self.data, self.slices), processed_path) + + # maybe save train_subset as well + if not tuple(self.train_subset) == (0, 100) and self.split not in ['val', 'test']: + self._save_train_subset() + + + + +class LegacyPOJ104Dataset(InMemoryDataset): + def __init__(self, root='deeplearning/ml4pl/poj104/classifyapp_data', + split='fail', + transform=None, pre_transform=None, + train_subset=[0, 100], train_subset_seed=0): + """ + Args: + train_subset: [start_percentile, stop_percentile) default [0,100). + sample a random (but fixed) train set of data in slice by percent, with given seed. + train_subset_seed: seed for the train_subset fixed random permutation. + + """ + self.split = split + self.train_subset = train_subset + self.train_subset_seed = train_subset_seed + super().__init__(root, transform, pre_transform) + + assert split in ['train', 'val', 'test'] + self.data, self.slices = torch.load(self.processed_paths[0]) + + @property + def raw_file_names(self): + return 'classifyapp_data.zip' #['ir_val', 'ir_val_programl'] + + @property + def processed_file_names(self): + base = f'{self.split}_data.pt' + + if tuple(self.train_subset) == (0, 100) or self.split in ['val', 'test']: + return [base] + else: + assert self.split == 'train' + return [f'{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt'] + + def download(self): + # download to self.raw_dir + pass + + def _save_train_subset(self): + """saves a train_subset of self to file. + Percentile slice is taken according to self.train_subset + with a fixed random permutation with self.train_subset_seed. + """ + import numpy as np + perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) + + # take slice of perm according to self.train_subset + start = np.math.floor(len(self) / 100 * self.train_subset[0]) + stop = np.math.floor(len(self) / 100 * self.train_subset[1]) + perm = perm[start:stop] + print(f'Fixed permutation starts with: {perm[:min(100, len(perm))]}') + + dataset = self.__indexing__(perm) + + data, slices = dataset.data, dataset.slices + torch.save((data, slices), self.processed_paths[0]) + return + + def process(self): + # hardcoded + num_classes = 104 + + # check if we need to create the full dataset: + full_dataset = Path(self.processed_dir) / f'{self.split}_data.pt' + if full_dataset.is_file(): + assert self.split == 'train', 'here shouldnt be reachable.' + print(f"Full dataset found. 
Generating train_subset={self.train_subset} with seed={self.train_subset_seed}") + # just get the split and save it + self.data, self.slices = torch.load(full_dataset) + self._save_train_subset() + print(f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk.") + return + + # ~~~~~ we need to create the full dataset ~~~~~~~~~~~ + assert not full_dataset.is_file(), 'shouldnt be' + processed_path = str(full_dataset) + + # read data into huge `Data` list. + data_list = [] + + ds_base = Path(self.root) + print(f'Creating {self.split} dataset at {str(ds_base)}') + # TODO change this line to go to the new format + out_base = ds_base / ('ir_' + self.split + '_programl') + assert out_base.exists(), f"{out_base} doesn't exist!" + # TODO collect .ll.pickle instead and call nx2data on the fly! + print(f"=== DATASET {out_base}: Collecting .data.p files into dataset") + + folders = [x for x in out_base.glob("*") if x.is_dir() and x.name not in ['_nx', '_tuples']] + for folder in tqdm.tqdm(folders): + # skip classes that are larger than what config says to enable debugging with less data + if int(folder.name) > num_classes: + continue + for k, file in enumerate(folder.glob("*.data.p")): + with open(file, "rb") as f: + data = pickle.load(f) + data_list.append(data) + + print(f" * COMPLETED * === DATASET {out_base}: now pre-filtering...") + + if self.pre_filter is not None: + data_list = [d for d in data_list if self.pre_filter(d)] + print(f" * COMPLETED * === DATASET {out_base}: Completed filtering, now pre_transforming...") + + if self.pre_transform is not None: + data_list = [self.pre_transform(d) for d in data_list] + + self.data, self.slices = self.collate(data_list) + torch.save((self.data, self.slices), processed_path) + + # maybe save train_subset as well + if not tuple(self.train_subset) == (0, 100) and self.split not in ['val', 'test']: + self._save_train_subset() + + +if __name__ == '__main__': + #d = NewNCCDataset() + #print(d.data) + root = '/home/zacharias/llvm_datasets/threadcoarsening_data/' + a = ThreadcoarseningDataset(root, 'Cypress') + b = ThreadcoarseningDataset(root, 'Tahiti') + c = ThreadcoarseningDataset(root, 'Fermi') + d = ThreadcoarseningDataset(root, 'Kepler') diff --git a/programl/task/graph_level_classification/modeling.py b/programl/task/graph_level_classification/modeling.py new file mode 100644 index 000000000..3bc94e79d --- /dev/null +++ b/programl/task/graph_level_classification/modeling.py @@ -0,0 +1,1382 @@ +# Copyright 2019 the ProGraML authors. +# +# Contact Chris Cummins . +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Modules that make up the pytorch GNN models.""" +import math +import torch +import torch.nn.functional as F +from torch import nn +from torch import optim + +# Dependency moved into SelfAttention Message Layer +from torch_geometric.utils import softmax as scatter_softmax + + +SMALL_NUMBER = 1e-8 + +def print_state_dict(mod): + for n, t in mod.state_dict().items(): + print(n, t.size()) + +def num_parameters(mod) -> int: + """Compute the number of trainable parameters in a nn.Module and its children. + OBS: + This function misses some parameters, i.e. in pytorch's official MultiheadAttention layer, + while the state dict doesn't miss any! + """ + num_params = sum(param.numel() for param in mod.parameters(recurse=True) if param.requires_grad) + return f"{num_params:,} params, weights size: {num_params * 4 / 1e6:.3f}MB." + +def assert_no_nan(tensor_list): + for i, t in enumerate(tensor_list): + assert not torch.isnan(t).any(), f"{i}: {tensor_list}" + + +################################################ +# Main Model classes +################################################ +class BaseGNNModel(nn.Module): + def __init__(self): + super().__init__() + + def setup(self, config, test_only): + self.loss = Loss(config) + # move model to device before making optimizer! + self.dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + + self.to(self.dev) + print(f"Moved model to {self.dev}") + + if test_only: + self.opt = None + self.eval() + else: + self.opt = self.get_optimizer(self.config) + + def get_optimizer(self, config): + return optim.AdamW(self.parameters(), lr=config.lr) + + def num_parameters(self) -> int: + """Compute the number of trainable parameters in this nn.Module and its children.""" + num_params = sum(param.numel() for param in self.parameters(recurse=True) if param.requires_grad) + return f"{num_params:,} params, weights size: ~{num_params * 4 // 1e6:,}MB." + + def forward(self, vocab_ids, labels, edge_lists, selector_ids=None, + pos_lists=None, num_graphs=None, graph_nodes_list=None, + node_types=None, aux_in=None, test_time_steps=None, readout_mask=None, runtimes=None, + ): + # Input + # selector_ids are ignored anyway by the NodeEmbeddings module that doesn't support them. + raw_in = self.node_embeddings(vocab_ids, selector_ids) + + # GNN + raw_out, raw_in, *unroll_stats = self.gnn( + edge_lists, raw_in, pos_lists, node_types + ) # OBS! self.gnn might change raw_in inplace, so use the two outputs + # instead! + + # Readout + if getattr(self.config, 'has_graph_labels', False): + assert graph_nodes_list is not None and num_graphs is not None, 'has_graph_labels requires graph_nodes_list and num_graphs tensors.' + nodewise_readout, graphwise_readout = self.readout( + raw_in, + raw_out, + graph_nodes_list=graph_nodes_list, + num_graphs=num_graphs, + auxiliary_features=aux_in, + readout_mask=readout_mask, + ) + logits = graphwise_readout + else: # nodewise only + nodewise_readout, _ = self.readout(raw_in, raw_out, readout_mask=readout_mask) + graphwise_readout = None + logits = nodewise_readout + + # do the old style aux_readout if not aux_use_better is set + if getattr(self.config, 'has_aux_input', False) and not getattr(self.config, 'aux_use_better', False): + assert self.config.has_graph_labels is True, \ + "Implementation hasn't been checked for use with aux_input and nodewise prediction! It could work or fail silently." 
+ assert aux_in is not None + logits, graphwise_readout = self.aux_readout(logits, aux_in) + + + if readout_mask is not None: # need to mask labels in the same fashion. + assert readout_mask.dtype == torch.bool, 'Readout mask should be boolean!' + labels = labels[readout_mask] + + # Metrics + # accuracy, correct?, targets, maybe runtimes: actual, optimal + metrics_tuple = self.metrics(logits, labels, runtimes) + + outputs = ( + (logits,) + metrics_tuple + (graphwise_readout,) + tuple(unroll_stats) + ) + + return outputs + + +class GraphTransformerModel(BaseGNNModel): + """Transformer Encoder for Graphs.""" + def __init__(self, config, pretrained_embeddings=None, test_only=False): + super().__init__() + self.config = config + self.node_embeddings = NodeEmbeddings(config) + self.gnn = GraphTransformerEncoder(config) + + + # get readout and maybe tack on the aux readout + self.has_aux_input = getattr(self.config, "has_aux_input", False) + self.aux_use_better = getattr(self.config, 'aux_use_better', False) + + if self.has_aux_input and self.aux_use_better: + self.readout = BetterAuxiliaryReadout(config) + elif self.has_aux_input: + self.readout = Readout(config) + self.aux_readout = AuxiliaryReadout(config) + else: + assert not self.aux_use_better, 'aux_use_better only with has_aux_input!' + self.readout = Readout(config) + + self.metrics = Metrics() + + self.setup(config, test_only) + print(self) + print(f"Number of trainable params in GraphTransformerModel: {self.num_parameters()}") + + +class GGNNModel(BaseGNNModel): + def __init__(self, config, pretrained_embeddings=None, test_only=False): + super().__init__() + self.config = config + + # input layer + if getattr(config, 'use_selector_embeddings', False): + self.node_embeddings = NodeEmbeddingsWithSelectors(config, pretrained_embeddings) + else: + self.node_embeddings = NodeEmbeddings(config, pretrained_embeddings) + + + # Readout layer + # get readout and maybe tack on the aux readout + self.has_aux_input = getattr(self.config, "has_aux_input", False) + self.aux_use_better = getattr(self.config, 'aux_use_better', False) + if self.has_aux_input and self.aux_use_better: + self.readout = BetterAuxiliaryReadout(config) + elif self.has_aux_input: + self.readout = Readout(config) + self.aux_readout = AuxiliaryReadout(config) + else: + assert not self.aux_use_better, 'aux_use_better only with has_aux_input!' + self.readout = Readout(config) + + # GNN + # make readout available to label_convergence tests in GGNN Proper (at runtime) + self.gnn = GGNNEncoder(config, readout=self.readout) + + # eval and training + self.metrics = Metrics() + + self.setup(config, test_only) + print(self) + print(f"Number of trainable params in GGNNModel: {self.num_parameters()}") + +################################################ +# GNN Encoder: Message+Aggregate, Update +################################################ + +# GNN Encoder, i.e. everything between input and readout. +# Will rely on the different msg+aggr and update modules to build up a GNN. 
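+# A worked example of the weight-sharing scheme used by both encoders below
+# (illustration only; the numbers are example values, not defaults): with
+# gnn_layers = 8 and message_weight_sharing = 2, propagation step i uses
+# message module i // 2, i.e. module indices [0, 0, 1, 1, 2, 2, 3, 3], so
+# only 8 // 2 = 4 message modules are instantiated; update modules work alike.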
+
+
+class GGNNEncoder(nn.Module):
+ def __init__(self, config, readout=None):
+ super().__init__()
+ self.backward_edges = config.backward_edges
+
+ self.gnn_layers = config.gnn_layers
+ self.message_weight_sharing = config.message_weight_sharing
+ self.update_weight_sharing = config.update_weight_sharing
+ message_layers = self.gnn_layers // self.message_weight_sharing
+ update_layers = self.gnn_layers // self.update_weight_sharing
+ assert message_layers * self.message_weight_sharing == self.gnn_layers, "layer number and reuse mismatch."
+ assert update_layers * self.update_weight_sharing == self.gnn_layers, "layer number and reuse mismatch."
+ #self.layer_timesteps = config.layer_timesteps
+
+ self.position_embeddings = config.position_embeddings
+
+ # optional eval time unrolling parameters
+ self.test_layer_timesteps = getattr(config, 'test_layer_timesteps', 0)
+ self.unroll_strategy = getattr(config, 'unroll_strategy', 'none')
+ self.max_timesteps = getattr(config, 'max_timesteps', 1000)
+ self.label_conv_threshold = getattr(config, 'label_conv_threshold', 0.995)
+ self.label_conv_stable_steps = getattr(config, 'label_conv_stable_steps', 1)
+
+ # make readout available for label_convergence tests
+ if self.unroll_strategy == "label_convergence":
+ assert not config.has_aux_input, "aux_input is not supported with label_convergence"
+ assert readout, "Must pass an instantiated readout module for label_convergence tests!"
+ self.readout = readout
+
+ # Message and update layers
+ self.message = nn.ModuleList()
+ #for i in range(len(self.layer_timesteps)):
+ for i in range(message_layers):
+ self.message.append(GGNNMessageLayer(config))
+
+ self.update = nn.ModuleList()
+ #for i in range(len(self.layer_timesteps)):
+ for i in range(update_layers):
+ self.update.append(GGNNUpdateLayer(config))
+
+ def forward(self, edge_lists, node_states, pos_lists=None, node_types=None, test_time_steps=None):
+ old_node_states = node_states.clone()
+
+ if self.backward_edges:
+ back_edge_lists = [x.flip([1]) for x in edge_lists]
+ edge_lists.extend(back_edge_lists)
+
+ # For backward edges we keep the positions of the forward edge!
+ if self.position_embeddings:
+ pos_lists.extend(pos_lists)
+
+ # We allow for some fancy unrolling strategies.
+ # Currently only at eval time, but there is really no good reason for this.
+ assert self.unroll_strategy == 'none', 'New layer_timesteps not implemented for this unroll_strategy.'
+ #if self.training or self.unroll_strategy == "none":
+ # #layer_timesteps =
+ # #layer_timesteps = self.layer_timesteps
+ #elif self.unroll_strategy == "constant":
+ # layer_timesteps = self.test_layer_timesteps
+ #elif self.unroll_strategy == "edge_count":
+ # assert (
+ # test_time_steps is not None
+ # ), f"You need to pass test_time_steps or not use unroll_strategy '{self.unroll_strategy}''"
+ # layer_timesteps = [min(test_time_steps, self.max_timesteps)]
+ #elif self.unroll_strategy == "data_flow_max_steps":
+ # assert (
+ # test_time_steps is not None
+ # ), f"You need to pass test_time_steps or not use unroll_strategy '{self.unroll_strategy}''"
+ # layer_timesteps = [min(test_time_steps, self.max_timesteps)]
+ #elif self.unroll_strategy == "label_convergence":
+ # node_states, unroll_steps, converged = self.label_convergence_forward(
+ # edge_lists, node_states, pos_lists, node_types, initial_node_states=old_node_states
+ # )
+ # return node_states, old_node_states, unroll_steps, converged
+ #else:
+ # raise TypeError(
+ # "Unreachable! 
" + # f"Unroll strategy: {self.unroll_strategy}, training: {self.training}" + # ) + + #for (layer_idx, num_timesteps) in enumerate(layer_timesteps): + # for t in range(num_timesteps): + # messages = self.message[layer_idx](edge_lists, node_states, pos_lists) + # node_states = self.update[layer_idx](messages, node_states, node_types) + + for i in range(self.gnn_layers): + m_idx = i // self.message_weight_sharing + u_idx = i // self.update_weight_sharing + messages = self.message[m_idx](edge_lists, node_states, pos_lists) + node_states = self.update[u_idx](messages, node_states, node_types) + return node_states, old_node_states + + def label_convergence_forward( + self, edge_lists, node_states, pos_lists, node_types, initial_node_states + ): + assert ( + len(self.layer_timesteps) == 1 + ), f"Label convergence only supports one-layer GGNNs, but {len(self.layer_timesteps)} are configured in layer_timesteps: {self.layer_timesteps}" + + stable_steps, i = 0, 0 + old_tentative_labels = self.tentative_labels( + initial_node_states, node_states + ) + + while True: + messages = self.message[0](edge_lists, node_states, pos_lists) + node_states = self.update[0](messages, node_states, node_types) + new_tentative_labels = self.tentative_labels( + initial_node_states, node_states + ) + i += 1 + + # return the new node states if their predictions match the old node states' predictions. + # It doesn't matter during testing since the predictions are the same anyway. + stability = ( + (new_tentative_labels == old_tentative_labels) + .to(dtype=torch.get_default_dtype()) + .mean() + ) + if stability >= self.label_conv_threshold: + stable_steps += 1 + + if stable_steps >= self.label_conv_stable_steps: + return node_states, i, True + + if i >= self.max_timesteps: # maybe escape + return node_states, i, False + + old_tentative_labels = new_tentative_labels + + raise ValueError("Serious Design Error: Unreachable code!") + + def tentative_labels(self, initial_node_states, node_states): + logits, _ = self.readout(initial_node_states, node_states) + preds = F.softmax(logits, dim=1) + predicted_labels = torch.argmax(preds, dim=1) + return predicted_labels + + +class GraphTransformerEncoder(nn.Module): + def __init__(self, config, readout=None): + super().__init__() + self.backward_edges = config.backward_edges + + self.gnn_layers = config.gnn_layers + self.message_weight_sharing = config.message_weight_sharing + self.update_weight_sharing = config.update_weight_sharing + message_layers = self.gnn_layers // self.message_weight_sharing + update_layers = self.gnn_layers // self.update_weight_sharing + assert message_layers * self.message_weight_sharing == self.gnn_layers, "layer number and reuse mismatch." + assert update_layers * self.update_weight_sharing == self.gnn_layers, "layer number and reuse mismatch." 
+ #self.layer_timesteps = config.layer_timesteps
+
+ self.use_node_types = getattr(config, 'use_node_types', False)
+ assert not self.use_node_types, "not implemented"
+
+ # Position Embeddings
+ if getattr(config, 'position_embeddings', False):
+ self.selector_size = getattr(config, 'selector_size', 0)
+ self.emb_size = config.emb_size
+ # we are going to look up the pos embs only once per batch instead of within every message layer
+ self.position_embs = PositionEmbeddings()
+ #self.register_buffer("position_embs",
+ # PositionEmbeddings()(
+ # torch.arange(512, dtype=torch.get_default_dtype()),
+ # config.emb_size,
+ # dpad=getattr(config, 'selector_size', 0),
+ # ),
+ #)
+ else:
+ self.position_embs = None
+
+ # Message and update layers
+ self.message = nn.ModuleList()
+ #for i in range(len(self.layer_timesteps)):
+ for i in range(message_layers):
+ self.message.append(TypedSelfAttentionMessageLayer(config))
+
+ update_layer = getattr(config, 'update_layer', 'ff')
+ if update_layer == 'ff':
+ UpdateLayer = TransformerUpdateLayer
+ elif update_layer == 'gru':
+ UpdateLayer = GGNNUpdateLayer
+ else:
+ raise ValueError("config.update_layer has to be 'gru' or 'ff'!")
+
+ self.update = nn.ModuleList()
+ #for i in range(len(self.layer_timesteps)):
+ for i in range(update_layers):
+ self.update.append(UpdateLayer(config))
+
+ def forward(self, edge_lists, node_states, pos_lists=None, node_types=None, test_time_steps=None):
+ old_node_states = node_states.clone()
+
+ # gather position embeddings for each edge
+ pos_emb_lists = None
+ if getattr(self, 'position_embs') is not None:
+ pos_emb_lists = []
+ for i, pl in enumerate(pos_lists):
+ # p_emb = torch.index_select(self.position_embs, dim=0, index=pl)
+ p_emb = self.position_embs(pl.to(dtype=torch.get_default_dtype()), self.emb_size, dpad=self.selector_size)
+ pos_emb_lists.append(p_emb)
+
+ # Prepare for backward edges
+ if self.backward_edges:
+ back_edge_lists = [x.flip([1]) for x in edge_lists]
+ edge_lists.extend(back_edge_lists)
+
+ # For backward edges we keep the positions of the forward edge!
+ if pos_emb_lists:
+ pos_emb_lists.extend(pos_emb_lists)
+ assert len(pos_emb_lists) == len(edge_lists)
+
+ # Actual work
+ for i in range(self.gnn_layers):
+ m_idx = i // self.message_weight_sharing
+ u_idx = i // self.update_weight_sharing
+ messages = self.message[m_idx](edge_lists, node_states, pos_emb_lists)
+ node_states = self.update[u_idx](messages, node_states, node_types)
+ return node_states, old_node_states
+
+
+###### Message Layers
+
+class SelfAttentionMessageLayer(nn.Module):
+ """Implements transformer scaled dot-product self-attention, cf. Vaswani et al. 2017,
+ in a sparse setting on a graph. This reduces the time and space complexity
+ from O(N^2 * D) to O(M * D), which is much better if the graph's average degree
+ is small, e.g. M in O(N) instead of O(N^2)!
+
+ NB:
+ The layer shares the weight-layout with pytorch's dense implementation,
+ i.e. makes them interoperable.
+
+ Position information must be added to the node_states beforehand,
+ just like in the original.
+
+ Args:
+ edge_lists: list of edge_index tensors of shape <M_i, 2> (one per edge type)
+ node_states: <N, D>
+ edges: alternatively a single edge_index <2, M>!
+ (<2, M> is the torch geometric format!)
+ Returns:
+ attn_out: messages <N, D>
+ attn_weights: optionally the attention weights
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+
+ self.bias = config.attn_bias
+ self.num_heads = config.attn_num_heads
+ self.dropout_p = config.attn_dropout
+
+ head_dim = self.embed_dim // self.num_heads
+ assert head_dim * self.num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+ # projection from input to q, k, v
+ # Myle Ott et al. apparently observed that initializing the qkv projection (in one matrix) with xavier_uniform and gain 1/sqrt(2) works much better than a gain of 1.
+ self.qkv_in_proj = LinearNet(self.embed_dim, self.embed_dim * 3, bias=self.bias, gain=1 / math.sqrt(2))
+ self.out_proj = LinearNet(self.embed_dim, self.embed_dim, bias=self.bias)
+ self.dropout = nn.Dropout(p=self.dropout_p, inplace=True)
+
+ def forward(self, edge_lists=None, node_states=None, pos_lists=None, edges=None, need_weights=False):
+ """NB: pos_lists are ignored."""
+
+ # Glue Code:
+ assert node_states is not None
+
+ # since we don't support edge-types in this layer, we just concatenate them here.
+ if edge_lists is not None:
+ assert edges is None
+ edges = torch.cat(edge_lists, dim=0).t() # t()!
+ else:
+ assert edges is not None
+ edge_sources = edges[0, :]
+ edge_targets = edges[1, :]
+
+
+ # ~~~ Sparse Self-Attention ~~~
+ # The implementation follows the official pytorch implementation, but sparse.
+ # Legend:
+ # Model hidden size D,
+ # number of attention heads h,
+ # number of edges M,
+ # number of nodes N
+ num_nodes, embed_dim = node_states.size()
+ assert embed_dim == self.embed_dim
+
+ head_dim = embed_dim // self.num_heads
+ assert head_dim * self.num_heads == embed_dim, "embed_dim must be divisible by num_heads"
+
+ # 1) get Q, K, V from node_states
+ # (needs to be merged with step 2 if we want to use positions..., because
+ # they need to be added before retrieving Q, K, V)
+
+ q, k, v = self.qkv_in_proj(node_states).chunk(3, dim=1)
+
+
+ # 2) get Q', K', V' \in <M, D> by doing an F.embedding lookup on Q, K, V (maybe transposed)
+ # according to index
+ # edge_targets for Q, and
+ # edge_sources for K, V
+ # since the receiver of messages is querying her neighbors.
+ q_prime = torch.index_select(q, dim=0, index=edge_targets)
+ k_prime = torch.index_select(k, dim=0, index=edge_sources)
+ v_prime = torch.index_select(v, dim=0, index=edge_sources)
+
+ messages, attn_weights = self.sparse_attn_forward(
+ q_prime, k_prime, v_prime,
+ num_nodes, edge_targets, need_weights
+ )
+ if need_weights:
+ return messages, attn_weights
+ return messages
+
+ def sparse_attn_forward(self, q_prime, k_prime, v_prime, num_nodes, edge_targets, need_weights):
+ """Differently from dense self-attention, we expect q', k', v',
+ which are the query, key and value projected node_states [+pos embs]
+ index_selected by edge_targets, edge_sources and edge_sources respectively.
+ Args:
+ q_prime, k_prime, v_prime: <M, D>
+ num_nodes: int(N)
+ edge_targets: <M>
+ Returns:
+ attn_out: messages <N, D>
+ attn_out_weights: optionally <M>, averaged over heads
+ """
+ embed_dim = q_prime.size()[1]
+ head_dim = embed_dim // self.num_heads
+
+ # some checks
+ assert head_dim * self.num_heads == embed_dim, "embed_dim must be divisible by num_heads"
+ assert q_prime.size() == k_prime.size(), \
+ f"q_prime, k_prime size mismatch: {q_prime.size()}, {k_prime.size()}"
+ assert q_prime.size()[0] == v_prime.size()[0], 'number of queries and values mismatch'
+
+ # ~~~ Sparse Self-Attention ~~~
+ # The implementation follows the official pytorch implementation, but sparse.
+ # Legend:
+ # Model hidden size D,
+ # number of attention heads h,
+ # number of edges M,
+ # number of nodes N
+
+ # 3) Q' * K' (hadamard) and sum over D dimension,
+ # then scaled by sqrt(D)
+ # 3*) multi-head: If we want h heads, then we should only sum within the h segments of size D//h here.
+ # We will end up with unnormalized attention scores.
+ scores_prime = q_prime * k_prime
+ # sum segments of head_dim size into num_head chunks
+ scores = scores_prime.transpose(0,1).view(self.num_heads, head_dim, -1).sum(dim=1).t().contiguous()
+ scaling = float(head_dim) ** -0.5
+ scores = scores * scaling
+ assert scores.size() == (q_prime.size()[0], self.num_heads) # <M, h>
+
+ # 4) Scattered Softmax:
+ # Perform a softmax by normalizing scores with the sum of those scores
+ # where edge_targets coincide (meaning incoming edges to the same target are normalized)
+ # we end up with normalized self-attention scores
+ # 4*) multi-head: here we run the scattered_softmax in parallel over the h dimensions independently.
+
+ # <M, h>
+ attn_output_weights = scatter_softmax(scores, index=edge_targets, num_nodes=num_nodes) # noqa: F821
+ attn_output_weights = self.dropout(attn_output_weights)
+
+ # 5) V' * (4): weight values V' by the attention weights from step 4.
+ # The result up to here are the messages traveling across edges.
+ # 5* a) multi-head: get a view of V' with dim D_v split into <h, D//h>
+ # then get back the old view
+ v_prime = v_prime.transpose(0, 1)
+ v_prime = v_prime.view(self.num_heads, head_dim, -1)
+ v_prime = v_prime.permute(2,0,1) # v_prime now: <M, h, D//h>
+
+ attn_out_per_edge = v_prime * attn_output_weights.unsqueeze(2)
+ attn_out_per_edge = attn_out_per_edge.view(-1, embed_dim)
+
+ # 6) Scatter Add: aggregate messages via index_add with index edge_target
+ # to end up with messages <N, D>
+ attn_out = torch.zeros(num_nodes, embed_dim, dtype=torch.get_default_dtype(), device=q_prime.device)
+ attn_out.index_add_(0, edge_targets, attn_out_per_edge)
+
+ # 5* b) Additionally project from the concatenation back to D. cf. vaswani et al. 2017
+ attn_out = self.out_proj(attn_out)
+
+ # now we have messages_by_targets! finally...
+
+ if need_weights:
+ # average attention weights over heads (sorted like the edges)
+ attn_output_weights = attn_output_weights.sum(dim=1) / self.num_heads
+ return attn_out, attn_output_weights
+ return attn_out, None
+
+
+class TypedSelfAttentionMessageLayer(SelfAttentionMessageLayer):
+ """Implements transformer scaled dot-product self-attention, cf. Vaswani et al. 2017,
+ in a sparse setting on a graph. This reduces the time and space complexity
+ from O(N^2 * D) to O(M * D), which is much better if the graph's average degree
+ is small, e.g. M in O(N) instead of O(N^2)!
+
+ Graph Neural Network adaptations:
+ The layer supports different edge_types:
+ Each edge type gets its own k, v projection, but queries are shared.
+ The layer supports embedding edge-position information:
+ The position embedding is added to the attention keys only or
+ optionally both to k and v, but not to q.
+
+ Forward Args:
+ edge_lists: list of edge_index tensors of size <M_i, 2>
+ node_states: <N, D>
+ pos_lists: OBS: We expect these to be pos_emb_lists, each of size <M_i, D>
+ need_weights: optionally return avg attention weights per edge of size <M>
+ Returns:
+ incoming messages per node of shape <N, D>
+ """
+
+ def __init__(self, config):
+ # init as a module without running parent __init__.
+ nn.Module.__init__(self)
+
+ self.edge_type_count = config.edge_type_count * 2 if config.backward_edges else config.edge_type_count
+ self.embed_dim = config.hidden_size
+ self.bias = config.attn_bias
+ self.num_heads = config.attn_num_heads
+ self.dropout_p = config.attn_dropout
+
+ head_dim = self.embed_dim // self.num_heads
+ assert head_dim * self.num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+ self.position_embs = getattr(config, 'position_embeddings', False)
+ self.attn_v_pos = getattr(config, 'attn_v_pos', False)
+ if not self.position_embs:
+ assert not self.attn_v_pos, "Use position_embeddings if you want attn_v_pos!"
+
+ # projection from input to q, k, v
+ # Myle Ott et al. apparently observed that initializing the qkv projection (in one matrix)
+ # with
+ # nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
+ # works much better than plain xavier.
+ #self.qkv_in_proj = LinearNet(self.embed_dim, self.embed_dim * 3, bias=self.bias, gain=1 / math.sqrt(2))
+
+ # in projection per edge type.
+ self.q_proj = LinearNet(self.embed_dim, self.embed_dim, bias=self.bias)
+ self.k_proj = nn.ModuleList()
+ self.v_proj = nn.ModuleList()
+ for i in range(self.edge_type_count):
+ self.k_proj.append(LinearNet(self.embed_dim, self.embed_dim, bias=self.bias))
+ self.v_proj.append(LinearNet(self.embed_dim, self.embed_dim, bias=self.bias))
+
+ self.out_proj = LinearNet(self.embed_dim, self.embed_dim, bias=self.bias)
+ self.dropout = nn.Dropout(p=self.dropout_p, inplace=True)
+
+ def forward(self, edge_lists, node_states, pos_lists=None, need_weights=False):
+ """Args:
+ edge_lists: list of edge_index tensors of size <M_i, 2>
+ node_states: <N, D>
+ pos_lists: OBS: We expect these to be pos_emb_lists, each of size <M_i, D>
+ need_weights: optionally return avg attention weights per edge of size <M>
+ """
+ assert len(edge_lists) == self.edge_type_count
+
+ num_nodes, embed_dim = node_states.size()
+ assert embed_dim == self.embed_dim
+
+ head_dim = embed_dim // self.num_heads
+ assert head_dim * self.num_heads == embed_dim, "embed_dim must be divisible by num_heads"
+
+ # 1) get Q', K', V' \in <M, D> from node_states
+ # by index_select according to index
+ # edge_targets for Q', and
+ # edge_sources for K', V'
+ # since the receiver of messages is querying her neighbors.
+ # 2) Optionally add pos_embs to K', V'
+ # 3) Then project from node_states into the q,k,v subspaces
+
+ q = self.q_proj(node_states)
+
+ q_primes, k_primes, v_primes, edge_targets_list = [], [], [], []
+
+ # carefully obtain keys and values and collect queries (shape sketch below).
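+ # Shape sketch (M_i = number of edges of type i, D = hidden size;
+ # shapes inferred from the code below, illustration only):
+ # el: <M_i, 2>, q_prime/k_prime/v_prime: <M_i, D>;
+ # after the loop, everything is concatenated to <M, D> with M = sum_i M_i.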
+ for i, el in enumerate(edge_lists): + edge_sources = el[:, 0] # el + edge_targets = el[:, 1] + edge_targets_list.append(edge_targets) + + q_prime = torch.index_select(q, dim=0, index=edge_targets) + + selected_nodes = torch.index_select(node_states, dim=0, index=edge_sources) + # maybe add position embeddings + if self.position_embs and self.attn_v_pos: + selected_nodes = selected_nodes + pos_lists[i] + v_prime = self.v_proj[i](selected_nodes) + k_prime = self.k_proj[i](selected_nodes) + elif self.position_embs: # but not on v + v_prime = self.v_proj[i](selected_nodes) + selected_nodes = selected_nodes + pos_lists[i] + k_prime = self.k_proj[i](selected_nodes) + else: + v_prime = self.v_proj[i](selected_nodes) + k_prime = self.k_proj[i](selected_nodes) + + q_primes.append(q_prime) + k_primes.append(k_prime) + v_primes.append(v_prime) + + edge_targets = torch.cat(edge_targets_list, dim=0) + q_prime = torch.cat(q_primes, dim=0) + k_prime = torch.cat(k_primes, dim=0) + v_prime = torch.cat(v_primes, dim=0) + + # ~~~~ From here, we are back in the general sparse self-attention setting ~~~~~ + messages, attn_weights = self.sparse_attn_forward(q_prime, k_prime, v_prime, + num_nodes, edge_targets, need_weights) + if need_weights: + return messages, attn_weights + return messages + + +class GGNNMessageLayer(nn.Module): + """Implements the MLP message function of the GGNN architecture, + optionally with position information embedded on edges. + Args: + edge_lists (for each edge type) + node_states + pos_lists (optionally) + Returns: + incoming messages per node of shape """ + + def __init__(self, config): + super().__init__() + self.edge_type_count = ( + config.edge_type_count * 2 + if config.backward_edges + else config.edge_type_count + ) + self.msg_mean_aggregation = config.msg_mean_aggregation + self.dim = config.hidden_size + + self.transform = LinearNet( + self.dim, + self.dim * self.edge_type_count, + bias=config.use_edge_bias, + dropout=config.edge_weight_dropout, + ) + + self.pos_transform = None + if getattr(config, 'position_embeddings', False): + self.selector_size = getattr(config, 'selector_size', 0) + self.emb_size = config.emb_size + self.position_embs = PositionEmbeddings() + + #legacy + #self.register_buffer( + # "position_embs", + # PositionEmbeddings()( + # torch.arange(512, dtype=torch.get_default_dtype()), + # config.emb_size, + # dpad=getattr(config, 'selector_size', 0), + # ), + #) + self.pos_transform = LinearNet( + self.dim, + self.dim, + bias=config.use_edge_bias, + dropout=config.edge_weight_dropout, + ) + + def forward(self, edge_lists, node_states, pos_lists=None): + """edge_lists: [, ...]""" + + # all edge types are handled in one matrix, but we + # let propagated_states[i] be equal to the case with only edge_type i + #propagated_states = ( + # self.transform(node_states) + # .transpose(0, 1) + # .view(self.edge_type_count, self.dim, -1) + #) + propagated_states = self.transform(node_states).chunk(self.edge_type_count, dim=1) + + messages_by_targets = torch.zeros_like(node_states) + if self.msg_mean_aggregation: + device = node_states.device + bincount = torch.zeros( + node_states.size()[0], dtype=torch.long, device=device + ) + + for i, edge_list in enumerate(edge_lists): + edge_targets = edge_list[:, 1] + edge_sources = edge_list[:, 0] + + #messages_by_source = F.embedding( + # edge_sources, propagated_states[i].transpose(0, 1) + #) + messages_by_source = torch.index_select(propagated_states[i], dim=0, index=edge_sources) + + if self.pos_transform: + pos_list = 
pos_lists[i]
+ # torch.index_select(pos_gating, dim=0, index=pos_list)
+ pos_by_source = self.position_embs(pos_list.to(dtype=torch.get_default_dtype()), self.emb_size, dpad=self.selector_size)
+
+ pos_gating_by_source = 2 * torch.sigmoid(self.pos_transform(pos_by_source))
+
+ #messages_by_source.mul_(pos_by_source)
+ messages_by_source = messages_by_source * pos_gating_by_source
+
+ messages_by_targets.index_add_(0, edge_targets, messages_by_source)
+
+ if self.msg_mean_aggregation:
+ bins = edge_targets.bincount(minlength=node_states.size()[0])
+ bincount += bins
+
+ if self.msg_mean_aggregation:
+ divisor = bincount.float()
+ divisor[bincount == 0] = 1.0 # avoid div by zero for lonely nodes
+ # divide the summed messages by the (stabilized) in-degree to get the mean.
+ #messages_by_targets /= divisor.unsqueeze_(1) + SMALL_NUMBER
+ messages_by_targets = messages_by_targets / (divisor.unsqueeze_(1) + SMALL_NUMBER)
+
+ return messages_by_targets
+
+
+class PositionEmbeddings(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, positions, demb, dpad: int = 0):
+ """Transformer-like sinusoidal positional embeddings.
+ Args:
+ positions: 1d long Tensor of positions
+ demb: int size of embedding vector
+ dpad: number of trailing zero-padded dimensions (e.g. for selector embeddings)
+ Returns:
+ pos_emb: <len(positions), demb + dpad>, sine channels in the first half,
+ cosine channels in the second half.
+ """
+ inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0, device=positions.device) / demb))
+
+ sinusoid_inp = torch.ger(positions, inv_freq)
+ pos_emb = torch.cat(
+ (torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1
+ )
+
+ if dpad > 0:
+ in_length = positions.size()[0]
+ # create the pad on the same device as the embeddings to avoid device mismatches.
+ pad = torch.zeros((in_length, dpad), device=positions.device)
+ pos_emb = torch.cat([pos_emb, pad], dim=1)
+ assert torch.all(
+ pos_emb[:, -1] == torch.zeros(in_length, device=positions.device)
+ ), f"test failed. pos_emb: \n{pos_emb}"
+
+ return pos_emb
+
+ # def forward(self, positions, dim, out):
+ # assert dim > 0, f'dim of position embs has to be > 0'
+ # power = 2 * (positions / 2) / dim
+ # position_enc = np.array(
+ # [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
+ # for pos in range(n_pos)])
+ # out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
+ # out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
+ # out.detach_()
+ # out.requires_grad = False
+
+
+####### Update Layers
+
+class GGNNUpdateLayer(nn.Module):
+ """GRU update function of GGNN architecture, optionally distinguishing two kinds of node types.
+ Args:
+ incoming messages <N, D> (from message layer),
+ node_states <N, D>,
+ node_types <N> (optional)
+ Returns:
+ updated node_states <N, D>
+ """
+ def __init__(self, config):
+ super().__init__()
+ self.dropout = config.graph_state_dropout
+ # TODO(github.com/ChrisCummins/ProGraML/issues/27): Maybe decouple hidden
+ # GRU size: make hidden GRU size larger and EdgeTrafo size non-square
+ # instead? Or implement stacking gru layers between message passing steps.
+
+ self.gru = nn.GRUCell(
+ input_size=config.hidden_size, hidden_size=config.hidden_size
+ )
+
+ # currently only admits node types 0 and 1 for statements and identifiers.
+ self.use_node_types = getattr(config, 'use_node_types', False)
+ if self.use_node_types:
+ self.id_gru = nn.GRUCell(
+ input_size=config.hidden_size, hidden_size=config.hidden_size
+ )
+
+ def forward(self, messages, node_states, node_types=None):
+ if self.use_node_types:
+ assert node_types is not None, "Need to provide node_types if config.use_node_types!"
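+ # Example of the type routing below (illustration only): for
+ # node_types = [0, 1, 0], stmt_mask selects rows 0 and 2 (statement
+ # nodes -> self.gru) and id_mask selects row 1 (identifier nodes -> self.id_gru).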
+ output = torch.zeros_like(node_states, device=node_states.device) + stmt_mask = node_types == 0 + output[stmt_mask] = self.gru(messages[stmt_mask], node_states[stmt_mask]) + id_mask = node_types == 1 + output[id_mask] = self.id_gru(messages[id_mask], node_states[id_mask]) + else: + output = self.gru(messages, node_states) + + if self.dropout > 0.0: + F.dropout(output, p=self.dropout, training=self.training, inplace=True) + return output + +class TransformerUpdateLayer(nn.Module): + """Represents the residual MLP around the self-attention in the transformer + encoder layer. The implementation is sparse for usage in GNNs. + + Args: + messages (from self-attention layer) + node_states + node_types (optional and not yet implemented!) + Returns: + updated node_states + """ + def __init__(self, config): + super().__init__() + self.use_node_types = getattr(config, 'use_node_types', False) + assert not self.use_node_types, "not implemented" + + activation = config.tfmr_act # relu or gelu, default relu + dropout = config.tfmr_dropout # default 0.1 + dim_feedforward = config.tfmr_ff_sz # ~ 2.5 * model dim + + # Implementation of Feedforward model + self.linear1 = nn.Linear(config.hidden_size, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, config.hidden_size) + + self.norm1 = nn.LayerNorm(config.hidden_size) + self.norm2 = nn.LayerNorm(config.hidden_size) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = self.get_activation_fn(activation) + + def get_activation_fn(self, activation): + if activation == "relu": + return F.relu + elif activation == "gelu": + return F.gelu + else: + raise RuntimeError("activation should be relu/gelu, not %s." % activation) + + def forward(self, messages, node_states, node_types=None): + + # message layer is elsewhere! + #messages = self.self_attn(src, src, src)[0] + + # 1st 'Add & Norm' block (cf. vaswani et al. 2017, fig. 1) + node_states = node_states + self.dropout1(messages) + node_states = self.norm1(node_states) + + # 'Feed Forward' block + messages = self.linear2(self.dropout(self.activation(self.linear1(node_states)))) + + # 2nd 'Add & Norm' block + node_states = node_states + self.dropout2(messages) + node_states = self.norm2(node_states) + + return node_states + + +######################################## +# GNN Output Layers +######################################## + + +class Readout(nn.Module): + """aka GatedRegression. See Eq. 4 in Gilmer et al. 2017 MPNN.""" + + def __init__(self, config): + super().__init__() + self.has_graph_labels = config.has_graph_labels + self.num_classes = config.num_classes + self.use_tanh_readout = getattr(config, 'use_tanh_readout', False) + + self.regression_gate = LinearNet( + 2 * config.hidden_size, self.num_classes, dropout=config.output_dropout, + ) + self.regression_transform = LinearNet( + config.hidden_size, self.num_classes, dropout=config.output_dropout, + ) + + def forward(self, raw_node_in, raw_node_out, graph_nodes_list=None, + num_graphs=None, auxiliary_features=None, readout_mask=None): + if readout_mask is not None: + # mask first to only process the stuff that goes into the loss function! 
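+ # (After masking, the gated readout below follows Gilmer et al. 2017, Eq. 4:
+ # r_v = sigmoid(regression_gate([h_v^T, h_v^0])) * regression_transform(h_v^T),
+ # summed over the nodes v of each graph when graph labels are used.)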
+ raw_node_in = raw_node_in[readout_mask] + raw_node_out = raw_node_out[readout_mask] + if graph_nodes_list is not None: + graph_nodes_list = graph_nodes_list[readout_mask] + + gate_input = torch.cat((raw_node_in, raw_node_out), dim=-1) + gating = torch.sigmoid(self.regression_gate(gate_input)) + if not self.use_tanh_readout: + nodewise_readout = gating * self.regression_transform(raw_node_out) + else: + nodewise_readout = gating * torch.tanh(self.regression_transform(raw_node_out)) + + graph_readout = None + if self.has_graph_labels: + assert graph_nodes_list is not None and num_graphs is not None, 'has_graph_labels requires graph_nodes_list and num_graphs tensors.' + # aggregate via sums over graphs + device = raw_node_out.device + graph_readout = torch.zeros(num_graphs, self.num_classes, device=device) + graph_readout.index_add_( + dim=0, index=graph_nodes_list, source=nodewise_readout + ) + if self.use_tanh_readout: + graph_readout = torch.tanh(graph_readout) + return nodewise_readout, graph_readout + + +class LinearNet(nn.Module): + """Single Linear layer with WeightDropout, ReLU and Xavier Uniform + initialization. Applies a linear transformation to the incoming data: + :math:`y = xA^T + b` + + Args: + in_features: size of each input sample + out_features: size of each output sample + bias: If set to ``False``, the layer will not learn an additive bias. + Default: ``True`` + + Shape: + - Input: :math:`(N, *, H_{in})` where :math:`*` means any number of + additional dimensions and :math:`H_{in} = \text{in\_features}` + - Output: :math:`(N, *, H_{out})` where all but the last dimension + are the same shape as the input and :math:`H_{out} = \text{out\_features}`. + """ + + def __init__(self, in_features, out_features, bias=True, dropout=0.0, gain=1.0): + super().__init__() + self.dropout = dropout + self.in_features = in_features + self.out_features = out_features + self.gain = gain + self.test = nn.Parameter(torch.Tensor(out_features, in_features)) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_features)) + else: + self.register_parameter("bias", None) + self.reset_parameters() + + def reset_parameters(self): + nn.init.xavier_uniform_(self.test, gain=self.gain) + if self.bias is not None: + # fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight) + # bound = 1 / math.sqrt(fan_in) + # nn.init.uniform_(self.bias, -bound, bound) + nn.init.zeros_(self.bias) + + def forward(self, input): + if self.dropout > 0.0: + w = F.dropout(self.test, p=self.dropout, training=self.training) + else: + w = self.test + return F.linear(input, w, self.bias) + + def extra_repr(self): + return "in_features={}, out_features={}, bias={}, dropout={}".format( + self.in_features, self.out_features, self.bias is not None, self.dropout, + ) + + +########################################### +# Mixing in graph-level features to readout + +class AuxiliaryReadout(nn.Module): + """Produces per-graph predictions by combining + the per-graph predictions with auxiliary features. 
+ Note that this AuxiliaryReadout after Readout is probably a bad idea + and BetterAuxiliaryReadout should be used instead.""" + + def __init__(self, config): + super().__init__() + self.num_classes = config.num_classes + self.aux_in_log1p = getattr(config, "aux_in_log1p", False) + assert ( + config.has_graph_labels + ), "We expect aux readout in combination with graph labels, not node labels" + self.feed_forward = None + + self.batch_norm = nn.BatchNorm1d(config.num_classes + config.aux_in_size) + self.feed_forward = nn.Sequential( + nn.Linear( + config.num_classes + config.aux_in_size, config.aux_in_layer_size, + ), + nn.ReLU(), + nn.Dropout(config.output_dropout), + nn.Linear(config.aux_in_layer_size, config.num_classes), + ) + + def forward(self, graph_features, auxiliary_features): + assert ( + graph_features.size()[0] == auxiliary_features.size()[0] + ), "every graph needs aux_features. Dimension mismatch." + if self.aux_in_log1p: + auxiliary_features.log1p_() + + aggregate_features = torch.cat((graph_features, auxiliary_features), dim=1) + + normed_features = self.batch_norm(aggregate_features) + out = self.feed_forward(normed_features) + return out, graph_features + + + +class BetterAuxiliaryReadout(nn.Module): + """Produces per-graph predictions by combining + the raw GNN Encoder output with auxiliary features. + The difference to AuxReadout(Readout()) is that the aux info + is concat'ed before the nodewise readout and not after the + reduction to graphwise predictions. + """ + + def __init__(self, config): + super().__init__() + + self.aux_in_log1p = getattr(config, "aux_in_log1p", False) + assert config.has_graph_labels, \ + "We expect aux readout in combination with graph labels, not node labels" + + self.has_graph_labels = config.has_graph_labels + self.num_classes = config.num_classes + + # now with aux_in concat'ed and batchnorm + self.regression_gate = nn.Sequential( + nn.BatchNorm1d(2 * config.hidden_size + config.aux_in_size), + LinearNet(2 * config.hidden_size + config.aux_in_size, + self.num_classes, dropout=config.output_dropout, + ) + ) + # now with aux_in concat'ed and with intermediate layer + self.regression_transform = nn.Sequential( + nn.BatchNorm1d(config.hidden_size + config.aux_in_size), + LinearNet(config.hidden_size + config.aux_in_size, + config.aux_in_layer_size, dropout=config.output_dropout, + ), + nn.ReLU(), + LinearNet(config.aux_in_layer_size, config.num_classes), + ) + + def forward(self, raw_node_in, raw_node_out, graph_nodes_list, num_graphs, auxiliary_features, readout_mask=None): + assert graph_nodes_list is not None and auxiliary_features is not None, 'need those' + if readout_mask is not None: + # mask first to only process the stuff that goes into the loss function! + raw_node_in = raw_node_in[readout_mask] + raw_node_out = raw_node_out[readout_mask] + if graph_nodes_list is not None: + graph_nodes_list = graph_nodes_list[readout_mask] + + if self.aux_in_log1p: + auxiliary_features.log1p_() + aux_by_node = torch.index_select(auxiliary_features, dim=0, index=graph_nodes_list) + + # info: the gate and regression include batch norm inside! 
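+ # Shape sketch (D = hidden_size, A = aux_in_size; see __init__ above,
+ # illustration only): gate input <N, 2*D + A>, transform input <N, D + A>,
+ # where the aux features are broadcast to the nodes of each graph via graph_nodes_list.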
+ gate_input = torch.cat((raw_node_in, raw_node_out, aux_by_node), dim=-1) + gating = torch.sigmoid(self.regression_gate(gate_input)) + trafo_input = torch.cat((raw_node_out, aux_by_node), dim=-1) + nodewise_readout = gating * self.regression_transform(trafo_input) + + graph_readout = None + if self.has_graph_labels: + assert graph_nodes_list is not None and num_graphs is not None, 'has_graph_labels requires graph_nodes_list and num_graphs tensors.' + # aggregate via sums over graphs + device = raw_node_out.device + graph_readout = torch.zeros(num_graphs, self.num_classes, device=device) + graph_readout.index_add_( + dim=0, index=graph_nodes_list, source=nodewise_readout + ) + return nodewise_readout, graph_readout + + +############################ +# GNN Input: Embedding Layers +############################ +#class NodeEmbeddingsForPretraining(nn.Module): +# """NodeEmbeddings with added embedding for [MASK] token.""" +# +# def __init__(self, config): +# super().__init__() +# +# print("Initializing with random embeddings for pretraining.") +# self.node_embs = nn.Embedding(config.vocab_size + 1, config.emb_size) +# +# def forward(self, vocab_ids): +# embs = self.node_embs(vocab_ids) +# return embs + + +class NodeEmbeddings(nn.Module): + """Construct node embeddings from node ids + Args: + pretrained_embeddings (Tensor, optional) – FloatTensor containing weights for + the Embedding. First dimension is being passed to Embedding as + num_embeddings, second as embedding_dim. + + Forward + Args: + vocab_ids: + Returns: + node_states: + """ + + # TODO(github.com/ChrisCummins/ProGraML/issues/27):: Maybe LayerNorm and + # Dropout on node_embeddings? + # TODO(github.com/ChrisCummins/ProGraML/issues/27):: Make selector embs + # trainable? + + def __init__(self, config, pretrained_embeddings=None): + super().__init__() + self.inst2vec_embeddings = config.inst2vec_embeddings + self.emb_size = config.emb_size + + if config.inst2vec_embeddings == "constant": + print("Using pre-trained inst2vec embeddings frozen.") + assert pretrained_embeddings is not None + assert pretrained_embeddings.size()[0] == 8568, "Wrong number of embs; don't come here with MLM models!" + self.node_embs = nn.Embedding.from_pretrained( + pretrained_embeddings, freeze=True + ) + elif config.inst2vec_embeddings == "zero": + init = torch.zeros(config.vocab_size, config.emb_size) + self.node_embs = nn.Embedding.from_pretrained(init, freeze=True) + elif config.inst2vec_embeddings == "constant_random": + init = torch.rand(config.vocab_size, config.emb_size) + self.node_embs = nn.Embedding.from_pretrained(init, freeze=True) + elif config.inst2vec_embeddings == "finetune": + print("Fine-tuning inst2vec embeddings") + assert pretrained_embeddings is not None + assert pretrained_embeddings.size()[0] == 8568, "Wrong number of embs; don't come here with MLM models!" 
+ self.node_embs = nn.Embedding.from_pretrained( + pretrained_embeddings, freeze=False + ) + elif config.inst2vec_embeddings == "random": + print("Initializing with random embeddings") + self.node_embs = nn.Embedding(config.vocab_size, config.emb_size) + elif config.inst2vec_embeddings == "none": + print("Initializing with a embedding for statements and identifiers each.") + self.node_embs = nn.Embedding(2, config.emb_size) + else: + raise NotImplementedError(config.inst2vec_embeddings) + + + def forward(self, vocab_ids, *ignored_args, **ignored_kwargs): + if self.inst2vec_embeddings == 'none': + # map IDs to 1 and everything else to 0 + ids = (vocab_ids == 8565).to(torch.long) # !IDENTIFIER token id + embs = self.node_embs(ids) + else: # normal embeddings + embs = self.node_embs(vocab_ids) + + return embs + + +class NodeEmbeddingsWithSelectors(NodeEmbeddings): + """Construct node embeddings as content embeddings + selector embeddings. + + Args: + pretrained_embeddings (Tensor, optional) – FloatTensor containing weights for + the Embedding. First dimension is being passed to Embedding as + num_embeddings, second as embedding_dim. + + Forward + Args: + vocab_ids: + selector_ids: + Returns: + node_states: + """ + def __init__(self, config, pretrained_embeddings=None): + super().__init__(config, pretrained_embeddings) + + self.node_embs = super().forward + assert config.use_selector_embeddings, "This Module is for use with use_selector_embeddings!" + + selector_init = torch.tensor( + # TODO(github.com/ChrisCummins/ProGraML/issues/27): x50 is maybe a + # problem for unrolling (for selector_embs)? + [[0, 50.0], [50.0, 0]], + dtype=torch.get_default_dtype(), + ) + self.selector_embs = nn.Embedding.from_pretrained( + selector_init, freeze=True + ) + + def forward(self, vocab_ids, selector_ids): + node_embs = self.node_embs(vocab_ids) + selector_embs = self.selector_embs(selector_ids) + embs = torch.cat((node_embs, selector_embs), dim=1) + return embs + + +############################# +# Loss Accuracy Prediction +############################# + + +class Loss(nn.Module): + """Cross Entropy loss with weighted intermediate loss, and + L2 loss if num_classes is just 1. + """ + + def __init__(self, config): + super().__init__() + self.config = config + if config.num_classes == 1: + # self.loss = nn.BCEWithLogitsLoss() # in: (N, *), target: (N, *) + self.loss = nn.MSELoss() + # self.loss = nn.L1Loss() + else: + # class labels '-1' don't contribute to the gradient! + # however in most cases it will be more efficient to gather + # the relevant data into a dense tensor + self.loss = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean') + #loss = F.nll_loss( + # F.log_softmax(logits, dim=-1, dtype=torch.float32), + # targets, + # reduction='mean', + # ignore_index=-1, + #) + + def forward(self, logits, targets): + """inputs: (logits) or (logits, intermediate_logits)""" + if self.config.num_classes == 1: + l = torch.sigmoid(logits[0]) + logits = (l, logits[1]) + loss = self.loss(logits[0].squeeze(dim=1), targets) + if getattr(self.config, 'has_aux_input', False): + loss = loss + self.config.intermediate_loss_weight * self.loss( + logits[1], targets + ) + return loss + + +class Metrics(nn.Module): + """Common metrics and info for inspection of results. 
+ Args:
+ logits, labels
+ Returns:
+ (accuracy, correct_preds, targets[, actual_runtimes, optimal_runtimes])"""
+
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, logits, labels, runtimes=None):
+ # be flexible with 1hot labels vs indices
+ if len(labels.size()) == 2:
+ targets = labels.argmax(dim=1)
+ elif len(labels.size()) == 1:
+ targets = labels
+ else:
+ raise ValueError(f"labels={labels.size()} tensor is neither 1 nor 2-dimensional. :/")
+
+
+ pred_targets = logits.argmax(dim=1)
+ correct_preds = targets.eq(pred_targets).float()
+ accuracy = torch.mean(correct_preds)
+
+ ret = accuracy, correct_preds, targets
+
+ if runtimes is not None:
+ assert runtimes.size() == logits.size(), \
+ "We need to have a runtime for each sample and every possible label! " \
+ f"runtimes={runtimes.size()}, logits={logits.size()}."
+ #actual = torch.index_select(runtimes, dim=1, index=pred_targets)
+ actual = torch.gather(runtimes, dim=1, index=pred_targets.view(-1, 1)).squeeze()
+ #actual = runtimes[:, pred_targets]
+ optimal = torch.gather(runtimes, dim=1, index=targets.view(-1, 1)).squeeze()
+ #optimal = runtimes[:, targets]
+ ret += (actual, optimal)
+
+ return ret
+
+
+# Huggingface implementation
+# perplexity = torch.exp(torch.tensor(eval_loss)), where eval_loss is just the average cross-entropy.
diff --git a/programl/task/graph_level_classification/run.py b/programl/task/graph_level_classification/run.py
new file mode 100644
index 000000000..9ba355e04
--- /dev/null
+++ b/programl/task/graph_level_classification/run.py
@@ -0,0 +1,831 @@
+"""
+Usage:
+ run.py [options]
+
+Options:
+ -h --help Show this screen.
+ --data_dir DATA_DIR Directory(*) of the dataset. (*)=relative to repository root ProGraML/.
+ Will overwrite the per-dataset defaults if provided.
+
+ --log_dir LOG_DIR Directory(*) to store logfiles and trained models relative to repository dir.
+ [default: deeplearning/ml4pl/poj104/logs/unspecified]
+ --model MODEL The model to run.
+ --dataset DATASET The dataset to use.
+ --config CONFIG Path(*) to a config json dump with params.
+ --config_json CONFIG_JSON Config json with params.
+ --restore CHECKPOINT Path(*) to a model file to restore from.
+ --skip_restore_config Whether to skip restoring the config from CHECKPOINT.
+ --test Test the model without training.
+ --restore_by_pattern PATTERN Restore newest model of this name from log_dir and
+ continue training. (AULT specific!)
+ PATTERN is a string that can be grep'ed for.
+ --kfold Run kfold cross-validation iff kfold is set.
+ Splits are currently dataset specific.
+ --transfer MODEL The model-class to transfer to.
+ The args specified will be applied to the transferred model to the extent applicable, e.g.
+ training params and Readout module specifications, but not to the transferred model trunk.
+ However, we strongly recommend making all trunk parameters match, so that transferred
+ checkpoints can be restored without having to pass a matching config manually.
+ --transfer_mode MODE One of frozen, finetune (finetune is not yet implemented) [default: frozen]
+ Mode frozen also sets all dropout in the restored model to zero (the newly initialized
+ readout function can have dropout nonetheless, depending on the config provided).
+ --skip_save_every_epoch Skip saving the latest model after every epoch (on a rolling basis).
+""" + + +import pickle, time, os, json, sys +from pathlib import Path + +from docopt import docopt +import tqdm +import numpy as np +import torch +from torch_geometric.data import Data, InMemoryDataset, DataLoader # (see below) + +# make this file executable from anywhere +#if __name__ == '__main__': +full_path = os.path.realpath(__file__) +print(full_path) +REPO_ROOT = full_path.rsplit('ProGraML', maxsplit=1)[0] + 'ProGraML' +print(REPO_ROOT) +#insert at 1, 0 is the script path (or '' in REPL) +sys.path.insert(1, REPO_ROOT) +REPO_ROOT = Path(REPO_ROOT) + +from deeplearning.ml4pl.poj104.dataloader import NodeLimitedDataLoader + +from deeplearning.ml4pl.models.ggnn.modeling import ( + GGNNModel, + GraphTransformerModel, +) +from deeplearning.ml4pl.models.ggnn.configs import ( + ProGraMLBaseConfig, + GGNN_POJ104_Config, + GGNN_ForPretraining_Config, + GGNN_Devmap_Config, + GGNN_Threadcoarsening_Config, + GGNN_BranchPrediction_Config, + GraphTransformer_POJ104_Config, + GraphTransformer_Devmap_Config, + GraphTransformer_Threadcoarsening_Config, + GraphTransformer_BranchPrediction_Config, + GraphTransformer_ForPretraining_Config, +) + +from deeplearning.ml4pl.poj104.dataset import ( + POJ104Dataset, + NCCDataset, + ThreadcoarseningDataset, + DevmapDataset, + BranchPredictionDataset, +) + +# Importing twice like this enables restoring +from deeplearning.ml4pl.models.ggnn import modeling +from deeplearning.ml4pl.models.ggnn import configs + + + + +# Slurm gives us among others: SLURM_JOBID, SLURM_JOB_NAME, +# SLURM_JOB_DEPENDENCY (set to the value of the --dependency option) +if os.environ.get('SLURM_JOBID'): + print('SLURM_JOB_NAME', os.environ.get('SLURM_JOB_NAME', '')) + print('SLURM_JOBID', os.environ.get('SLURM_JOBID', '')) + RUN_ID = "_".join([os.environ.get('SLURM_JOB_NAME', ''), os.environ.get('SLURM_JOBID')]) +else: + RUN_ID = str(os.getpid()) + + + + +MODEL_CLASSES = { + 'ggnn_poj104': (GGNNModel, GGNN_POJ104_Config), + 'ggnn_devmap': (GGNNModel, GGNN_Devmap_Config), + 'ggnn_threadcoarsening': (GGNNModel, GGNN_Threadcoarsening_Config), + 'ggnn_branch_prediction': (GGNNModel, GGNN_BranchPrediction_Config), + 'ggnn_pretraining': (GGNNModel, GGNN_ForPretraining_Config), + 'transformer_poj104': (GraphTransformerModel, GraphTransformer_POJ104_Config), + 'transformer_devmap': (GraphTransformerModel, GraphTransformer_Devmap_Config), + 'transformer_threadcoarsening': (GraphTransformerModel, GraphTransformer_Threadcoarsening_Config), + 'transformer_branch_prediction': (GraphTransformerModel, GraphTransformer_BranchPrediction_Config), + 'transformer_pretraining': (GraphTransformerModel, GraphTransformer_ForPretraining_Config), +} + +DATASET_CLASSES = { #DS, default data_dir, + 'poj104': (POJ104Dataset, 'deeplearning/ml4pl/poj104/classifyapp_data'), + 'ncc': (NCCDataset, 'deeplearning/ml4pl/poj104/ncc_data'), + 'devmap_amd': (DevmapDataset, 'deeplearning/ml4pl/poj104/devmap_data'), + 'devmap_nvidia': (DevmapDataset, 'deeplearning/ml4pl/poj104/devmap_data'), + 'threadcoarsening_Cypress': (ThreadcoarseningDataset, 'deeplearning/ml4pl/poj104/threadcoarsening_data'), + 'threadcoarsening_Tahiti': (ThreadcoarseningDataset, 'deeplearning/ml4pl/poj104/threadcoarsening_data'), + 'threadcoarsening_Fermi': (ThreadcoarseningDataset, 'deeplearning/ml4pl/poj104/threadcoarsening_data'), + 'threadcoarsening_Kepler': (ThreadcoarseningDataset, 'deeplearning/ml4pl/poj104/threadcoarsening_data'), + 'branch_prediction': (BranchPredictionDataset, 'deeplearning/ml4pl/poj104/branch_prediction_data'), +} + +DEBUG = 
False +if DEBUG: + torch.autograd.set_detect_anomaly(True) + +class Learner(object): + def __init__(self, model, dataset, args=None, current_kfold_split=None): + # Make class work without file being run as main + self.args = docopt(__doc__, argv=[]) + if args: + self.args.update(args) + + # prepare logging + self.parent_run_id = None # for restored models + self.run_id = f"{time.strftime('%Y-%m-%d_%H:%M:%S')}_{RUN_ID}" + if args['--kfold']: + self.run_id += f'_{current_kfold_split}' + + log_dir = REPO_ROOT / self.args.get("--log_dir", '.') + log_dir.mkdir(parents=True, exist_ok=True) + self.log_file = log_dir / f"{self.run_id}_log.json" + self.best_model_file = log_dir / f"{self.run_id}_model_best.pickle" + self.last_model_file = log_dir / f"{self.run_id}_model_last.pickle" + + # ~~~~~~~~~~ load model ~~~~~~~~~~~~~ + if self.args.get('--restore'): + self.model = self.restore_model(path=REPO_ROOT / self.args['--restore']) + elif self.args.get('--restore_by_pattern'): + self.model = self.restore_by_pattern(pattern=self.args['--restore_by_pattern'], + log_dir=log_dir, + current_kfold_split=current_kfold_split) + else: # initialize fresh model + # get model and dataset + assert model, "Need to provide --model to initialize freshly." + Model, Config = MODEL_CLASSES[model] + + self.global_training_step = 0 + self.current_epoch = 1 + + # get config + params = self.parse_config_params(args) + self.config = Config.from_dict(params=params) + + test_only = self.args.get('--test', False) + self.model = Model(self.config, test_only=test_only) + + # set seeds, NB: the NN on CUDA is partially non-deterministic! + torch.manual_seed(self.config.random_seed) + np.random.seed(self.config.random_seed) + + # ~~~~~~~~~~ transfer model ~~~~~~~~ + if self.args['--transfer'] is not None: + self.transfer_model(self.args['--transfer'], self.args['--transfer_mode']) + + + # ~~~~~~~~~~ load data ~~~~~~~~~~~~~ + self.load_data(dataset, args['--kfold'], current_kfold_split) + + # log config to file + config_dict = self.config.to_dict() + with open(log_dir / f"{self.run_id}_params.json", "w") as f: + json.dump(config_dict, f) + + # log parent run to file if run was restored + if self.parent_run_id: + with open(log_dir / f"{self.run_id}_parent.json", "w") as f: + json.dump({ + "parent": self.parent_run_id, + "self": self.run_id, + "self_config": config_dict, + }, f) + + print( + "Run %s starting with following parameters:\n%s" + % (self.run_id, json.dumps(config_dict)) + ) + + def load_data(self, dataset, kfold, current_kfold_split): + """Set self.train_data, self.test_data, self.valid_data depending on the dataset used.""" + if not kfold: assert current_kfold_split is None + if '_' in dataset: + split = dataset.rsplit('_', maxsplit=1)[-1] + Dataset, data_dir = DATASET_CLASSES[dataset] + if self.args.get('--data_dir', '.'): + self.data_dir = REPO_ROOT / self.args.get('--data_dir', '.') + else: + self.data_dir = REPO_ROOT / data_dir + + # Switch cases by dataset + # ~~~~~~~~~~ NCC ~~~~~~~~~~~~~~~~~~~~~ + if dataset == 'ncc': + # train set + if not self.args.get('--test'): + # take train_subset=[90,100] as validation data + if self.config.train_subset == [0, 100]: + print(f"!!!!!!!! WARNING !!!!!!!!!!!!") + print(f"SETTING TRAIN_SUBSET FROM [0,100] TO [0, 90]") + print(f"!!!!!!!! 
WARNING !!!!!!!!!!!!") + self.config.train_subset = [0,90] + train_dataset = Dataset(root=self.data_dir, split='train', train_subset=self.config.train_subset) + train_dataset = train_dataset.filter_max_num_nodes(self.config.max_num_nodes) + self.train_data = NodeLimitedDataLoader(train_dataset, + batch_size=self.config.batch_size, + shuffle=True, + max_num_nodes=self.config.max_num_nodes, + warn_on_limit=True, + ) + # valid set (and test set) + valid_dataset = Dataset(root=self.data_dir, split='train', train_subset=[90,100]) + valid_dataset = valid_dataset.filter_max_num_nodes(self.config.max_num_nodes) + self.valid_data = DataLoader(valid_dataset, batch_size=self.config.batch_size * 2, shuffle=False) + self.test_data = None + # ~~~~~~~~~~ POJ 104 ~~~~~~~~~~~~~~~~~~~~~ + elif dataset == 'poj104': + if not self.args.get('--test'): + train_dataset = Dataset(root=self.data_dir, + split='train', + train_subset=self.config.train_subset, + cdfg=self.config.cdfg_vocab, + ablation_vocab=self.config.ablation_vocab) + self.train_data = DataLoader(train_dataset, + batch_size=self.config.batch_size, + shuffle=True, + #max_num_nodes=self.config.max_num_nodes + ) + + self.valid_data = DataLoader(Dataset(root=self.data_dir, + split='val', + cdfg=self.config.cdfg_vocab, + ablation_vocab=self.config.ablation_vocab), + batch_size=self.config.batch_size * 2, + shuffle=False + ) + self.test_data = DataLoader(Dataset(root=self.data_dir, + split='test', + cdfg=self.config.cdfg_vocab, + ablation_vocab=self.config.ablation_vocab), + batch_size=self.config.batch_size * 2, + shuffle=False + ) + + # ~~~~~~~~~~ DEVMAP ~~~~~~~~~~~~~~~~~~~~~ + elif dataset in ['devmap_amd', 'devmap_nvidia']: + assert kfold and current_kfold_split is not None, "Devmap only supported with kfold flag!" + assert current_kfold_split < 10 + # get the whole dataset then get the correct split + ds = Dataset(root=self.data_dir, + split=split, + train_subset=self.config.train_subset, + cdfg=self.config.cdfg, + ablation_vocab=self.config.ablation_vocab) + train_dataset, valid_dataset = ds.return_cross_validation_splits(current_kfold_split) + + self.train_data = None + self.valid_data = DataLoader(valid_dataset, batch_size=self.config.batch_size * 2, shuffle=False) + + # only maybe set train_data. + if not self.args.get('--test'): + self.train_data = DataLoader(train_dataset, + batch_size=self.config.batch_size, + shuffle=True, + ) + self.test_data = None + + # ~~~~~~~~~~ THREADCOARSENING ~~~~~~~~~~~~~~~~~~~~~ + elif dataset in ['threadcoarsening' + '_' + s for s in ['Cypress', 'Tahiti', 'Fermi', 'Kepler']]: + assert kfold and current_kfold_split is not None, "Threadcoarsening only supported with kfold flag!" + assert current_kfold_split < 17 and current_kfold_split >= 0 + if not self.args.get('--test'): + pass + # get the whole dataset then get the correct split + ds = Dataset(root=self.data_dir, split=split, train_subset=self.config.train_subset) + train_dataset, valid_dataset = ds.return_cross_validation_splits(current_kfold_split) + + self.train_data = None + self.valid_data = DataLoader(valid_dataset, batch_size=self.config.batch_size * 2, shuffle=False) + + # only maybe set train_data. 
+            if not self.args.get('--test'):
+                self.train_data = DataLoader(train_dataset,
+                                             batch_size=self.config.batch_size,
+                                             shuffle=True,
+                                             )
+            self.test_data = None
+
+        # ~~~~~~~~~~~~ Branch Prediction ~~~~~~~~~~~~~~~~~~~~
+        elif dataset in ['branch_prediction']:
+            assert kfold and current_kfold_split is not None, "Branch Prediction only supported with kfold flag!"
+            assert current_kfold_split < 10
+            # train set
+            ds = Dataset(root=self.data_dir, split='train', train_subset=self.config.train_subset)
+            ds = ds.filter_max_num_nodes(self.config.max_num_nodes)
+
+            train_dataset, valid_dataset = ds.return_cross_validation_splits(current_kfold_split)
+            #train_dataset.filter_max_num_nodes(self.config.max_num_nodes)
+            #valid_dataset.filter_max_num_nodes(self.config.max_num_nodes)
+            self.train_data = NodeLimitedDataLoader(train_dataset,
+                                                    batch_size=self.config.batch_size,
+                                                    shuffle=True,
+                                                    max_num_nodes=self.config.max_num_nodes,
+                                                    warn_on_limit=False,
+                                                    )
+            self.valid_data = DataLoader(valid_dataset, batch_size=self.config.batch_size, shuffle=False)
+
+            self.test_data = None
+        # ~~~~~~~~~~~ Unknown Dataset ~~~~~~~~~~~~~~~~~
+        else:
+            raise NotImplementedError(f"Unknown dataset: {dataset}")
+
+    def parse_config_params(self, args):
+        """Accesses self.args to parse config params from various flags."""
+        params = None
+        if args.get('--config'):
+            with open(REPO_ROOT / args['--config'], 'r') as f:
+                params = json.load(f)
+        elif args.get('--config_json'):
+            config_string = args['--config_json']
+            # accept single quoted 'json'. This only works because our json strings are simple enough.
+            config_string = config_string.replace("\\'", "'").replace("'", '"').replace('True', 'true').replace('False', 'false')
+            params = json.loads(config_string)
+        return params
+
+    def data2input(self, batch):
+        """Glue method that converts a batch from the dataloader into the input format of the model."""
+        num_graphs = batch.batch[-1].item() + 1
+
+        edge_lists = []
+        edge_positions = [] if getattr(self.config, 'position_embeddings', False) else None
+
+        edge_indices = list(range(3))
+        if self.config.ablate_structure:
+            if self.config.ablate_structure == 'control':
+                edge_indices[0] = -1
+            elif self.config.ablate_structure == 'data':
+                edge_indices[1] = -1
+            elif self.config.ablate_structure == 'call':
+                edge_indices[2] = -1
+            else:
+                raise ValueError("unreachable")
+
+        for i in edge_indices:
+            # mask by edge type
+            mask = batch.edge_attr[:, 0] == i  # boolean mask over all edges
+            edge_list = batch.edge_index[:, mask].t()  # [num_edges_of_type, 2]
+            edge_lists.append(edge_list)
+
+            if getattr(self.config, 'position_embeddings', False):
+                edge_pos = batch.edge_attr[mask, 1]  # [num_edges_of_type]
+                edge_positions.append(edge_pos)
+
+        inputs = {
+            "vocab_ids": batch.x[:, 0],
+            "edge_lists": edge_lists,
+            "pos_lists": edge_positions,
+            "num_graphs": num_graphs,
+            "graph_nodes_list": batch.batch,
+            "node_types": batch.x[:, 1],
+        }
+
+        # maybe add labels
+        if batch.y is not None:
+            inputs.update({
+                "labels": batch.y,
+            })
+
+        # add other stuff
+        if hasattr(batch, 'aux_in'):
+            inputs.update({
+                "aux_in": batch.aux_in.to(dtype=torch.float)
+            })
+        if hasattr(batch, 'runtimes'):
+            inputs.update({
+                "runtimes": batch.runtimes.to(dtype=torch.float)
+            })
+        return inputs
+
+    def make_branch_labels(self, batch):
+        """takes a batch and maps the profile info to branch labels for regression:
+        a branch has (true_weight+1, false_weight+1, total_weight+2) and we map to [0, 1] as
+        p(true) = true_weight / total_weight
+        note that the profile info adds 1 on both true and false weights! 
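+
+        A worked example of the decoding below (illustrative numbers): stored
+        profile info (true_weight+1, false_weight+1, total_weight+2) = (4, 2, 6)
+        decodes to true=3, total=4, so p(true) = 3/4 = 0.75.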
+        """
+        mask = batch.profile_info[:, 0].bool()
+        # clamp to be robust against 0 counts from problems with the data
+        yes = torch.clamp(batch.profile_info[:, 1].to(dtype=torch.get_default_dtype()) - 1, min=0.0)
+        total = 1e-7 + torch.clamp(batch.profile_info[:, 3].to(torch.get_default_dtype()) - 2, min=0.0)
+        p_yes = yes / total  # true / total
+        p_yes = torch.clamp(p_yes, min=0.0, max=1.0)
+        # print([str(a) for a in p_yes[mask].clone().detach().to('cpu').numpy()])
+        return p_yes, mask
+
+    def bertify_batch(self, batch, config):
+        """takes a batch and returns the bertified input, labels and corresponding mask,
+        indicating what part of the input to predict."""
+        vocab_ids = batch.x[:, 0]
+        labels = vocab_ids.clone()
+        node_types = batch.x[:, 1]
+        device = vocab_ids.device
+
+        # we create a tensor that carries the probability of being masked for each node
+        probabilities = torch.full(vocab_ids.size(), config.mlm_probability, device=device)
+        # set to 0.0 where nodes are !IDENTIFIERS, i.e. node_types == 1
+        if config.mlm_statements_only:
+            probabilities.masked_fill_(node_types.bool(), 0.0)
+        # set to 0.0 where statements are !UNK
+        if config.mlm_exclude_unk_tokens:
+            probabilities.masked_fill_(vocab_ids == config.unk_token_id, 0.0)
+
+        # get the node mask that determines the nodes we use as targets
+        mlm_target_mask = torch.bernoulli(probabilities).bool()
+        # of those, get the 80% where the input is masked
+        masked_out_nodes = torch.bernoulli(torch.full(vocab_ids.size(), 0.8, device=device)).bool() & mlm_target_mask
+
+        # the 10% where it's set to a random token
+        # (as 50% of the target nodes that are not masked out)
+        random_nodes = torch.bernoulli(torch.full(vocab_ids.size(), 0.5, device=device)).bool() & mlm_target_mask & ~masked_out_nodes
+        # and the 10% where it's the original id, we just leave alone. 
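+        # Worked arithmetic for the split above (illustrative): a target node
+        # is masked out with p=0.8; of the remaining 0.2, the 0.5-Bernoulli
+        # selects half, i.e. 0.1 overall, for random replacement; the final
+        # 0.1 keep their original id (the BERT-style 80/10/10 scheme).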
+
+        # apply the changes
+        random_ids = torch.randint(config.vocab_size, vocab_ids.shape, dtype=torch.long, device=device)
+        vocab_ids[masked_out_nodes] = config.mlm_mask_token_id
+        vocab_ids[random_nodes] = random_ids[random_nodes]
+        # the loss function can ignore -1 labels for gradients,
+        # although it's more efficient to gather according to the mlm_target_mask mask
+        labels[~mlm_target_mask] = -1
+
+        return vocab_ids, labels, mlm_target_mask
+
+    def run_epoch(self, loader, epoch_type, analysis_mode=False):
+        """
+        args:
+            loader: a pytorch-geometric dataset loader,
+            epoch_type: 'train' or 'eval'
+        returns:
+            (mean_loss, mean_accuracy, instances_per_sec, perplexity,
+             actual_runtime, optimal_runtime) and, in analysis_mode,
+            additionally the saved model outputs.
+        """
+
+        bar = tqdm.tqdm(total=len(loader.dataset), smoothing=0.01, unit='inst')
+        if analysis_mode:
+            saved_outputs = []
+
+        epoch_loss, epoch_accuracy = 0, 0
+        epoch_actual_rt, epoch_optimal_rt = 0, 0
+        start_time = time.time()
+        processed_graphs = 0
+        predicted_targets = 0
+
+        for step, batch in enumerate(loader):
+            ######### prepare input
+            # move batch to gpu and prepare input tensors:
+            batch.to(self.model.dev)
+
+            inputs = self.data2input(batch)
+            num_graphs = inputs['num_graphs']
+
+            # currently, the only node-wise models implemented are for pretraining
+            if self.config.name in ['GGNN_ForPretraining_Config', 'GraphTransformer_ForPretraining_Config']:
+                mlm_vocab_ids, mlm_labels, mlm_target_mask = self.bertify_batch(batch, self.config)
+                inputs.update({
+                    'vocab_ids': mlm_vocab_ids,
+                    'labels': mlm_labels,
+                    'readout_mask': mlm_target_mask,
+                })
+                num_targets = torch.sum(mlm_target_mask.to(torch.long)).item()
+            elif self.config.name in ['GGNN_BranchPrediction_Config', 'GraphTransformer_BranchPrediction_Config']:
+                y, mask = self.make_branch_labels(batch)
+                inputs.update({
+                    'labels': y,
+                    'readout_mask': mask,
+                })
+                if not torch.any(mask):
+                    print('Warning: batch has no labels! Skipping...')
+                    continue
+                num_targets = torch.sum(mask.to(torch.long)).item()
+            # elif: other nodewise configs go here!
+            elif getattr(self.config, 'has_graph_labels', False):  # all graph models
+                num_targets = num_graphs
+            else:
+                raise NotImplementedError("We don't have other nodewise models currently.")
+
+            predicted_targets += num_targets
+            processed_graphs += num_graphs
+
+            #############
+            # RUN MODEL FORWARD PASS
+
+            # enter correct mode of model and fetch output
+            if epoch_type == "train":
+                self.global_training_step += 1
+                if not self.model.training:
+                    self.model.train()
+                outputs = self.model(**inputs)
+            else:  # not TRAIN
+                if self.model.training:
+                    self.model.eval()
+                    self.model.opt.zero_grad()
+                with torch.no_grad():  # don't trace computation graph!
+                    outputs = self.model(**inputs)
+
+            if analysis_mode:
+                # TODO I don't know whether the outputs are properly cloned, moved to cpu and detached or not. 
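+                # A possible resolution of the TODO above (untested sketch):
+                # detach and move the tensors before accumulating, so the saved
+                # outputs don't keep the autograd graph or GPU memory alive, e.g.
+                #   saved_outputs.append(tuple(
+                #       o.detach().cpu() if torch.is_tensor(o) else o
+                #       for o in outputs))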
+                saved_outputs.append(outputs)
+
+            if hasattr(batch, 'runtimes'):
+                (logits, accuracy, correct, targets, actual_rt, optimal_rt, graph_features, *unroll_stats,
+                 ) = outputs
+                epoch_actual_rt += torch.sum(actual_rt).item()
+                epoch_optimal_rt += torch.sum(optimal_rt).item()
+            else:
+                (logits, accuracy, correct, targets, graph_features, *unroll_stats,
+                 ) = outputs
+            loss = self.model.loss((logits, graph_features), targets)
+
+            epoch_loss += loss.item() * num_targets
+            epoch_accuracy += accuracy.item() * num_targets
+
+            # update weights
+            if epoch_type == "train":
+                loss.backward()
+                if self.model.config.clip_grad_norm > 0.0:
+                    torch.nn.utils.clip_grad_norm_(
+                        self.model.parameters(), self.model.config.clip_grad_norm
+                    )
+                self.model.opt.step()
+                self.model.opt.zero_grad()
+
+            # update bar display
+            bar_loss = epoch_loss / (predicted_targets + 1e-8)
+            bar_acc = epoch_accuracy / (predicted_targets + 1e-8)
+            bar.set_postfix(loss=bar_loss, acc=bar_acc, ppl=np.exp(bar_loss))
+            bar.update(num_graphs)
+
+        bar.close()
+
+        # Return epoch stats
+        mean_loss = epoch_loss / predicted_targets
+        mean_accuracy = epoch_accuracy / predicted_targets
+        instance_per_sec = processed_graphs / (time.time() - start_time)
+        epoch_perplexity = np.exp(mean_loss)
+
+        returns = (mean_loss, mean_accuracy, instance_per_sec, epoch_perplexity, epoch_actual_rt, epoch_optimal_rt)
+
+        if analysis_mode:
+            returns += (saved_outputs,)
+        return returns
+
+    def train(self):
+        log_to_save = []
+        total_time_start = time.time()
+
+        # we enter training after restore
+        if self.parent_run_id is not None:
+            print(f"== Pre-validating restored model at epoch {self.current_epoch}")
+            _, valid_acc, _, ppl, _, _ = self.run_epoch(self.valid_data, "eval")
+            best_val_acc = np.sum(valid_acc)
+            best_val_acc_epoch = self.current_epoch
+            print(
+                "\r\x1b[KResumed operation, initial cum. val. 
acc: %.5f, ppl %.5f" + % (best_val_acc, ppl) + ) + self.current_epoch += 1 + else: + (best_val_acc, best_val_acc_epoch) = (0.0, 0) + + # Training loop over epochs + target_epoch = self.current_epoch + self.config.num_epochs + for epoch in range(self.current_epoch, target_epoch): + print(f"== Epoch {epoch}/{target_epoch}") + + train_loss, train_acc, train_speed, train_ppl, train_art, train_ort = self.run_epoch( + self.train_data, "train" + ) + print( + "\r\x1b[K Train: loss: %.5f | acc: %s | ppl: %s | instances/sec: %.2f | runtime: %.1f opt: %.1f" + % (train_loss, f"{train_acc:.5f}", train_ppl, train_speed, train_art, train_ort) + ) + + valid_loss, valid_acc, valid_speed, valid_ppl, valid_art, valid_ort = self.run_epoch( + self.valid_data, "eval" + ) + print( + "\r\x1b[K Valid: loss: %.5f | acc: %s | ppl: %s | instances/sec: %.2f | runtime: %.1f opt: %.1f" + % (valid_loss, f"{valid_acc:.5f}", valid_ppl, valid_speed, valid_art, valid_ort) + ) + + # maybe run test epoch + if self.test_data is not None: + test_loss, test_acc, test_speed, test_ppl, _, _ = self.run_epoch( + self.test_data, "eval" + ) + print( + "\r\x1b[K Test: loss: %.5f | acc: %s | ppl: %s | instances/sec: %.2f" + % (test_loss, f"{test_acc:.5f}", test_ppl, test_speed) + ) + + epoch_time = time.time() - total_time_start + self.current_epoch = epoch + + log_entry = { + "epoch": epoch, + "time": epoch_time, + "train_results": (train_loss, train_acc, train_speed, train_ppl, train_art, train_ort), + "valid_results": (valid_loss, valid_acc, valid_speed, valid_ppl, valid_art, valid_ort), + } + + if self.test_data is not None: + log_entry.update({"test_results": (test_loss, test_acc, test_speed, test_ppl)}) + + log_to_save.append(log_entry) + + with open(self.log_file, "w") as f: + json.dump(log_to_save, f, indent=4) + + # TODO: sum seems redundant if only one task is trained. + val_acc = np.sum(valid_acc) # type: float + if val_acc > best_val_acc: + self.save_model(epoch, self.best_model_file) + print( + " (Best epoch so far, cum. val. acc increased to %.5f from %.5f. Saving to '%s')" + % (val_acc, best_val_acc, self.best_model_file) + ) + best_val_acc = val_acc + best_val_acc_epoch = epoch + elif epoch - best_val_acc_epoch >= self.config.patience: + print( + "Stopping training after %i epochs without improvement on validation accuracy." 
+                    % self.config.patience
+                )
+                break
+            if not self.args['--skip_save_every_epoch']:
+                self.save_model(epoch, self.last_model_file)
+        # save last model on finish of training
+        self.save_model(epoch, self.last_model_file)
+
+    def test(self):
+        log_to_save = []
+        total_time_start = time.time()
+
+        print("== Epoch: test-only run.")
+
+        valid_loss, valid_acc, valid_speed, valid_ppl, valid_art, valid_ort = self.run_epoch(
+            self.valid_data, "eval"
+        )
+        print(
+            "\r\x1b[K Valid: loss: %.5f | acc: %s | ppl: %s | instances/sec: %.2f | runtime: %.1f opt: %.1f"
+            % (valid_loss, f"{valid_acc:.5f}", valid_ppl, valid_speed, valid_art, valid_ort)
+        )
+
+        if self.test_data is not None:
+            test_loss, test_acc, test_speed, test_ppl, _, _ = self.run_epoch(
+                self.test_data, "eval"
+            )
+            print(
+                "\r\x1b[K Test: loss: %.5f | acc: %s | ppl: %s | instances/sec: %.2f"
+                % (test_loss, f"{test_acc:.5f}", test_ppl, test_speed)
+            )
+
+        epoch_time = time.time() - total_time_start
+
+        log_entry = {
+            "epoch": 'test_only',
+            "time": epoch_time,
+            "valid_results": (valid_loss, valid_acc, valid_speed, valid_ppl, valid_art, valid_ort),
+        }
+        if self.test_data is not None:
+            log_entry.update({"test_results": (test_loss, test_acc, test_speed, test_ppl)})
+
+        log_to_save.append(log_entry)
+        with open(self.log_file, "w") as f:
+            json.dump(log_to_save, f, indent=4)
+
+    def save_model(self, epoch, path):
+        checkpoint = {
+            'run_id': self.run_id,
+            'global_training_step': self.global_training_step,
+            'epoch': epoch,
+            'config': self.config.to_dict(),
+            'model_name': self.model.__class__.__name__,
+            'model_state_dict': self.model.state_dict(),
+            'optimizer_state_dict': self.model.opt.state_dict(),
+        }
+        torch.save(checkpoint, path)
+
+    def restore_by_pattern(self, pattern, log_dir, current_kfold_split=None):
+        """This method will restore the last checkpoint of a run that is identifiable by
+        the given pattern. It may restore either model_last or model_best.
+        However, if current_kfold_split is given, it will additionally filter for that
+        split; the split should therefore not be part of the pattern.
+        """
+        if current_kfold_split is not None:
+            checkpoints = list(log_dir.glob(f"*{pattern}*_{current_kfold_split}_model_*.p*"))
+        else:
+            checkpoints = list(log_dir.glob(f"*{pattern}*_model_*.p*"))
+        assert checkpoints, f"Couldn't restore by jobname: No model files matching <{pattern}> found."
+        last_mod_checkpoint = sorted(checkpoints, key=os.path.getmtime)[-1]
+        assert last_mod_checkpoint.is_file(), f"Couldn't restore by jobname: <{last_mod_checkpoint}> is not a file." 
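+        # (For illustration: with a hypothetical job name pattern "ggnn_poj104",
+        # the globs above match checkpoints named like
+        # "2020-08-27_10:36:35_ggnn_poj104_12345_model_best.pickle".)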
+        return self.restore_model(last_mod_checkpoint)
+
+    def restore_model(self, path):
+        """loads and restores a model from file."""
+        checkpoint = torch.load(path)
+        self.parent_run_id = checkpoint['run_id']
+        self.global_training_step = checkpoint['global_training_step']
+        self.current_epoch = checkpoint['epoch']
+
+        config_dict = checkpoint['config'] if isinstance(checkpoint['config'], dict) else checkpoint['config'].to_dict()
+
+        if not self.args.get('--skip_restore_config'):
+            # maybe zero out dropout attributes
+            if self.args['--transfer'] is not None and self.args['--transfer_mode'] == 'frozen':
+                for key, value in config_dict.items():
+                    if 'dropout' in key:
+                        config_dict[key] = 0.0
+                        print(f"*Restoring Config* Setting {key} from {value} to 0.0 while restoring config from checkpoint for transfer.")
+            config = getattr(configs, config_dict['name']).from_dict(config_dict)
+            self.config = config
+            print(f'*RESTORED* self.config = {config.name} from checkpoint {str(path)}.')
+        else:
+            print('Skipped restoring self.config from checkpoint!')
+            assert self.args.get('--model') is not None, "Can only use --skip_restore_config if --model is given."
+            # initialize config from --model and compare to skipped config from restore.
+            _, Config = MODEL_CLASSES[self.args['--model']]
+            self.config = Config.from_dict(self.parse_config_params(self.args))
+            self.config.check_equal(config_dict)
+
+        test_only = self.args.get('--test', False)
+        Model = getattr(modeling, checkpoint['model_name'])
+        model = Model(self.config, test_only=test_only)
+        model.load_state_dict(checkpoint['model_state_dict'])
+        print(f'*RESTORED* model parameters from checkpoint {str(path)}.')
+        if not self.args.get('--test', None):  # only restore opt if needed. opt should be None o/w.
+            model.opt.load_state_dict(checkpoint['optimizer_state_dict'])
+            print('*RESTORED* optimizer parameters from checkpoint as well.')
+        return model
+
+    def transfer_model(self, transfer_model_class, mode):
+        """transfers the current model to a different model class.
+        Resets global_training_step and current_epoch.
+
+        Mode:
+            frozen - only the new readout module will receive gradients.
+            finetune - the whole network will receive gradients.
+        """
+        assert transfer_model_class in MODEL_CLASSES
+        self.global_training_step = 0
+        self.current_epoch = 1
+
+        # freeze layers
+        if mode == 'frozen':
+            for param in self.model.parameters():
+                param.requires_grad = False
+
+        # replace config
+        _, Config = MODEL_CLASSES[transfer_model_class]
+        params = self.parse_config_params(self.args)
+        self.config = Config.from_dict(params=params)
+
+        # replace readout
+        if getattr(self.config, 'has_aux_input', False) and getattr(self.config, 'aux_use_better', False):
+            self.model.readout = modeling.BetterAuxiliaryReadout(self.config)
+        elif getattr(self.config, 'has_aux_input', False):
+            self.model.readout = modeling.Readout(self.config)
+            self.model.aux_readout = modeling.AuxiliaryReadout(self.config)
+        else:
+            assert not getattr(self.config, 'aux_use_better', False), 'aux_use_better only with has_aux_input!'
+            self.model.readout = modeling.Readout(self.config)
+
+        # assign config to model
+        self.model.config = self.config
+
+        # re-setup model
+        test_only = self.args.get('--test', False)
+        assert not test_only, "Transfer requires training; restoring an already transferred model is handled via --restore instead." 
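+        # Presumably model.setup() (defined in modeling.py) re-creates the
+        # optimizer from the new config; running it after the readout swap
+        # means that in 'frozen' mode only the fresh readout parameters still
+        # require gradients.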
+        self.model.setup(self.config, test_only)
+        # print info
+        print(self.model)
+        print(f"Number of trainable params in transferred model: {self.model.num_parameters()}")
+
+
+if __name__ == '__main__':
+    args = docopt(__doc__)
+    print(args)
+    assert not (args['--config'] and args['--config_json']), "Can't decide which config to use!"
+    if args.get('--model'):
+        assert args.get('--model') in MODEL_CLASSES, f"Unknown model: {args['--model']}"
+    if args.get('--dataset'):
+        assert args.get('--dataset') in DATASET_CLASSES, f"Unknown dataset: {args['--dataset']}"
+
+    if not args['--kfold']:
+        learner = Learner(model=args['--model'], dataset=args['--dataset'], args=args)
+        learner.test() if args.get('--test') else learner.train()
+    else:  # kfold
+        if args['--dataset'] in ['devmap_amd', 'devmap_nvidia']: num_splits = 10
+        elif args['--dataset'] in ['threadcoarsening_Cypress', 'threadcoarsening_Kepler', 'threadcoarsening_Fermi', 'threadcoarsening_Tahiti']: num_splits = 17
+        elif args['--dataset'] in ['branch_prediction']: num_splits = 10
+        else: raise NotImplementedError("kfold not implemented for this dataset.")
+
+        for split in range(num_splits):
+            print("#######################################")
+            print(f"CURRENT SPLIT: {split + 1}/{num_splits}")
+            print("#######################################")
+            learner = Learner(model=args['--model'], dataset=args['--dataset'], args=args, current_kfold_split=split)
+            if len(learner.valid_data) == 0:
+                print('***'*20)
+                print(f'Validation Split is empty! Skipping split {split + 1}/{num_splits}.')
+                print('***'*20)
+                continue
+            learner.test() if args.get('--test') else learner.train()

From d037f83122ea1c785abfbe622fe5fceac4c97c0d Mon Sep 17 00:00:00 2001
From: Zacharias Fisches 
Date: Thu, 27 Aug 2020 10:52:06 +0200
Subject: [PATCH 2/5] imports adapted, some support docs and structure added -
 fully untested

github.com/ChrisCummins/ProGraML/issues/81
---
 .../graph_level_classification/.gitignore     |  2 +
 .../task/graph_level_classification/README.md | 51 +++++++++++++++++++
 .../graph_level_classification/configs.py     |  5 +-
 .../task/graph_level_classification/run.py    | 20 ++++----
 4 files changed, 65 insertions(+), 13 deletions(-)
 create mode 100644 programl/task/graph_level_classification/.gitignore
 create mode 100644 programl/task/graph_level_classification/README.md

diff --git a/programl/task/graph_level_classification/.gitignore b/programl/task/graph_level_classification/.gitignore
new file mode 100644
index 000000000..9aa91af40
--- /dev/null
+++ b/programl/task/graph_level_classification/.gitignore
@@ -0,0 +1,2 @@
+# don't track logs folder
+logs/
\ No newline at end of file
diff --git a/programl/task/graph_level_classification/README.md b/programl/task/graph_level_classification/README.md
new file mode 100644
index 000000000..ce71de2d1
--- /dev/null
+++ b/programl/task/graph_level_classification/README.md
@@ -0,0 +1,51 @@
+# Graph Level Classification
+
+Two subtasks are of particular interest: classifyapp a.k.a. poj104 and devmap a.k.a. heterogeneous device mapping.
+
+## Quickstart
+`python run.py --help` will print this help:
+```
+Usage:
+    run.py [options]
+
+Options:
+    -h --help                      Show this screen.
+    --data_dir DATA_DIR            Directory(*) of dataset. (*)=relative to repository root ProGraML/.
+                                   Will overwrite the per-dataset defaults if provided.
+
+    --log_dir LOG_DIR              Directory(*) to store logfiles and trained models relative to repository dir.
+                                   [default: programl/task/graph_level_classification/logs/unspecified]
+    --model MODEL                  The model to run.
+    --dataset DATASET              The dataset to us. 
+    --config CONFIG                Path(*) to a config json dump with params.
+    --config_json CONFIG_JSON      Config json with params.
+    --restore CHECKPOINT           Path(*) to a model file to restore from.
+    --skip_restore_config          Whether to skip restoring the config from CHECKPOINT.
+    --test                         Test the model without training.
+    --restore_by_pattern PATTERN   Restore newest model of this name from log_dir and
+                                   continue training. (AULT specific!)
+                                   PATTERN is a string that can be grep'ed for.
+    --kfold                        Run kfold cross-validation iff kfold is set.
+                                   Splits are currently dataset specific.
+    --transfer MODEL               The model-class to transfer to.
+                                   The args specified will be applied to the transferred model to the extent applicable, e.g.
+                                   training params and Readout module specifications, but not to the transferred model trunk.
+                                   However, we strongly recommend making all trunk parameters match, so that transferred
+                                   checkpoints can be restored without having to pass a matching config manually.
+    --transfer_mode MODE           One of {frozen, finetune} (finetune is not yet implemented). [default: frozen]
+                                   Mode frozen also sets all dropout in the restored model to zero (the newly initialized
+                                   readout function can have dropout nonetheless, depending on the config provided).
+    --skip_save_every_epoch        Skip saving the latest model after every epoch (on a rolling basis).
+```
+Therefore, an exemplary command could look like this:
+```
+[Example Run command]
+```
+NB: You can pass a double-quoted string of config options in JSON format; single quotes are also accepted and will be parsed as double quotes to turn this almost-JSON format into valid JSON.
+
+## How to reproduce results from the paper?
+
+```
+more run commands / another script that does it for us.
+```
+
diff --git a/programl/task/graph_level_classification/configs.py b/programl/task/graph_level_classification/configs.py
index f22b540c8..6b665910d 100644
--- a/programl/task/graph_level_classification/configs.py
+++ b/programl/task/graph_level_classification/configs.py
@@ -13,9 +13,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""C."""
-from typing import List
-from deeplearning.ml4pl.poj104.dataset import AblationVocab
+"""Configs"""
+from .dataset import AblationVocab
 
 
 class ProGraMLBaseConfig(object):
diff --git a/programl/task/graph_level_classification/run.py b/programl/task/graph_level_classification/run.py
index 9ba355e04..9c399718d 100644
--- a/programl/task/graph_level_classification/run.py
+++ b/programl/task/graph_level_classification/run.py
@@ -1,3 +1,4 @@
+# TODO: decide on default log dir in docstring below.
 """
 Usage:
     run.py [options]
@@ -8,7 +9,7 @@
                                    Will overwrite the per-dataset defaults if provided.
 
     --log_dir LOG_DIR              Directory(*) to store logfiles and trained models relative to repository dir.
-                                   [default: deeplearning/ml4pl/poj104/logs/unspecified]
+                                   [default: programl/task/graph_level_classification/logs/unspecified]
     --model MODEL                  The model to run.
    --dataset DATASET               The dataset to us.
    --config CONFIG                 Path(*) to a config json dump with params. 
@@ -33,17 +34,16 @@ """ -import pickle, time, os, json, sys +import time, os, json, sys from pathlib import Path from docopt import docopt import tqdm import numpy as np import torch -from torch_geometric.data import Data, InMemoryDataset, DataLoader # (see below) +from torch_geometric.data import DataLoader # (see below) # make this file executable from anywhere -#if __name__ == '__main__': full_path = os.path.realpath(__file__) print(full_path) REPO_ROOT = full_path.rsplit('ProGraML', maxsplit=1)[0] + 'ProGraML' @@ -52,13 +52,13 @@ sys.path.insert(1, REPO_ROOT) REPO_ROOT = Path(REPO_ROOT) -from deeplearning.ml4pl.poj104.dataloader import NodeLimitedDataLoader +from .dataloader import NodeLimitedDataLoader -from deeplearning.ml4pl.models.ggnn.modeling import ( +from .modeling import ( GGNNModel, GraphTransformerModel, ) -from deeplearning.ml4pl.models.ggnn.configs import ( +from .configs import ( ProGraMLBaseConfig, GGNN_POJ104_Config, GGNN_ForPretraining_Config, @@ -72,7 +72,7 @@ GraphTransformer_ForPretraining_Config, ) -from deeplearning.ml4pl.poj104.dataset import ( +from .dataset import ( POJ104Dataset, NCCDataset, ThreadcoarseningDataset, @@ -81,8 +81,8 @@ ) # Importing twice like this enables restoring -from deeplearning.ml4pl.models.ggnn import modeling -from deeplearning.ml4pl.models.ggnn import configs +from . import modeling +from . import configs From 7df466f366a8306d8d4ce92a826a6fd167baa763 Mon Sep 17 00:00:00 2001 From: Chris Cummins Date: Thu, 27 Aug 2020 12:05:28 +0100 Subject: [PATCH 3/5] Run isort+black on imported code. github.com/ChrisCummins/ProGraML/issues/81 --- .../task/graph_level_classification/README.md | 2 +- .../graph_level_classification/configs.py | 70 +- .../graph_level_classification/dataloader.py | 66 +- .../graph_level_classification/dataset.py | 666 ++++++++------ .../graph_level_classification/modeling.py | 538 ++++++++---- .../task/graph_level_classification/run.py | 820 +++++++++++------- 6 files changed, 1384 insertions(+), 778 deletions(-) diff --git a/programl/task/graph_level_classification/README.md b/programl/task/graph_level_classification/README.md index ce71de2d1..2c5d804e3 100644 --- a/programl/task/graph_level_classification/README.md +++ b/programl/task/graph_level_classification/README.md @@ -16,7 +16,7 @@ Options: --log_dir LOG_DIR Directory(*) to store logfiles and trained models relative to repository dir. [default: programl/task/graph_level_classification/logs/unspecified] --model MODEL The model to run. - --dataset DATASET The dataset to us. + --dataset DATASET The dataset to use. --config CONFIG Path(*) to a config json dump with params. --config_json CONFIG_JSON Config json with params. --restore CHECKPOINT Path(*) to a model file to restore from. diff --git a/programl/task/graph_level_classification/configs.py b/programl/task/graph_level_classification/configs.py index 6b665910d..48ccc00a2 100644 --- a/programl/task/graph_level_classification/configs.py +++ b/programl/task/graph_level_classification/configs.py @@ -42,7 +42,7 @@ def __init__(self): self.vocab_size: int = 8568 self.cdfg_vocab: bool = False - + # ABLATION OPTIONS # NONE = 0 No ablation - use the full vocabulary (default). # NO_VOCAB = 1 Ignore the vocabulary - every node has an x value of 0. 
@@ -56,7 +56,7 @@ def __init__(self): # this reduces the tokens that the network sees to only # !IDENTIFIERs and !UNK statements # One of {zero, constant, random, random_const, finetune, none} - self.inst2vec_embeddings = 'random' + self.inst2vec_embeddings = "random" self.ablate_structure = None # one of {control,data,call} @@ -71,20 +71,29 @@ def from_dict(cls, params): if hasattr(config, key): setattr(config, key, params[key]) else: - print(f"(*CONFIG FROM DICT* Default {config.name} doesn't have a key {key}. Will add key to config anyway!") + print( + f"(*CONFIG FROM DICT* Default {config.name} doesn't have a key {key}. Will add key to config anyway!" + ) setattr(config, key, params[key]) return config def to_dict(self): - config_dict = {a: getattr(self, a) for a in dir(self) if not a.startswith('__') and not callable(getattr(self, a))} + config_dict = { + a: getattr(self, a) + for a in dir(self) + if not a.startswith("__") and not callable(getattr(self, a)) + } return config_dict def check_equal(self, other): # take either config object or config_dict other_dict = other if isinstance(other, dict) else other.to_dict() if not self.to_dict() == other_dict: - print(f"WARNING: GGNNConfig.check_equal() FAILED:\nself and other are unequal: " - f"The difference is {set(self.to_dict()) ^ set(other.to_dict())}.\n self={self.to_dict()}\n other={other_dict}") + print( + f"WARNING: GGNNConfig.check_equal() FAILED:\nself and other are unequal: " + f"The difference is {set(self.to_dict()) ^ set(other.to_dict())}.\n self={self.to_dict()}\n other={other_dict}" + ) + class GGNN_POJ104_Config(ProGraMLBaseConfig): def __init__(self): @@ -94,8 +103,8 @@ def __init__(self): self.gnn_layers: int = 8 self.message_weight_sharing: int = 2 self.update_weight_sharing: int = 2 - #self.message_timesteps: List[int] = [2, 2, 2, 2] - #self.update_timesteps: List[int] = [2, 2, 2, 2] + # self.message_timesteps: List[int] = [2, 2, 2, 2] + # self.update_timesteps: List[int] = [2, 2, 2, 2] # currently only admits node types 0 and 1 for statements and identifiers. self.use_node_types = True @@ -121,7 +130,8 @@ def __init__(self): # self.selector_size: int = 2 if getattr(self, 'use_selector_embeddings', False) else 0 # TODO(Zach) Maybe refactor non-rectangular edge passing matrices for independent hidden size. # hidden size of the whole model - self.hidden_size: int = self.emb_size + getattr(self, 'selector_size', 0) + self.hidden_size: int = self.emb_size + getattr(self, "selector_size", 0) + class GGNN_Devmap_Config(GGNN_POJ104_Config): def __init__(self): @@ -153,6 +163,7 @@ def __init__(self): self.has_graph_labels: bool = True # self.has_aux_input: bool = False + class GGNN_ForPretraining_Config(GGNN_POJ104_Config): def __init__(self): super().__init__() @@ -175,42 +186,43 @@ def __init__(self): ###### borrowed for debugging ########## # GGNNMessage Layer - #self.msg_mean_aggregation: bool = True - #self.use_edge_bias: bool = True + # self.msg_mean_aggregation: bool = True + # self.use_edge_bias: bool = True ############### self.backward_edges: bool = True self.gnn_layers: int = 8 self.message_weight_sharing: int = 2 self.update_weight_sharing: int = 2 - #self.layer_timesteps: List[int] = [1, 1, 1, 1, 1, 1, 1, 1] #[2, 2, 2, 2] + # self.layer_timesteps: List[int] = [1, 1, 1, 1, 1, 1, 1, 1] #[2, 2, 2, 2] self.use_node_types: bool = False # Dataset Specific, don't change! 
self.num_classes: int = 104 self.has_graph_labels: bool = True - self.hidden_size: int = self.emb_size + getattr(self, 'selector_size', 0) + self.hidden_size: int = self.emb_size + getattr(self, "selector_size", 0) # Message: self.position_embeddings: bool = True # Self-Attn Layer self.attn_bias = True - self.attn_num_heads = 5 #8 # choose among 4,5,8,10 for emb_sz 200 + self.attn_num_heads = 5 # 8 # choose among 4,5,8,10 for emb_sz 200 self.attn_dropout = 0.1 self.attn_v_pos = False # Update: # Transformer Update Layer - self.update_layer: str = 'ff' # or 'gru' - self.tfmr_act = 'gelu' # relu or gelu, default relu - self.tfmr_dropout = 0.2 # default 0.1 - self.tfmr_ff_sz = 512 #512 # ~ 2.5 model_dim (Bert: 768 - 2048, Trfm: base 512 - 2048, big 1024 - 4096) + self.update_layer: str = "ff" # or 'gru' + self.tfmr_act = "gelu" # relu or gelu, default relu + self.tfmr_dropout = 0.2 # default 0.1 + self.tfmr_ff_sz = 512 # 512 # ~ 2.5 model_dim (Bert: 768 - 2048, Trfm: base 512 - 2048, big 1024 - 4096) # Optionally: GGNN Update Layer - #self.update_layer: str = 'gru' # or 'ff' - #self.edge_weight_dropout: float = 0.0 - #self.graph_state_dropout: float = 0.2 + # self.update_layer: str = 'gru' # or 'ff' + # self.edge_weight_dropout: float = 0.0 + # self.graph_state_dropout: float = 0.2 + class GraphTransformer_Devmap_Config(GraphTransformer_POJ104_Config): def __init__(self): @@ -219,9 +231,9 @@ def __init__(self): self.batch_size = 64 self.lr = 2.5e-4 self.num_epochs = 600 - #self.graph_state_dropout = 0.0 #GGNN only - - #self.output_dropout # <- applies to Readout func! + # self.graph_state_dropout = 0.0 #GGNN only + + # self.output_dropout # <- applies to Readout func! # Auxiliary Readout self.aux_use_better = False @@ -234,17 +246,19 @@ def __init__(self): self.num_classes: int = 2 self.has_graph_labels: bool = True self.has_aux_input: bool = True - + + class GraphTransformer_Threadcoarsening_Config(GraphTransformer_POJ104_Config): def __init__(self): super().__init__() - self.lr = 5e-5 #2.5-4? + self.lr = 5e-5 # 2.5-4? self.num_epochs = 600 # Dataset inherent, don't change! self.num_classes: int = 6 self.has_graph_labels: bool = True # self.has_aux_input: bool = False + class GraphTransformer_ForPretraining_Config(GraphTransformer_POJ104_Config): def __init__(self): super().__init__() @@ -266,7 +280,7 @@ class GGNN_BranchPrediction_Config(GGNN_POJ104_Config): def __init__(self): super().__init__() self.batch_size = 4 - #self.use_tanh_readout = False ! + # self.use_tanh_readout = False ! self.num_classes = 1 self.has_graph_labels = False @@ -275,6 +289,6 @@ class GraphTransformer_BranchPrediction_Config(GraphTransformer_POJ104_Config): def __init__(self): super().__init__() self.batch_size = 4 - #self.use_tanh_readout = False ! + # self.use_tanh_readout = False ! 
self.num_classes = 1 self.has_graph_labels = False diff --git a/programl/task/graph_level_classification/dataloader.py b/programl/task/graph_level_classification/dataloader.py index 0684b1b80..7f3c4414b 100644 --- a/programl/task/graph_level_classification/dataloader.py +++ b/programl/task/graph_level_classification/dataloader.py @@ -1,8 +1,7 @@ import torch.utils.data +from torch._six import container_abcs, int_classes, string_classes from torch.utils.data.dataloader import default_collate - -from torch_geometric.data import Data, Batch -from torch._six import container_abcs, string_classes, int_classes +from torch_geometric.data import Batch, Data class DataLoader(torch.utils.data.DataLoader): @@ -18,8 +17,8 @@ class DataLoader(torch.utils.data.DataLoader): follow_batch (list or tuple, optional): Creates assignment batch vectors for each key in the list. (default: :obj:`[]`) """ - def __init__(self, dataset, batch_size=1, shuffle=False, follow_batch=[], - **kwargs): + + def __init__(self, dataset, batch_size=1, shuffle=False, follow_batch=[], **kwargs): def collate(batch): elem = batch[0] if isinstance(elem, Data): @@ -32,18 +31,21 @@ def collate(batch): return batch elif isinstance(elem, container_abcs.Mapping): return {key: collate([d[key] for d in batch]) for key in elem} - elif isinstance(elem, tuple) and hasattr(elem, '_fields'): + elif isinstance(elem, tuple) and hasattr(elem, "_fields"): return type(elem)(*(collate(s) for s in zip(*batch))) elif isinstance(elem, container_abcs.Sequence): return [collate(s) for s in zip(*batch)] + raise TypeError( + "DataLoader found invalid type: {}".format(type(elem).__name__) + ) - raise TypeError('DataLoader found invalid type: {}'.format( - type(elem))) - - super(DataLoader, - self).__init__(dataset, batch_size, shuffle, - collate_fn=lambda batch: collate(batch), **kwargs) - + super().__init__( + dataset, + batch_size, + shuffle, + collate_fn=lambda batch: collate(batch), + **kwargs, + ) class NodeLimitedDataLoader(torch.utils.data.DataLoader): @@ -59,8 +61,17 @@ class NodeLimitedDataLoader(torch.utils.data.DataLoader): follow_batch (list or tuple, optional): Creates assignment batch vectors for each key in the list. (default: :obj:`[]`) """ - def __init__(self, dataset, batch_size=1, shuffle=False, follow_batch=[], - max_num_nodes=None, warn_on_limit=False, **kwargs): + + def __init__( + self, + dataset, + batch_size=1, + shuffle=False, + follow_batch=[], + max_num_nodes=None, + warn_on_limit=False, + **kwargs, + ): self.max_num_nodes = max_num_nodes def collate(batch): @@ -75,12 +86,16 @@ def collate(batch): if num_nodes + elem.num_nodes <= self.max_num_nodes: limited_batch.append(elem) num_nodes += elem.num_nodes - else: # for debugging + else: # for debugging pass if len(limited_batch) < len(batch): if warn_on_limit: - print(f"dropped {len(batch) - len(limited_batch)} graphs from batch!") - assert limited_batch != [], f'limited batch is empty! original batch was {batch}' + print( + f"dropped {len(batch) - len(limited_batch)} graphs from batch!" + ) + assert ( + limited_batch != [] + ), f"limited batch is empty! 
original batch was {batch}" return Batch.from_data_list(limited_batch, follow_batch) else: return Batch.from_data_list(batch, follow_batch) @@ -92,14 +107,17 @@ def collate(batch): return batch elif isinstance(elem, container_abcs.Mapping): return {key: collate([d[key] for d in batch]) for key in elem} - elif isinstance(elem, tuple) and hasattr(elem, '_fields'): + elif isinstance(elem, tuple) and hasattr(elem, "_fields"): return type(elem)(*(collate(s) for s in zip(*batch))) elif isinstance(elem, container_abcs.Sequence): return [collate(s) for s in zip(*batch)] - raise TypeError('DataLoader found invalid type: {}'.format( - type(elem))) + raise TypeError("DataLoader found invalid type: {}".format(type(elem))) - super(NodeLimitedDataLoader, - self).__init__(dataset, batch_size, shuffle, - collate_fn=lambda batch: collate(batch), **kwargs) + super().__init__( + dataset, + batch_size, + shuffle, + collate_fn=lambda batch: collate(batch), + **kwargs, + ) diff --git a/programl/task/graph_level_classification/dataset.py b/programl/task/graph_level_classification/dataset.py index ed645735d..7b664a6db 100644 --- a/programl/task/graph_level_classification/dataset.py +++ b/programl/task/graph_level_classification/dataset.py @@ -21,10 +21,10 @@ # make this file executable from anywhere full_path = os.path.realpath(__file__) -#print(full_path) -REPO_ROOT = full_path.rsplit('ProGraML', maxsplit=1)[0] + 'ProGraML' -#print(REPO_ROOT) -#insert at 1, 0 is the script path (or '' in REPL) +# print(full_path) +REPO_ROOT = full_path.rsplit("ProGraML", maxsplit=1)[0] + "ProGraML" +# print(REPO_ROOT) +# insert at 1, 0 is the script path (or '' in REPL) sys.path.insert(1, REPO_ROOT) REPO_ROOT = Path(REPO_ROOT) @@ -62,26 +62,26 @@ def load(file: str, cdfg: bool = False) -> ProgramGraph: orig_graph: the original programl proto (that contains graph level labels) """ graph = ProgramGraph() - with open(file, 'rb') as f: + with open(file, "rb") as f: proto = f.read() - if cdfg: # hotfix missing graph labels in cdfg proto orig_graph = ProgramGraph() orig_graph.ParseFromString(proto) graph2cdfg = subprocess.Popen( - [str(GRAPH2CDFG), '--stdin_fmt=pb', '--stdout_fmt=pb'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE + [str(GRAPH2CDFG), "--stdin_fmt=pb", "--stdout_fmt=pb"], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, ) proto, _ = graph2cdfg.communicate(proto) assert not graph2cdfg.returncode, f"CDFG conversion failed: {file}" graph.ParseFromString(proto) - + if not cdfg: - orig_graph = graph + orig_graph = graph return graph, orig_graph @@ -112,10 +112,8 @@ class AblationVocab(enum.IntEnum): def filename( - split: str, - cdfg: bool = False, - ablation_vocab: AblationVocab = AblationVocab.NONE - ) -> str: + split: str, cdfg: bool = False, ablation_vocab: AblationVocab = AblationVocab.NONE +) -> str: """Generate the name for a data file. Args: @@ -138,12 +136,14 @@ def filename( return f"{name}_data.pt" -def nx2data(graph: ProgramGraph, - vocabulary: Dict[str, int], - y_feature_name: Optional[str] = None, - ignore_profile_info=True, - ablate_vocab = AblationVocab.NONE, - orig_graph: ProgramGraph = None): +def nx2data( + graph: ProgramGraph, + vocabulary: Dict[str, int], + y_feature_name: Optional[str] = None, + ignore_profile_info=True, + ablate_vocab=AblationVocab.NONE, + orig_graph: ProgramGraph = None, +): r"""Converts a program graph protocol buffer to a :class:`torch_geometric.data.Data` instance. 
@@ -168,8 +168,7 @@ def nx2data(graph: ProgramGraph, # collect x if ablate_vocab == AblationVocab.NONE: vocabulary_indices = vocab_ids = [ - vocabulary.get(node.text, len(vocabulary)) - for node in graph.node + vocabulary.get(node.text, len(vocabulary)) for node in graph.node ] elif ablate_vocab == AblationVocab.NO_VOCAB: vocabulary_indices = [0] * len(graph.node) @@ -183,31 +182,41 @@ def nx2data(graph: ProgramGraph, x = torch.cat([xs, types]).view(2, -1).t().contiguous() - assert edge_attr.size()[0] == edge_index.size()[1], f'edge_attr={edge_attr.size()} size mismatch with edge_index={edge_index.size()}' + assert ( + edge_attr.size()[0] == edge_index.size()[1] + ), f"edge_attr={edge_attr.size()} size mismatch with edge_index={edge_index.size()}" data_dict = { - 'x': x, - 'edge_index': edge_index, - 'edge_attr': edge_attr, + "x": x, + "edge_index": edge_index, + "edge_attr": edge_attr, } - + # maybe collect these data too if y_feature_name is not None: assert orig_graph is not None, "need orig_graph to retrieve graph level labels!" - y = torch.tensor(orig_graph.features.feature[y_feature_name].int64_list.value[0]).view(1) # <1> + y = torch.tensor( + orig_graph.features.feature[y_feature_name].int64_list.value[0] + ).view( + 1 + ) # <1> if y_feature_name == "poj104_label": y -= 1 - data_dict['y'] = y - + data_dict["y"] = y + # branch prediction / profile info specific if not ignore_profile_info: - raise NotImplementedError("profile info is not supported with the new nx2data (from programgraph) adaptation.") + raise NotImplementedError( + "profile info is not supported with the new nx2data (from programgraph) adaptation." + ) profile_info = [] for i, node_data in nx_graph.nodes(data=True): # default to -1, -1, -1 if not all profile info is given. - if not (node_data.get("llvm_profile_true_weight") is not None and \ - node_data.get("llvm_profile_false_weight") is not None and \ - node_data.get("llvm_profile_total_weight") is not None): + if not ( + node_data.get("llvm_profile_true_weight") is not None + and node_data.get("llvm_profile_false_weight") is not None + and node_data.get("llvm_profile_total_weight") is not None + ): mask = 0 true_weight = -1 false_weight = -1 @@ -219,10 +228,9 @@ def nx2data(graph: ProgramGraph, total_weight = node_data["llvm_profile_total_weight"] profile_info.append([mask, true_weight, false_weight, total_weight]) - - data_dict['profile_info'] = torch.tensor(profile_info) - - + + data_dict["profile_info"] = torch.tensor(profile_info) + # make Data data = Data(**data_dict) @@ -230,11 +238,15 @@ def nx2data(graph: ProgramGraph, class BranchPredictionDataset(InMemoryDataset): - def __init__(self, root='deeplearning/ml4pl/poj104/branch_prediction_data', - split='train', - transform=None, pre_transform=None, - train_subset=[0, 100], - train_subset_seed=0): + def __init__( + self, + root="deeplearning/ml4pl/poj104/branch_prediction_data", + split="train", + transform=None, + pre_transform=None, + train_subset=[0, 100], + train_subset_seed=0, + ): """ Args: train_subset: [start_percentile, stop_percentile) default [0,100). @@ -246,7 +258,9 @@ def __init__(self, root='deeplearning/ml4pl/poj104/branch_prediction_data', self.train_subset_seed = train_subset_seed super().__init__(root, transform, pre_transform) - assert split in ['train'], "The BranchPrediction dataset only has a 'train' split. use train_subset=[0,x] and [x, 100] for training and testing." + assert split in [ + "train" + ], "The BranchPrediction dataset only has a 'train' split. 
use train_subset=[0,x] and [x, 100] for training and testing." self.data, self.slices = torch.load(self.processed_paths[0]) pass @@ -258,13 +272,15 @@ def raw_file_names(self): @property def processed_file_names(self): """A list of files in the processed_dir which needs to be found in order to skip the processing.""" - base = f'{self.split}_data.pt' + base = f"{self.split}_data.pt" - if tuple(self.train_subset) == (0, 100) or self.split in ['val', 'test']: + if tuple(self.train_subset) == (0, 100) or self.split in ["val", "test"]: return [base] else: - assert self.split == 'train' - return [f'{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt'] + assert self.split == "train" + return [ + f"{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt" + ] def download(self): """Download raw data to `self.raw_dir`""" @@ -276,13 +292,14 @@ def _save_train_subset(self): with a fixed random permutation with self.train_subset_seed. """ import numpy as np + perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) # take slice of perm according to self.train_subset start = np.math.floor(len(self) / 100 * self.train_subset[0]) stop = np.math.floor(len(self) / 100 * self.train_subset[1]) perm = perm[start:stop] - print(f'Fixed permutation starts with: {perm[:min(30, len(perm))]}') + print(f"Fixed permutation starts with: {perm[:min(30, len(perm))]}") dataset = self.__indexing__(perm) @@ -291,11 +308,14 @@ def _save_train_subset(self): return def return_cross_validation_splits(self, split): - assert self.train_subset == [0, 100], "Do cross-validation on the whole dataset!" - #num_samples = len(self) - #perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) - - # 10-fold cross-validation + assert self.train_subset == [ + 0, + 100, + ], "Do cross-validation on the whole dataset!" + # num_samples = len(self) + # perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) + + # 10-fold cross-validation n_splits = 10 kf = KFold(n_splits=n_splits, shuffle=True, random_state=42) (train_index, test_index) = list(kf.split(range(len(self))))[split] @@ -309,7 +329,9 @@ def filter_max_num_nodes(self, max_num_nodes): if d.num_nodes <= max_num_nodes: idx.append(i) dataset = self.__indexing__(idx) - print(f"Filtering out graphs larger than {max_num_nodes} yields a dataset with {len(dataset)}/{len(self)} samples remaining.") + print( + f"Filtering out graphs larger than {max_num_nodes} yields a dataset with {len(dataset)}/{len(self)} samples remaining." + ) return dataset def process(self): @@ -321,35 +343,39 @@ def process(self): Instead of looking for .ll.pickle (nx graphs), we directly look for '*.data.p' files. """ # check if we need to create the full dataset: - full_dataset = Path(self.processed_dir) / f'{self.split}_data.pt' + full_dataset = Path(self.processed_dir) / f"{self.split}_data.pt" if full_dataset.is_file(): - assert self.split == 'train', 'here shouldnt be reachable.' - print(f"Full dataset found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}") + assert self.split == "train", "here shouldnt be reachable." + print( + f"Full dataset found. 
Generating train_subset={self.train_subset} with seed={self.train_subset_seed}" + ) # just get the split and save it self.data, self.slices = torch.load(full_dataset) self._save_train_subset() - print(f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk.") + print( + f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk." + ) return # ~~~~~ we need to create the full dataset ~~~~~~~~~~~ - assert not full_dataset.is_file(), 'shouldnt be' + assert not full_dataset.is_file(), "shouldnt be" processed_path = str(full_dataset) # read data into huge `Data` list. data_list = [] ds_base = Path(self.root) - print(f'Creating {self.split} dataset at {str(ds_base)}') + print(f"Creating {self.split} dataset at {str(ds_base)}") # TODO change this line to go to the new format - #out_base = ds_base / ('ir_' + self.split + '_programl') - #assert out_base.exists(), f"{out_base} doesn't exist!" + # out_base = ds_base / ('ir_' + self.split + '_programl') + # assert out_base.exists(), f"{out_base} doesn't exist!" # TODO collect .ll.pickle instead and call nx2data on the fly! print(f"=== DATASET {str(ds_base)}: Collecting .data.p files into dataset") - #files = list(ds_base.rglob('*.data.p')) - #files = list(ds_base.rglob('*.ll.pickle')) - files = list(ds_base.rglob('*.ll.p')) - + # files = list(ds_base.rglob('*.data.p')) + # files = list(ds_base.rglob('*.ll.pickle')) + files = list(ds_base.rglob("*.ll.p")) + for file in tqdm.tqdm(files): if not file.is_file(): continue @@ -362,14 +388,18 @@ def process(self): data = nx2data(nx_graph, ignore_profile_info=False) data_list.append(data) except IndexError: - print(f"Failing nx2data bc IndexError (prob. empty graph) on {file}! Skipping ...") + print( + f"Failing nx2data bc IndexError (prob. empty graph) on {file}! Skipping ..." + ) continue print(f" * COMPLETED * === DATASET {ds_base}: now pre-filtering...") if self.pre_filter is not None: data_list = [d for d in data_list if self.pre_filter(d)] - print(f" * COMPLETED * === DATASET {ds_base}: Completed filtering, now pre_transforming...") + print( + f" * COMPLETED * === DATASET {ds_base}: Completed filtering, now pre_transforming..." + ) if self.pre_transform is not None: data_list = [self.pre_transform(d) for d in data_list] @@ -379,17 +409,23 @@ def process(self): torch.save((self.data, self.slices), processed_path) # maybe save train_subset as well - if not tuple(self.train_subset) == (0, 100) and self.split not in ['val', 'test']: + if not tuple(self.train_subset) == (0, 100) and self.split not in [ + "val", + "test", + ]: self._save_train_subset() - class NCCDataset(InMemoryDataset): - def __init__(self, root=REPO_ROOT / 'deeplearning/ml4pl/poj104/ncc_data', - split='train', - transform=None, pre_transform=None, - train_subset=[0, 100], - train_subset_seed=0): + def __init__( + self, + root=REPO_ROOT / "deeplearning/ml4pl/poj104/ncc_data", + split="train", + transform=None, + pre_transform=None, + train_subset=[0, 100], + train_subset_seed=0, + ): """ NCC dataset @@ -404,7 +440,9 @@ def __init__(self, root=REPO_ROOT / 'deeplearning/ml4pl/poj104/ncc_data', self.train_subset_seed = train_subset_seed super().__init__(root, transform, pre_transform) - assert split in ['train'], "The NCC dataset only has a 'train' split. use train_subset=[0,x] and [x, 100] for training and testing." + assert split in [ + "train" + ], "The NCC dataset only has a 'train' split. use train_subset=[0,x] and [x, 100] for training and testing." 
self.data, self.slices = torch.load(self.processed_paths[0]) @property @@ -415,13 +453,15 @@ def raw_file_names(self): @property def processed_file_names(self): """A list of files in the processed_dir which needs to be found in order to skip the processing.""" - base = f'{self.split}_data.pt' + base = f"{self.split}_data.pt" - if tuple(self.train_subset) == (0, 100) or self.split in ['val', 'test']: + if tuple(self.train_subset) == (0, 100) or self.split in ["val", "test"]: return [base] else: - assert self.split == 'train' - return [f'{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt'] + assert self.split == "train" + return [ + f"{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt" + ] def download(self): """Download raw data to `self.raw_dir`""" @@ -433,13 +473,14 @@ def _save_train_subset(self): with a fixed random permutation with self.train_subset_seed. """ import numpy as np + perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) # take slice of perm according to self.train_subset start = np.math.floor(len(self) / 100 * self.train_subset[0]) stop = np.math.floor(len(self) / 100 * self.train_subset[1]) perm = perm[start:stop] - print(f'Fixed permutation starts with: {perm[:min(30, len(perm))]}') + print(f"Fixed permutation starts with: {perm[:min(30, len(perm))]}") dataset = self.__indexing__(perm) @@ -453,7 +494,9 @@ def filter_max_num_nodes(self, max_num_nodes): if d.num_nodes <= max_num_nodes: idx.append(i) dataset = self.__indexing__(idx) - print(f"Filtering out graphs larger than {max_num_nodes} yields a dataset with {len(dataset)}/{len(self)} samples remaining.") + print( + f"Filtering out graphs larger than {max_num_nodes} yields a dataset with {len(dataset)}/{len(self)} samples remaining." + ) return dataset def process(self): @@ -465,35 +508,39 @@ def process(self): Instead of looking for .ll.pickle (nx graphs), we directly look for '*.data.p' files. """ # check if we need to create the full dataset: - full_dataset = Path(self.processed_dir) / f'{self.split}_data.pt' + full_dataset = Path(self.processed_dir) / f"{self.split}_data.pt" if full_dataset.is_file(): - assert self.split == 'train', 'here shouldnt be reachable.' - print(f"Full dataset found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}") + assert self.split == "train", "here shouldnt be reachable." + print( + f"Full dataset found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}" + ) # just get the split and save it self.data, self.slices = torch.load(full_dataset) self._save_train_subset() - print(f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk.") + print( + f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk." + ) return # ~~~~~ we need to create the full dataset ~~~~~~~~~~~ - assert not full_dataset.is_file(), 'shouldnt be' + assert not full_dataset.is_file(), "shouldnt be" processed_path = str(full_dataset) # read data into huge `Data` list. data_list = [] ds_base = Path(self.root) - print(f'Creating {self.split} dataset at {str(ds_base)}') + print(f"Creating {self.split} dataset at {str(ds_base)}") # TODO change this line to go to the new format - #out_base = ds_base / ('ir_' + self.split + '_programl') - #assert out_base.exists(), f"{out_base} doesn't exist!" 
+ # out_base = ds_base / ('ir_' + self.split + '_programl') + # assert out_base.exists(), f"{out_base} doesn't exist!" # TODO collect .ll.pickle instead and call nx2data on the fly! print(f"=== DATASET {str(ds_base)}: Collecting .data.p files into dataset") - #files = list(ds_base.rglob('*.data.p')) - #files = list(ds_base.rglob('*.ll.pickle')) - files = list(ds_base.rglob('*.ll.p')) - + # files = list(ds_base.rglob('*.data.p')) + # files = list(ds_base.rglob('*.ll.pickle')) + files = list(ds_base.rglob("*.ll.p")) + for file in tqdm.tqdm(files): if not file.is_file(): continue @@ -506,14 +553,18 @@ def process(self): data = nx2data(nx_graph) data_list.append(data) except IndexError: - print(f"Failing nx2data bc IndexError (prob. empty graph) on {file}! Skipping ...") + print( + f"Failing nx2data bc IndexError (prob. empty graph) on {file}! Skipping ..." + ) continue print(f" * COMPLETED * === DATASET {ds_base}: now pre-filtering...") if self.pre_filter is not None: data_list = [d for d in data_list if self.pre_filter(d)] - print(f" * COMPLETED * === DATASET {ds_base}: Completed filtering, now pre_transforming...") + print( + f" * COMPLETED * === DATASET {ds_base}: Completed filtering, now pre_transforming..." + ) if self.pre_transform is not None: data_list = [self.pre_transform(d) for d in data_list] @@ -523,18 +574,23 @@ def process(self): torch.save((self.data, self.slices), processed_path) # maybe save train_subset as well - if not tuple(self.train_subset) == (0, 100) and self.split not in ['val', 'test']: + if not tuple(self.train_subset) == (0, 100) and self.split not in [ + "val", + "test", + ]: self._save_train_subset() - - class LegacyNCCDataset(InMemoryDataset): - def __init__(self, root='deeplearning/ml4pl/poj104/unsupervised_ncc_data', - split='train', - transform=None, pre_transform=None, - train_subset=[0, 100], - train_subset_seed=0): + def __init__( + self, + root="deeplearning/ml4pl/poj104/unsupervised_ncc_data", + split="train", + transform=None, + pre_transform=None, + train_subset=[0, 100], + train_subset_seed=0, + ): """ Args: train_subset: [start_percentile, stop_percentile) default [0,100). @@ -547,7 +603,9 @@ def __init__(self, root='deeplearning/ml4pl/poj104/unsupervised_ncc_data', self.train_subset_seed = train_subset_seed super().__init__(root, transform, pre_transform) - assert split in ['train'], "The NCC dataset only has a 'train' split. use train_subset=[0,x] and [x, 100] for training and testing." + assert split in [ + "train" + ], "The NCC dataset only has a 'train' split. use train_subset=[0,x] and [x, 100] for training and testing." 
self.data, self.slices = torch.load(self.processed_paths[0]) @property @@ -558,13 +616,15 @@ def raw_file_names(self): @property def processed_file_names(self): """A list of files in the processed_dir which needs to be found in order to skip the processing.""" - base = f'{self.split}_data.pt' + base = f"{self.split}_data.pt" - if tuple(self.train_subset) == (0, 100) or self.split in ['val', 'test']: + if tuple(self.train_subset) == (0, 100) or self.split in ["val", "test"]: return [base] else: - assert self.split == 'train' - return [f'{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt'] + assert self.split == "train" + return [ + f"{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt" + ] def download(self): """Download raw data to `self.raw_dir`""" @@ -576,13 +636,14 @@ def _save_train_subset(self): with a fixed random permutation with self.train_subset_seed. """ import numpy as np + perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) # take slice of perm according to self.train_subset start = np.math.floor(len(self) / 100 * self.train_subset[0]) stop = np.math.floor(len(self) / 100 * self.train_subset[1]) perm = perm[start:stop] - print(f'Fixed permutation starts with: {perm[:min(30, len(perm))]}') + print(f"Fixed permutation starts with: {perm[:min(30, len(perm))]}") dataset = self.__indexing__(perm) @@ -596,7 +657,9 @@ def filter_max_num_nodes(self, max_num_nodes): if d.num_nodes <= max_num_nodes: idx.append(i) dataset = self.__indexing__(idx) - print(f"Filtering out graphs larger than {max_num_nodes} yields a dataset with {len(dataset)}/{len(self)} samples remaining.") + print( + f"Filtering out graphs larger than {max_num_nodes} yields a dataset with {len(dataset)}/{len(self)} samples remaining." + ) return dataset def process(self): @@ -608,32 +671,36 @@ def process(self): Instead of looking for .ll.pickle (nx graphs), we directly look for '*.data.p' files. """ # check if we need to create the full dataset: - full_dataset = Path(self.processed_dir) / f'{self.split}_data.pt' + full_dataset = Path(self.processed_dir) / f"{self.split}_data.pt" if full_dataset.is_file(): - assert self.split == 'train', 'here shouldnt be reachable.' - print(f"Full dataset found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}") + assert self.split == "train", "here shouldnt be reachable." + print( + f"Full dataset found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}" + ) # just get the split and save it self.data, self.slices = torch.load(full_dataset) self._save_train_subset() - print(f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk.") + print( + f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk." + ) return # ~~~~~ we need to create the full dataset ~~~~~~~~~~~ - assert not full_dataset.is_file(), 'shouldnt be' + assert not full_dataset.is_file(), "shouldnt be" processed_path = str(full_dataset) # read data into huge `Data` list. data_list = [] ds_base = Path(self.root) - print(f'Creating {self.split} dataset at {str(ds_base)}') + print(f"Creating {self.split} dataset at {str(ds_base)}") # TODO change this line to go to the new format - #out_base = ds_base / ('ir_' + self.split + '_programl') - #assert out_base.exists(), f"{out_base} doesn't exist!" 
+ # out_base = ds_base / ('ir_' + self.split + '_programl') + # assert out_base.exists(), f"{out_base} doesn't exist!" # TODO collect .ll.pickle instead and call nx2data on the fly! print(f"=== DATASET {str(ds_base)}: Collecting .data.p files into dataset") - files = list(ds_base.rglob('*.data.p')) + files = list(ds_base.rglob("*.data.p")) for file in tqdm.tqdm(files): if not file.is_file(): continue @@ -644,7 +711,9 @@ def process(self): if self.pre_filter is not None: data_list = [d for d in data_list if self.pre_filter(d)] - print(f" * COMPLETED * === DATASET {ds_base}: Completed filtering, now pre_transforming...") + print( + f" * COMPLETED * === DATASET {ds_base}: Completed filtering, now pre_transforming..." + ) if self.pre_transform is not None: data_list = [self.pre_transform(d) for d in data_list] @@ -654,16 +723,23 @@ def process(self): torch.save((self.data, self.slices), processed_path) # maybe save train_subset as well - if not tuple(self.train_subset) == (0, 100) and self.split not in ['val', 'test']: + if not tuple(self.train_subset) == (0, 100) and self.split not in [ + "val", + "test", + ]: self._save_train_subset() - class ThreadcoarseningDataset(InMemoryDataset): - def __init__(self, root='deeplearning/ml4pl/poj104/threadcoarsening_data', - split='fail_fast', - transform=None, pre_transform=None, - train_subset=[0, 100], train_subset_seed=0): + def __init__( + self, + root="deeplearning/ml4pl/poj104/threadcoarsening_data", + split="fail_fast", + transform=None, + pre_transform=None, + train_subset=[0, 100], + train_subset_seed=0, + ): """ Args: train_subset: [start_percentile, stop_percentile) default [0,100). @@ -672,7 +748,12 @@ def __init__(self, root='deeplearning/ml4pl/poj104/threadcoarsening_data', split: 'amd' or 'nvidia' """ - assert split in ["Cypress", "Tahiti", "Fermi", "Kepler"], f"Split is {split}, but has to be 'Cypress', 'Tahiti', 'Fermi', or 'Kepler'" + assert split in [ + "Cypress", + "Tahiti", + "Fermi", + "Kepler", + ], f"Split is {split}, but has to be 'Cypress', 'Tahiti', 'Fermi', or 'Kepler'" self.split = split self.train_subset = train_subset self.train_subset_seed = train_subset_seed @@ -682,24 +763,31 @@ def __init__(self, root='deeplearning/ml4pl/poj104/threadcoarsening_data', @property def raw_file_names(self): - return 'threadcoarsening_data.zip' + return "threadcoarsening_data.zip" @property def processed_file_names(self): - base = f'{self.split}_data.pt' + base = f"{self.split}_data.pt" if tuple(self.train_subset) == (0, 100): return [base] else: - return [f'{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt'] + return [ + f"{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt" + ] def download(self): # download to self.raw_dir pass def return_cross_validation_splits(self, split): - assert self.train_subset == [0, 100], "Do cross-validation on the whole dataset!" - assert split <= 16 and split >= 0, f"This dataset shall be 17-fold (leave one out) cross-validated, but split={split}." + assert self.train_subset == [ + 0, + 100, + ], "Do cross-validation on the whole dataset!" + assert ( + split <= 16 and split >= 0 + ), f"This dataset shall be 17-fold (leave one out) cross-validated, but split={split}." 
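`return_cross_validation_splits` implements 17-fold leave-one-out over the threadcoarsening kernels: fold `split` is the single held-out sample and the other 16 indices train. The index arithmetic, reduced to its core:

def leave_one_out(n_splits, split):
    """(train_indices, test_indices) for leave-one-out fold `split`."""
    assert 0 <= split < n_splits
    return [i for i in range(n_splits) if i != split], [split]

train_idx, test_idx = leave_one_out(17, split=3)
assert len(train_idx) == 16 and test_idx == [3]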
# leave one out n_splits = 17 train_idx = list(range(n_splits)) @@ -708,20 +796,20 @@ def return_cross_validation_splits(self, split): test_data = self.__indexing__([split]) return train_data, test_data - def _save_train_subset(self): """saves a train_subset of self to file. Percentile slice is taken according to self.train_subset with a fixed random permutation with self.train_subset_seed. """ import numpy as np + perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) # take slice of perm according to self.train_subset start = np.math.floor(len(self) / 100 * self.train_subset[0]) stop = np.math.floor(len(self) / 100 * self.train_subset[1]) perm = perm[start:stop] - print(f'Fixed permutation starts with: {perm[:min(100, len(perm))]}') + print(f"Fixed permutation starts with: {perm[:min(100, len(perm))]}") dataset = self.__indexing__(perm) @@ -743,18 +831,22 @@ def platform2str(self, platform): def _get_all_runtimes(self, platform, df, oracles): all_runtimes = {} - for kernel in oracles['kernel']: + for kernel in oracles["kernel"]: kernel_r = [] for cf in [1, 2, 4, 8, 16, 32]: - row = df[(df['kernel'] == kernel) & (df['cf'] == cf)] + row = df[(df["kernel"] == kernel) & (df["cf"] == cf)] if len(row) == 1: - value = float(row[f'runtime_{platform}'].values) + value = float(row[f"runtime_{platform}"].values) if math.isnan(value): - print(f"WARNING: Dataset contain NaN value (missing entry in runtimes most likely). kernel={kernel}, cf={cf}, value={row}.Replacing by infinity!.") - value = float('inf') + print( + f"WARNING: Dataset contain NaN value (missing entry in runtimes most likely). kernel={kernel}, cf={cf}, value={row}.Replacing by infinity!." + ) + value = float("inf") kernel_r.append(value) elif len(row) == 0: - print(f' kernel={kernel:>20} is missing cf={cf}. Ad-hoc inserting result from last existing coarsening factor.') + print( + f" kernel={kernel:>20} is missing cf={cf}. Ad-hoc inserting result from last existing coarsening factor." + ) kernel_r.append(kernel_r[-1]) else: raise @@ -763,17 +855,21 @@ def _get_all_runtimes(self, platform, df, oracles): def process(self): # check if we need to create the full dataset: - full_dataset = Path(self.processed_dir) / f'{self.split}_data.pt' + full_dataset = Path(self.processed_dir) / f"{self.split}_data.pt" if full_dataset.is_file(): - print(f"Full dataset {full_dataset.name} found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}") + print( + f"Full dataset {full_dataset.name} found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}" + ) # just get the split and save it self.data, self.slices = torch.load(full_dataset) self._save_train_subset() - print(f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk.") + print( + f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk." 
+ ) return # ~~~~~ we need to create the full dataset ~~~~~~~~~~~ - assert not full_dataset.is_file(), 'shouldnt be' + assert not full_dataset.is_file(), "shouldnt be" processed_path = str(full_dataset) root = Path(self.root) @@ -784,18 +880,22 @@ def process(self): runtimes_file = root / "pact-2014-runtimes.csv" df = pd.read_csv(runtimes_file) - print('\tReading data from', oracle_file, '\n\tand', runtimes_file) + print("\tReading data from", oracle_file, "\n\tand", runtimes_file) # get all runtime info per kernel runtimes = self._get_all_runtimes(self.split, df=df, oracles=oracles) # get oracle labels cfs = [1, 2, 4, 8, 16, 32] - y = np.array([cfs.index(int(x)) for x in oracles["cf_" + self.split]], dtype=np.int64) + y = np.array( + [cfs.index(int(x)) for x in oracles["cf_" + self.split]], dtype=np.int64 + ) # sanity check oracles against min runtimes for i, (k, v) in enumerate(runtimes.items()): - assert int(y[i]) == np.argmin(v), f"{i}: {k} {v}, argmin(v): {np.argmin(v)} vs. oracles data {int(y[i])}." + assert int(y[i]) == np.argmin( + v + ), f"{i}: {k} {v}, argmin(v): {np.argmin(v)} vs. oracles data {int(y[i])}." # Add attributes to graphs data_list = [] @@ -803,26 +903,30 @@ def process(self): kernels = oracles["kernel"].values # list of strings of kernel names for kernel in kernels: - #legacy - #file = root / 'kernels_ir_programl' / (kernel + '.data.p') - file = root / 'kernels_ir' / (kernel + '.ll.p') - assert file.exists(), f'input file not found: {file}' - #with open(file, 'rb') as f: + # legacy + # file = root / 'kernels_ir_programl' / (kernel + '.data.p') + file = root / "kernels_ir" / (kernel + ".ll.p") + assert file.exists(), f"input file not found: {file}" + # with open(file, 'rb') as f: # data = pickle.load(f) g = load(file) data = nx2data(g) # add attributes - data['y'] = torch.tensor([np.argmin(runtimes[kernel])], dtype=torch.long) - data['runtimes'] = torch.tensor([runtimes[kernel]]) + data["y"] = torch.tensor([np.argmin(runtimes[kernel])], dtype=torch.long) + data["runtimes"] = torch.tensor([runtimes[kernel]]) data_list.append(data) ################################## - print(f" * COMPLETED * === DATASET Threadcoarsening-{self.split}: now pre-filtering...") + print( + f" * COMPLETED * === DATASET Threadcoarsening-{self.split}: now pre-filtering..." + ) if self.pre_filter is not None: data_list = [d for d in data_list if self.pre_filter(d)] - print(f" * COMPLETED * === DATASET Threadcoarsening-{self.split}: Completed filtering, now pre_transforming...") + print( + f" * COMPLETED * === DATASET Threadcoarsening-{self.split}: Completed filtering, now pre_transforming..." 
+ ) if self.pre_transform is not None: data_list = [self.pre_transform(d) for d in data_list] @@ -831,16 +935,25 @@ def process(self): torch.save((self.data, self.slices), processed_path) # maybe save train_subset as well - if not tuple(self.train_subset) == (0, 100) and self.split not in ['val', 'test']: + if not tuple(self.train_subset) == (0, 100) and self.split not in [ + "val", + "test", + ]: self._save_train_subset() - class DevmapDataset(InMemoryDataset): - def __init__(self, root='deeplearning/ml4pl/poj104/devmap_data', - split='fail', transform=None, pre_transform=None, - train_subset=[0, 100], train_subset_seed=0, cdfg: bool = False, - ablation_vocab: AblationVocab = AblationVocab.NONE): + def __init__( + self, + root="deeplearning/ml4pl/poj104/devmap_data", + split="fail", + transform=None, + pre_transform=None, + train_subset=[0, 100], + train_subset_seed=0, + cdfg: bool = False, + ablation_vocab: AblationVocab = AblationVocab.NONE, + ): """ Args: train_subset: [start_percentile, stop_percentile) default [0,100). @@ -849,7 +962,10 @@ def __init__(self, root='deeplearning/ml4pl/poj104/devmap_data', split: 'amd' or 'nvidia' cdfg: Use CDFG graph representation. """ - assert split in ['amd', 'nvidia'], f"Split is {split}, but has to be 'amd' or 'nvidia'" + assert split in [ + "amd", + "nvidia", + ], f"Split is {split}, but has to be 'amd' or 'nvidia'" self.split = split self.train_subset = train_subset self.train_subset_seed = train_subset_seed @@ -861,7 +977,7 @@ def __init__(self, root='deeplearning/ml4pl/poj104/devmap_data', @property def raw_file_names(self): - return 'devmap_data.zip' + return "devmap_data.zip" @property def processed_file_names(self): @@ -870,18 +986,23 @@ def processed_file_names(self): if tuple(self.train_subset) == (0, 100): return [base] else: - return [f'{name}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt'] + return [ + f"{name}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt" + ] def download(self): # download to self.raw_dir pass def return_cross_validation_splits(self, split): - assert self.train_subset == [0, 100], "Do cross-validation on the whole dataset!" - #num_samples = len(self) - #perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) - - # 10-fold cross-validation + assert self.train_subset == [ + 0, + 100, + ], "Do cross-validation on the whole dataset!" + # num_samples = len(self) + # perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) + + # 10-fold cross-validation n_splits = 10 kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) (train_index, test_index) = list(kf.split(self.data.y, self.data.y))[split] @@ -900,7 +1021,7 @@ def _save_train_subset(self): start = np.math.floor(len(self) / 100 * self.train_subset[0]) stop = np.math.floor(len(self) / 100 * self.train_subset[1]) perm = perm[start:stop] - print(f'Fixed permutation starts with: {perm[:min(100, len(perm))]}') + print(f"Fixed permutation starts with: {perm[:min(100, len(perm))]}") dataset = self.__indexing__(perm) @@ -913,18 +1034,21 @@ def process(self): name = filename(self.split, self.cdfg, self.ablation_vocab) full_dataset = Path(self.processed_dir) / name if full_dataset.is_file(): - print(f"Full dataset {full_dataset.name} found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}") + print( + f"Full dataset {full_dataset.name} found. 
Generating train_subset={self.train_subset} with seed={self.train_subset_seed}" + ) # just get the split and save it self.data, self.slices = torch.load(full_dataset) self._save_train_subset() - print(f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk.") + print( + f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk." + ) return # ~~~~~ we need to create the full dataset ~~~~~~~~~~~ - assert not full_dataset.is_file(), 'shouldnt be' + assert not full_dataset.is_file(), "shouldnt be" processed_path = str(full_dataset) - vocab = load_vocabulary(CDFG_VOCABULARY if self.cdfg else PROGRAML_VOCABULARY) assert len(vocab) > 0, "vocab is empty :|" @@ -934,7 +1058,7 @@ def process(self): input_files = list((root / f"graphs_{self.split}").iterdir()) num_files = len(input_files) - print('\n--- Preparing to read', num_files, 'input files') + print("\n--- Preparing to read", num_files, "input files") # read data into huge `Data` list. @@ -945,18 +1069,22 @@ def process(self): proto, _ = load(filename, cdfg=self.cdfg) data = nx2data(proto, vocabulary=vocab, ablate_vocab=self.ablation_vocab) - # graph2cdfg conversion drops the graph features, so we may have to + # graph2cdfg conversion drops the graph features, so we may have to # reload the graph. if self.cdfg: proto = load(filename) # Add the features and label. proto_features = proto.features.feature - data['y'] = torch.tensor(proto_features["devmap_label"].int64_list.value[0]).view(1) - data['aux_in'] = torch.tensor([ - proto_features["transfer_bytes"].int64_list.value[0], - proto_features["wgsize"].int64_list.value[0], - ]) + data["y"] = torch.tensor( + proto_features["devmap_label"].int64_list.value[0] + ).view(1) + data["aux_in"] = torch.tensor( + [ + proto_features["transfer_bytes"].int64_list.value[0], + proto_features["wgsize"].int64_list.value[0], + ] + ) data_list.append(data) @@ -966,7 +1094,9 @@ def process(self): if self.pre_filter is not None: data_list = [d for d in data_list if self.pre_filter(d)] - print(f" * COMPLETED * === DATASET Devmap-{name}: Completed filtering, now pre_transforming...") + print( + f" * COMPLETED * === DATASET Devmap-{name}: Completed filtering, now pre_transforming..." + ) if self.pre_transform is not None: data_list = [self.pre_transform(d) for d in data_list] @@ -979,15 +1109,18 @@ def process(self): self._save_train_subset() - - class POJ104Dataset(InMemoryDataset): - def __init__(self, root='deeplearning/ml4pl/poj104/classifyapp_data', - split='fail', - transform=None, pre_transform=None, - train_subset=[0, 100], train_subset_seed=0, - cdfg: bool = False, - ablation_vocab: AblationVocab = AblationVocab.NONE): + def __init__( + self, + root="deeplearning/ml4pl/poj104/classifyapp_data", + split="fail", + transform=None, + pre_transform=None, + train_subset=[0, 100], + train_subset_seed=0, + cdfg: bool = False, + ablation_vocab: AblationVocab = AblationVocab.NONE, + ): """ Args: train_subset: [start_percentile, stop_percentile) default [0,100). 
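In the devmap `process()` above, each graph ends up with a one-element label tensor plus a two-element auxiliary input read from the proto's feature map. The resulting `Data` layout, sketched with made-up feature values:

import torch
from torch_geometric.data import Data

data = Data(x=torch.zeros(7, 4), edge_index=torch.empty(2, 0, dtype=torch.long))
data["y"] = torch.tensor(1).view(1)         # binary devmap_label
data["aux_in"] = torch.tensor([4096, 256])  # [transfer_bytes, wgsize]
assert data.y.shape == (1,) and data.aux_in.shape == (2,)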
@@ -1002,22 +1135,24 @@ def __init__(self, root='deeplearning/ml4pl/poj104/classifyapp_data', self.ablation_vocab = ablation_vocab super().__init__(root, transform, pre_transform) - assert split in ['train', 'val', 'test'] + assert split in ["train", "val", "test"] self.data, self.slices = torch.load(self.processed_paths[0]) @property def raw_file_names(self): - return 'classifyapp_data.zip' #['ir_val', 'ir_val_programl'] + return "classifyapp_data.zip" # ['ir_val', 'ir_val_programl'] @property def processed_file_names(self): base = filename(self.split, self.cdfg, self.ablation_vocab) - if tuple(self.train_subset) == (0, 100) or self.split in ['val', 'test']: + if tuple(self.train_subset) == (0, 100) or self.split in ["val", "test"]: return [base] else: - assert self.split == 'train' - return [f'{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt'] + assert self.split == "train" + return [ + f"{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt" + ] def download(self): # download to self.raw_dir @@ -1029,13 +1164,14 @@ def _save_train_subset(self): with a fixed random permutation with self.train_subset_seed. """ import numpy as np + perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) # take slice of perm according to self.train_subset start = np.math.floor(len(self) / 100 * self.train_subset[0]) stop = np.math.floor(len(self) / 100 * self.train_subset[1]) perm = perm[start:stop] - print(f'Fixed permutation starts with: {perm[:min(100, len(perm))]}') + print(f"Fixed permutation starts with: {perm[:min(100, len(perm))]}") dataset = self.__indexing__(perm) @@ -1048,20 +1184,26 @@ def process(self): num_classes = 104 # check if we need to create the full dataset: - full_dataset = Path(self.processed_dir) / filename(self.split, self.cdfg, self.ablation_vocab) + full_dataset = Path(self.processed_dir) / filename( + self.split, self.cdfg, self.ablation_vocab + ) if full_dataset.is_file(): - assert self.split == 'train', 'here shouldnt be reachable.' - print(f"Full dataset found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}") + assert self.split == "train", "here shouldnt be reachable." + print( + f"Full dataset found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}" + ) # just get the split and save it self.data, self.slices = torch.load(full_dataset) self._save_train_subset() - print(f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk.") + print( + f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk." + ) return # ~~~~~ we need to create the full dataset ~~~~~~~~~~~ - assert not full_dataset.is_file(), 'shouldnt be' + assert not full_dataset.is_file(), "shouldnt be" processed_path = str(full_dataset) - + # get vocab first vocab = load_vocabulary(CDFG_VOCABULARY if self.cdfg else PROGRAML_VOCABULARY) assert len(vocab) > 0, "vocab is empty :|" @@ -1069,36 +1211,42 @@ def process(self): data_list = [] ds_base = Path(self.root) - print(f'Creating {self.split} dataset at {str(ds_base)}') + print(f"Creating {self.split} dataset at {str(ds_base)}") split_folder = ds_base / (self.split) assert split_folder.exists(), f"{split_folder} doesn't exist!" - + # collect .pb and call nx2data on the fly! 
- print(f"=== DATASET {split_folder}: Collecting ProgramGraph.pb files into dataset") + print( + f"=== DATASET {split_folder}: Collecting ProgramGraph.pb files into dataset" + ) # only take files from subfolders (with class names!) recursively files = [x for x in split_folder.rglob("*ProgramGraph.pb")] assert len(files) > 0, "no files collected. error." for file in tqdm.tqdm(files): # skip classes that are larger than what config says to enable debugging with less data - #class_label = int(file.parent.name) - 1 # let classes start from 0. - #if class_label >= num_classes: + # class_label = int(file.parent.name) - 1 # let classes start from 0. + # if class_label >= num_classes: # continue g, orig_graph = load(file, cdfg=self.cdfg) - data = nx2data(graph=g, - vocabulary=vocab, - ablate_vocab=self.ablation_vocab, - y_feature_name="poj104_label", - orig_graph=orig_graph) + data = nx2data( + graph=g, + vocabulary=vocab, + ablate_vocab=self.ablation_vocab, + y_feature_name="poj104_label", + orig_graph=orig_graph, + ) data_list.append(data) print(f" * COMPLETED * === DATASET {split_folder}: now pre-filtering...") if self.pre_filter is not None: data_list = [d for d in data_list if self.pre_filter(d)] - print(f" * COMPLETED * === DATASET {split_folder}: Completed filtering, now pre_transforming...") + print( + f" * COMPLETED * === DATASET {split_folder}: Completed filtering, now pre_transforming..." + ) if self.pre_transform is not None: data_list = [self.pre_transform(d) for d in data_list] @@ -1107,17 +1255,23 @@ def process(self): torch.save((self.data, self.slices), processed_path) # maybe save train_subset as well - if not tuple(self.train_subset) == (0, 100) and self.split not in ['val', 'test']: + if not tuple(self.train_subset) == (0, 100) and self.split not in [ + "val", + "test", + ]: self._save_train_subset() - - class LegacyPOJ104Dataset(InMemoryDataset): - def __init__(self, root='deeplearning/ml4pl/poj104/classifyapp_data', - split='fail', - transform=None, pre_transform=None, - train_subset=[0, 100], train_subset_seed=0): + def __init__( + self, + root="deeplearning/ml4pl/poj104/classifyapp_data", + split="fail", + transform=None, + pre_transform=None, + train_subset=[0, 100], + train_subset_seed=0, + ): """ Args: train_subset: [start_percentile, stop_percentile) default [0,100). 
@@ -1130,22 +1284,24 @@ def __init__(self, root='deeplearning/ml4pl/poj104/classifyapp_data', self.train_subset_seed = train_subset_seed super().__init__(root, transform, pre_transform) - assert split in ['train', 'val', 'test'] + assert split in ["train", "val", "test"] self.data, self.slices = torch.load(self.processed_paths[0]) @property def raw_file_names(self): - return 'classifyapp_data.zip' #['ir_val', 'ir_val_programl'] + return "classifyapp_data.zip" # ['ir_val', 'ir_val_programl'] @property def processed_file_names(self): - base = f'{self.split}_data.pt' + base = f"{self.split}_data.pt" - if tuple(self.train_subset) == (0, 100) or self.split in ['val', 'test']: + if tuple(self.train_subset) == (0, 100) or self.split in ["val", "test"]: return [base] else: - assert self.split == 'train' - return [f'{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt'] + assert self.split == "train" + return [ + f"{self.split}_data_subset_{self.train_subset[0]}_{self.train_subset[1]}_seed_{self.train_subset_seed}.pt" + ] def download(self): # download to self.raw_dir @@ -1157,13 +1313,14 @@ def _save_train_subset(self): with a fixed random permutation with self.train_subset_seed. """ import numpy as np + perm = np.random.RandomState(self.train_subset_seed).permutation(len(self)) # take slice of perm according to self.train_subset start = np.math.floor(len(self) / 100 * self.train_subset[0]) stop = np.math.floor(len(self) / 100 * self.train_subset[1]) perm = perm[start:stop] - print(f'Fixed permutation starts with: {perm[:min(100, len(perm))]}') + print(f"Fixed permutation starts with: {perm[:min(100, len(perm))]}") dataset = self.__indexing__(perm) @@ -1176,32 +1333,40 @@ def process(self): num_classes = 104 # check if we need to create the full dataset: - full_dataset = Path(self.processed_dir) / f'{self.split}_data.pt' + full_dataset = Path(self.processed_dir) / f"{self.split}_data.pt" if full_dataset.is_file(): - assert self.split == 'train', 'here shouldnt be reachable.' - print(f"Full dataset found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}") + assert self.split == "train", "here shouldnt be reachable." + print( + f"Full dataset found. Generating train_subset={self.train_subset} with seed={self.train_subset_seed}" + ) # just get the split and save it self.data, self.slices = torch.load(full_dataset) self._save_train_subset() - print(f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk.") + print( + f"Saved train_subset={self.train_subset} with seed={self.train_subset_seed} to disk." + ) return # ~~~~~ we need to create the full dataset ~~~~~~~~~~~ - assert not full_dataset.is_file(), 'shouldnt be' + assert not full_dataset.is_file(), "shouldnt be" processed_path = str(full_dataset) # read data into huge `Data` list. data_list = [] ds_base = Path(self.root) - print(f'Creating {self.split} dataset at {str(ds_base)}') + print(f"Creating {self.split} dataset at {str(ds_base)}") # TODO change this line to go to the new format - out_base = ds_base / ('ir_' + self.split + '_programl') + out_base = ds_base / ("ir_" + self.split + "_programl") assert out_base.exists(), f"{out_base} doesn't exist!" # TODO collect .ll.pickle instead and call nx2data on the fly! 
print(f"=== DATASET {out_base}: Collecting .data.p files into dataset") - folders = [x for x in out_base.glob("*") if x.is_dir() and x.name not in ['_nx', '_tuples']] + folders = [ + x + for x in out_base.glob("*") + if x.is_dir() and x.name not in ["_nx", "_tuples"] + ] for folder in tqdm.tqdm(folders): # skip classes that are larger than what config says to enable debugging with less data if int(folder.name) > num_classes: @@ -1215,7 +1380,9 @@ def process(self): if self.pre_filter is not None: data_list = [d for d in data_list if self.pre_filter(d)] - print(f" * COMPLETED * === DATASET {out_base}: Completed filtering, now pre_transforming...") + print( + f" * COMPLETED * === DATASET {out_base}: Completed filtering, now pre_transforming..." + ) if self.pre_transform is not None: data_list = [self.pre_transform(d) for d in data_list] @@ -1224,15 +1391,18 @@ def process(self): torch.save((self.data, self.slices), processed_path) # maybe save train_subset as well - if not tuple(self.train_subset) == (0, 100) and self.split not in ['val', 'test']: + if not tuple(self.train_subset) == (0, 100) and self.split not in [ + "val", + "test", + ]: self._save_train_subset() -if __name__ == '__main__': - #d = NewNCCDataset() - #print(d.data) - root = '/home/zacharias/llvm_datasets/threadcoarsening_data/' - a = ThreadcoarseningDataset(root, 'Cypress') - b = ThreadcoarseningDataset(root, 'Tahiti') - c = ThreadcoarseningDataset(root, 'Fermi') - d = ThreadcoarseningDataset(root, 'Kepler') +if __name__ == "__main__": + # d = NewNCCDataset() + # print(d.data) + root = "/home/zacharias/llvm_datasets/threadcoarsening_data/" + a = ThreadcoarseningDataset(root, "Cypress") + b = ThreadcoarseningDataset(root, "Tahiti") + c = ThreadcoarseningDataset(root, "Fermi") + d = ThreadcoarseningDataset(root, "Kepler") diff --git a/programl/task/graph_level_classification/modeling.py b/programl/task/graph_level_classification/modeling.py index 3bc94e79d..af076703f 100644 --- a/programl/task/graph_level_classification/modeling.py +++ b/programl/task/graph_level_classification/modeling.py @@ -15,30 +15,34 @@ # limitations under the License. """Modules that make up the pytorch GNN models.""" import math + import torch import torch.nn.functional as F -from torch import nn -from torch import optim +from torch import nn, optim # Dependency moved into SelfAttention Message Layer from torch_geometric.utils import softmax as scatter_softmax - SMALL_NUMBER = 1e-8 + def print_state_dict(mod): for n, t in mod.state_dict().items(): print(n, t.size()) + def num_parameters(mod) -> int: """Compute the number of trainable parameters in a nn.Module and its children. OBS: This function misses some parameters, i.e. in pytorch's official MultiheadAttention layer, while the state dict doesn't miss any! """ - num_params = sum(param.numel() for param in mod.parameters(recurse=True) if param.requires_grad) + num_params = sum( + param.numel() for param in mod.parameters(recurse=True) if param.requires_grad + ) return f"{num_params:,} params, weights size: {num_params * 4 / 1e6:.3f}MB." + def assert_no_nan(tensor_list): for i, t in enumerate(tensor_list): assert not torch.isnan(t).any(), f"{i}: {tensor_list}" @@ -54,7 +58,9 @@ def __init__(self): def setup(self, config, test_only): self.loss = Loss(config) # move model to device before making optimizer! 
- self.dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + self.dev = ( + torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + ) self.to(self.dev) print(f"Moved model to {self.dev}") @@ -70,13 +76,28 @@ def get_optimizer(self, config): def num_parameters(self) -> int: """Compute the number of trainable parameters in this nn.Module and its children.""" - num_params = sum(param.numel() for param in self.parameters(recurse=True) if param.requires_grad) + num_params = sum( + param.numel() + for param in self.parameters(recurse=True) + if param.requires_grad + ) return f"{num_params:,} params, weights size: ~{num_params * 4 // 1e6:,}MB." - def forward(self, vocab_ids, labels, edge_lists, selector_ids=None, - pos_lists=None, num_graphs=None, graph_nodes_list=None, - node_types=None, aux_in=None, test_time_steps=None, readout_mask=None, runtimes=None, - ): + def forward( + self, + vocab_ids, + labels, + edge_lists, + selector_ids=None, + pos_lists=None, + num_graphs=None, + graph_nodes_list=None, + node_types=None, + aux_in=None, + test_time_steps=None, + readout_mask=None, + runtimes=None, + ): # Input # selector_ids are ignored anyway by the NodeEmbeddings module that doesn't support them. raw_in = self.node_embeddings(vocab_ids, selector_ids) @@ -88,8 +109,10 @@ def forward(self, vocab_ids, labels, edge_lists, selector_ids=None, # instead! # Readout - if getattr(self.config, 'has_graph_labels', False): - assert graph_nodes_list is not None and num_graphs is not None, 'has_graph_labels requires graph_nodes_list and num_graphs tensors.' + if getattr(self.config, "has_graph_labels", False): + assert ( + graph_nodes_list is not None and num_graphs is not None + ), "has_graph_labels requires graph_nodes_list and num_graphs tensors." nodewise_readout, graphwise_readout = self.readout( raw_in, raw_out, @@ -100,60 +123,64 @@ def forward(self, vocab_ids, labels, edge_lists, selector_ids=None, ) logits = graphwise_readout else: # nodewise only - nodewise_readout, _ = self.readout(raw_in, raw_out, readout_mask=readout_mask) + nodewise_readout, _ = self.readout( + raw_in, raw_out, readout_mask=readout_mask + ) graphwise_readout = None logits = nodewise_readout # do the old style aux_readout if not aux_use_better is set - if getattr(self.config, 'has_aux_input', False) and not getattr(self.config, 'aux_use_better', False): - assert self.config.has_graph_labels is True, \ - "Implementation hasn't been checked for use with aux_input and nodewise prediction! It could work or fail silently." + if getattr(self.config, "has_aux_input", False) and not getattr( + self.config, "aux_use_better", False + ): + assert ( + self.config.has_graph_labels is True + ), "Implementation hasn't been checked for use with aux_input and nodewise prediction! It could work or fail silently." assert aux_in is not None logits, graphwise_readout = self.aux_readout(logits, aux_in) - if readout_mask is not None: # need to mask labels in the same fashion. - assert readout_mask.dtype == torch.bool, 'Readout mask should be boolean!' + assert readout_mask.dtype == torch.bool, "Readout mask should be boolean!" 
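The `Readout` module itself is defined further down; the signature above only fixes the interface for graph-level tasks: a node-to-graph index vector (`graph_nodes_list`) and `num_graphs`. One way such a graphwise aggregation can be computed with `index_add_`, shown as an illustrative mean pool rather than as what `Readout` actually does:

import torch

num_graphs, hidden = 2, 4
node_states = torch.randn(5, hidden)
graph_nodes_list = torch.tensor([0, 0, 0, 1, 1])  # node i belongs to this graph id

summed = torch.zeros(num_graphs, hidden).index_add_(0, graph_nodes_list, node_states)
counts = torch.bincount(graph_nodes_list, minlength=num_graphs).clamp(min=1)
graphwise = summed / counts.unsqueeze(1).float()
assert graphwise.shape == (num_graphs, hidden)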
labels = labels[readout_mask] # Metrics # accuracy, correct?, targets, maybe runtimes: actual, optimal metrics_tuple = self.metrics(logits, labels, runtimes) - outputs = ( - (logits,) + metrics_tuple + (graphwise_readout,) + tuple(unroll_stats) - ) + outputs = (logits,) + metrics_tuple + (graphwise_readout,) + tuple(unroll_stats) return outputs class GraphTransformerModel(BaseGNNModel): """Transformer Encoder for Graphs.""" + def __init__(self, config, pretrained_embeddings=None, test_only=False): super().__init__() self.config = config self.node_embeddings = NodeEmbeddings(config) self.gnn = GraphTransformerEncoder(config) - # get readout and maybe tack on the aux readout self.has_aux_input = getattr(self.config, "has_aux_input", False) - self.aux_use_better = getattr(self.config, 'aux_use_better', False) - + self.aux_use_better = getattr(self.config, "aux_use_better", False) + if self.has_aux_input and self.aux_use_better: self.readout = BetterAuxiliaryReadout(config) elif self.has_aux_input: self.readout = Readout(config) self.aux_readout = AuxiliaryReadout(config) else: - assert not self.aux_use_better, 'aux_use_better only with has_aux_input!' + assert not self.aux_use_better, "aux_use_better only with has_aux_input!" self.readout = Readout(config) self.metrics = Metrics() self.setup(config, test_only) print(self) - print(f"Number of trainable params in GraphTransformerModel: {self.num_parameters()}") + print( + f"Number of trainable params in GraphTransformerModel: {self.num_parameters()}" + ) class GGNNModel(BaseGNNModel): @@ -162,23 +189,24 @@ def __init__(self, config, pretrained_embeddings=None, test_only=False): self.config = config # input layer - if getattr(config, 'use_selector_embeddings', False): - self.node_embeddings = NodeEmbeddingsWithSelectors(config, pretrained_embeddings) + if getattr(config, "use_selector_embeddings", False): + self.node_embeddings = NodeEmbeddingsWithSelectors( + config, pretrained_embeddings + ) else: self.node_embeddings = NodeEmbeddings(config, pretrained_embeddings) - # Readout layer # get readout and maybe tack on the aux readout self.has_aux_input = getattr(self.config, "has_aux_input", False) - self.aux_use_better = getattr(self.config, 'aux_use_better', False) + self.aux_use_better = getattr(self.config, "aux_use_better", False) if self.has_aux_input and self.aux_use_better: self.readout = BetterAuxiliaryReadout(config) elif self.has_aux_input: self.readout = Readout(config) self.aux_readout = AuxiliaryReadout(config) else: - assert not self.aux_use_better, 'aux_use_better only with has_aux_input!' + assert not self.aux_use_better, "aux_use_better only with has_aux_input!" self.readout = Readout(config) # GNN @@ -192,6 +220,7 @@ def __init__(self, config, pretrained_embeddings=None, test_only=False): print(self) print(f"Number of trainable params in GGNNModel: {self.num_parameters()}") + ################################################ # GNN Encoder: Message+Aggregate, Update ################################################ @@ -210,37 +239,52 @@ def __init__(self, config, readout=None): self.update_weight_sharing = config.update_weight_sharing message_layers = self.gnn_layers // self.message_weight_sharing update_layers = self.gnn_layers // self.update_weight_sharing - assert message_layers * self.message_weight_sharing == self.gnn_layers, "layer number and reuse mismatch." - assert update_layers * self.update_weight_sharing == self.gnn_layers, "layer number and reuse mismatch." 
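With `gnn_layers = 8` and `message_weight_sharing = 2`, only four distinct message modules are instantiated and each is applied for two consecutive propagation steps. The unrolling loop itself is elided in this hunk; the schedule the divisibility asserts imply looks like this:

gnn_layers, share = 8, 2
distinct = gnn_layers // share
assert distinct * share == gnn_layers, "layer number and reuse mismatch."
schedule = [f"message[{step // share}]" for step in range(gnn_layers)]
print(schedule)
# ['message[0]', 'message[0]', 'message[1]', 'message[1]',
#  'message[2]', 'message[2]', 'message[3]', 'message[3]']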
- #self.layer_timesteps = config.layer_timesteps + assert ( + message_layers * self.message_weight_sharing == self.gnn_layers + ), "layer number and reuse mismatch." + assert ( + update_layers * self.update_weight_sharing == self.gnn_layers + ), "layer number and reuse mismatch." + # self.layer_timesteps = config.layer_timesteps self.position_embeddings = config.position_embeddings # optional eval time unrolling parameter - self.test_layer_timesteps = getattr(config, 'test_layer_timesteps', 0) - self.unroll_strategy = getattr(config, 'unroll_strategy', 'none') - self.max_timesteps = getattr(config, 'max_timesteps', 1000) - self.label_conv_threshold = getattr(config, 'label_conv_threshold', 0.995) - self.label_conv_stable_steps = getattr(config, 'label_conv_stable_steps', 1) + self.test_layer_timesteps = getattr(config, "test_layer_timesteps", 0) + self.unroll_strategy = getattr(config, "unroll_strategy", "none") + self.max_timesteps = getattr(config, "max_timesteps", 1000) + self.label_conv_threshold = getattr(config, "label_conv_threshold", 0.995) + self.label_conv_stable_steps = getattr(config, "label_conv_stable_steps", 1) # make readout avalable for label_convergence tests if self.unroll_strategy == "label_convergence": - assert not self.config.has_aux_input, "aux_input is not supported with label_convergence" - assert readout, "Gotta pass instantiated readout module for label_convergence tests!" + assert ( + not self.config.has_aux_input + ), "aux_input is not supported with label_convergence" + assert ( + readout + ), "Gotta pass instantiated readout module for label_convergence tests!" self.readout = readout # Message and update layers self.message = nn.ModuleList() - #for i in range(len(self.layer_timesteps)):§ + # for i in range(len(self.layer_timesteps)):§ for i in range(message_layers): self.message.append(GGNNMessageLayer(config)) self.update = nn.ModuleList() - #for i in range(len(self.layer_timesteps)): + # for i in range(len(self.layer_timesteps)): for i in range(update_layers): self.update.append(GGNNUpdateLayer(config)) - def forward(self, edge_lists, node_states, pos_lists=None, node_types=None, test_time_steps=None): + def forward( + self, + edge_lists, + node_states, + pos_lists=None, + node_types=None, + test_time_steps=None, + ): old_node_states = node_states.clone() if self.backward_edges: @@ -253,34 +297,36 @@ def forward(self, edge_lists, node_states, pos_lists=None, node_types=None, test # we allow for some fancy unrolling strategies. # Currently only at eval time, but there is really no good reason for this. - assert self.unroll_strategy == 'none', 'New layer_timesteps not implemented for this unroll_strategy.' - #if self.training or self.unroll_strategy == "none": + assert ( + self.unroll_strategy == "none" + ), "New layer_timesteps not implemented for this unroll_strategy." 
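The body of the `if self.backward_edges:` branch falls outside this hunk, but the typed attention layer below sizes its projections for `edge_type_count * 2` when backward edges are enabled, i.e. every relation gets a reversed twin. Presumably the doubling looks roughly like this (edge lists are (num_edges, 2) rows of (source, target), matching how the GGNN message layer indexes them):

import torch

edge_lists = [
    torch.tensor([[0, 1], [1, 2]]),  # e.g. control edges
    torch.tensor([[2, 0]]),          # e.g. data edges
]
backward = [el.flip(dims=[1]) for el in edge_lists]  # swap source and target
edge_lists = edge_lists + backward                   # 2 * edge_type_count lists
assert len(edge_lists) == 4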
+ # if self.training or self.unroll_strategy == "none": # #layer_timesteps = # #layer_timesteps = self.layer_timesteps - #elif self.unroll_strategy == "constant": + # elif self.unroll_strategy == "constant": # layer_timesteps = self.test_layer_timesteps - #elif self.unroll_strategy == "edge_count": + # elif self.unroll_strategy == "edge_count": # assert ( # test_time_steps is not None # ), f"You need to pass test_time_steps or not use unroll_strategy '{self.unroll_strategy}''" # layer_timesteps = [min(test_time_steps, self.max_timesteps)] - #elif self.unroll_strategy == "data_flow_max_steps": + # elif self.unroll_strategy == "data_flow_max_steps": # assert ( # test_time_steps is not None # ), f"You need to pass test_time_steps or not use unroll_strategy '{self.unroll_strategy}''" # layer_timesteps = [min(test_time_steps, self.max_timesteps)] - #elif self.unroll_strategy == "label_convergence": + # elif self.unroll_strategy == "label_convergence": # node_states, unroll_steps, converged = self.label_convergence_forward( # edge_lists, node_states, pos_lists, node_types, initial_node_states=old_node_states # ) # return node_states, old_node_states, unroll_steps, converged - #else: + # else: # raise TypeError( # "Unreachable! " # f"Unroll strategy: {self.unroll_strategy}, training: {self.training}" # ) - #for (layer_idx, num_timesteps) in enumerate(layer_timesteps): + # for (layer_idx, num_timesteps) in enumerate(layer_timesteps): # for t in range(num_timesteps): # messages = self.message[layer_idx](edge_lists, node_states, pos_lists) # node_states = self.update[layer_idx](messages, node_states, node_types) @@ -300,9 +346,7 @@ def label_convergence_forward( ), f"Label convergence only supports one-layer GGNNs, but {len(self.layer_timesteps)} are configured in layer_timesteps: {self.layer_timesteps}" stable_steps, i = 0, 0 - old_tentative_labels = self.tentative_labels( - initial_node_states, node_states - ) + old_tentative_labels = self.tentative_labels(initial_node_states, node_states) while True: messages = self.message[0](edge_lists, node_states, pos_lists) @@ -349,58 +393,73 @@ def __init__(self, config, readout=None): self.update_weight_sharing = config.update_weight_sharing message_layers = self.gnn_layers // self.message_weight_sharing update_layers = self.gnn_layers // self.update_weight_sharing - assert message_layers * self.message_weight_sharing == self.gnn_layers, "layer number and reuse mismatch." - assert update_layers * self.update_weight_sharing == self.gnn_layers, "layer number and reuse mismatch." - #self.layer_timesteps = config.layer_timesteps + assert ( + message_layers * self.message_weight_sharing == self.gnn_layers + ), "layer number and reuse mismatch." + assert ( + update_layers * self.update_weight_sharing == self.gnn_layers + ), "layer number and reuse mismatch." 
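`label_convergence_forward` is only partially visible in this diff. From the config defaults (`label_conv_threshold=0.995`, `label_conv_stable_steps=1`, `max_timesteps=1000`), the stopping rule appears to be: unroll until the fraction of unchanged tentative labels stays above the threshold for the required number of consecutive steps, or bail out at the step cap. A schematic of that rule with stand-in step and labeling functions:

import torch

def run_until_converged(step_fn, labels_fn, state,
                        threshold=0.995, stable_steps=1, max_timesteps=1000):
    stable, i = 0, 0
    old = labels_fn(state)
    while True:
        state = step_fn(state)
        i += 1
        new = labels_fn(state)
        frac_same = (new == old).float().mean().item()
        stable = stable + 1 if frac_same >= threshold else 0
        old = new
        if stable >= stable_steps:
            return state, i, True   # converged
        if i >= max_timesteps:
            return state, i, False  # hit the cap without converging

state, steps, ok = run_until_converged(
    step_fn=lambda s: s * 0.5,             # toy contraction
    labels_fn=lambda s: (s > 0.1).long(),  # toy "tentative labels"
    state=torch.rand(10),
)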
+ # self.layer_timesteps = config.layer_timesteps - self.use_node_types = getattr(config, 'use_node_types', False) + self.use_node_types = getattr(config, "use_node_types", False) assert not self.use_node_types, "not implemented" # Position Embeddings - if getattr(config, 'position_embeddings', False): - self.selector_size = getattr(config, 'selector_size', 0) + if getattr(config, "position_embeddings", False): + self.selector_size = getattr(config, "selector_size", 0) self.emb_size = config.emb_size # we are going to lookup the pos embs only once per batch instead of within every message layer self.position_embs = PositionEmbeddings() - #self.register_buffer("position_embs", + # self.register_buffer("position_embs", # PositionEmbeddings()( # torch.arange(512, dtype=torch.get_default_dtype()), # config.emb_size, # dpad=getattr(config, 'selector_size', 0), # ), - #) + # ) else: self.position_embs = None # Message and update layers self.message = nn.ModuleList() - #for i in range(len(self.layer_timesteps)): + # for i in range(len(self.layer_timesteps)): for i in range(message_layers): self.message.append(TypedSelfAttentionMessageLayer(config)) - update_layer = getattr(config, 'update_layer', 'ff') - if update_layer == 'ff': + update_layer = getattr(config, "update_layer", "ff") + if update_layer == "ff": UpdateLayer = TransformerUpdateLayer - elif update_layer == 'gru': + elif update_layer == "gru": UpdateLayer = GGNNUpdateLayer else: raise ValueError("config.update_layer has to be 'gru' or 'ff'!") self.update = nn.ModuleList() - #for i in range(len(self.layer_timesteps)): + # for i in range(len(self.layer_timesteps)): for i in range(update_layers): self.update.append(UpdateLayer(config)) - def forward(self, edge_lists, node_states, pos_lists=None, node_types=None, test_time_steps=None): + def forward( + self, + edge_lists, + node_states, + pos_lists=None, + node_types=None, + test_time_steps=None, + ): old_node_states = node_states.clone() # gather position embeddings for each edge pos_emb_lists = None - if getattr(self, 'position_embs') is not None: + if getattr(self, "position_embs") is not None: pos_emb_lists = [] for i, pl in enumerate(pos_lists): # p_emb = torch.index_select(self.position_embs, dim=0, index=pl) - p_emb = self.position_embs(pl.to(dtype=torch.get_default_dtype()), self.emb_size, dpad=self.selector_size) + p_emb = self.position_embs( + pl.to(dtype=torch.get_default_dtype()), + self.emb_size, + dpad=self.selector_size, + ) pos_emb_lists.append(p_emb) # Prepare for backward edges @@ -424,6 +483,7 @@ def forward(self, edge_lists, node_states, pos_lists=None, node_types=None, test ###### Message Layers + class SelfAttentionMessageLayer(nn.Module): """Implements transformer scaled dot-product self-attention, cf. Vaswani et al. 2017, in a sparse setting on a graph. This reduces the time and space complexity @@ -456,15 +516,26 @@ def __init__(self, config): self.dropout_p = config.attn_dropout head_dim = self.embed_dim // self.num_heads - assert head_dim * self.num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + assert ( + head_dim * self.num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" # projection from input to q, k, v # Myle Ott et al. apparently observed that initializing the qkv_projection (in one matrix) with xavier_uni and gain 1/sqrt(2) to be much better than 1. 
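`LinearNet` is a project-local wrapper that is not part of this hunk; the comment refers to the observation (attributed to Myle Ott et al. / fairseq) that a fused q/k/v projection trains better when initialized xavier-uniform with gain 1/sqrt(2) instead of 1. With a plain `nn.Linear`, and assuming `LinearNet` simply forwards its `gain` argument to `xavier_uniform_`, the equivalent is:

import math
import torch
from torch import nn

embed_dim = 200
qkv_in_proj = nn.Linear(embed_dim, embed_dim * 3, bias=False)
nn.init.xavier_uniform_(qkv_in_proj.weight, gain=1 / math.sqrt(2))

q, k, v = qkv_in_proj(torch.randn(5, embed_dim)).chunk(3, dim=1)
assert q.shape == k.shape == v.shape == (5, embed_dim)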
- self.qkv_in_proj = LinearNet(self.embed_dim, self.embed_dim * 3, bias=self.bias, gain=1 / math.sqrt(2)) + self.qkv_in_proj = LinearNet( + self.embed_dim, self.embed_dim * 3, bias=self.bias, gain=1 / math.sqrt(2) + ) self.out_proj = LinearNet(self.embed_dim, self.embed_dim, bias=self.bias) self.dropout = nn.Dropout(p=self.dropout_p, inplace=True) - def forward(self, edge_lists=None, node_states=None, pos_lists=None, edges=None, need_weights=False): + def forward( + self, + edge_lists=None, + node_states=None, + pos_lists=None, + edges=None, + need_weights=False, + ): """NB: pos_lists are ignored.""" # Glue Code: @@ -479,7 +550,6 @@ def forward(self, edge_lists=None, node_states=None, pos_lists=None, edges=None, edge_sources = edges[0, :] edge_targets = edges[1, :] - # ~~~ Sparse Self-Attention ~~~ # The implementation follows the official pytorch implementation, but sparse. # Legend: @@ -491,7 +561,9 @@ def forward(self, edge_lists=None, node_states=None, pos_lists=None, edges=None, assert embed_dim == self.embed_dim head_dim = embed_dim // self.num_heads - assert head_dim * self.num_heads == embed_dim, "embed_dim must be divisible by num_heads" + assert ( + head_dim * self.num_heads == embed_dim + ), "embed_dim must be divisible by num_heads" # 1) get Q, K, V from node_states # (needs to be merged with step 2 if we want to use positions..., bc @@ -499,7 +571,6 @@ def forward(self, edge_lists=None, node_states=None, pos_lists=None, edges=None, q, k, v = self.qkv_in_proj(node_states).chunk(3, dim=1) - # 2) get Q', K', V' \in by doing an F.emb lookup on Q, K, V (maybe transposed) # according to index # edge_target for Q, and @@ -510,14 +581,15 @@ def forward(self, edge_lists=None, node_states=None, pos_lists=None, edges=None, v_prime = torch.index_select(v, dim=0, index=edge_sources) messages, attn_weights = self.sparse_attn_forward( - q_prime, k_prime, v_prime, - num_nodes, edge_targets, need_weights - ) + q_prime, k_prime, v_prime, num_nodes, edge_targets, need_weights + ) if need_weights: return messages, attn_weights return messages - def sparse_attn_forward(self, q_prime, k_prime, v_prime, num_nodes, edge_targets, need_weights): + def sparse_attn_forward( + self, q_prime, k_prime, v_prime, num_nodes, edge_targets, need_weights + ): """Differently to dense self-attention, we expect q', k', v', which are the query, key and value projected node_states [+pos embs] index_selected by edge_targets, edge_sources and edge_source. @@ -533,10 +605,15 @@ def sparse_attn_forward(self, q_prime, k_prime, v_prime, num_nodes, edge_targets head_dim = embed_dim // self.num_heads # some checks - assert head_dim * self.num_heads == embed_dim, "embed_dim must be divisible by num_heads" - assert q_prime.size() == k_prime.size(), \ - f"q_prime, k_prime size mismatch: {q_prime.size()}, {k_prime.size()}" - assert q_prime.size()[0] == v_prime.size()[0], 'number of queries and values mismatch' + assert ( + head_dim * self.num_heads == embed_dim + ), "embed_dim must be divisible by num_heads" + assert ( + q_prime.size() == k_prime.size() + ), f"q_prime, k_prime size mismatch: {q_prime.size()}, {k_prime.size()}" + assert ( + q_prime.size()[0] == v_prime.size()[0] + ), "number of queries and values mismatch" # ~~~ Sparse Self-Attention ~~~ # The implementation follows the official pytorch implementation, but sparse. @@ -552,10 +629,16 @@ def sparse_attn_forward(self, q_prime, k_prime, v_prime, num_nodes, edge_targets # We will end up with unnormalized attention scores. 
scores_prime = q_prime * k_prime # sum segments of head_dim size into num_head chunks - scores = scores_prime.transpose(0,1).view(self.num_heads, head_dim, -1).sum(dim=1).t().contiguous() + scores = ( + scores_prime.transpose(0, 1) + .view(self.num_heads, head_dim, -1) + .sum(dim=1) + .t() + .contiguous() + ) scaling = float(head_dim) ** -0.5 scores = scores * scaling - assert scores.size() == (q_prime.size()[0], self.num_heads) # + assert scores.size() == (q_prime.size()[0], self.num_heads) # # 4) Scattered Softmax: # Perform a softmax by normalizing scores with the sum of those scores @@ -564,7 +647,9 @@ def sparse_attn_forward(self, q_prime, k_prime, v_prime, num_nodes, edge_targets # 4*) multi-head: here we run the scattered_softmax in parallel over the h dimensions independently. # - attn_output_weights = scatter_softmax(scores, index=edge_targets, num_nodes=num_nodes) # noqa: F821 + attn_output_weights = scatter_softmax( + scores, index=edge_targets, num_nodes=num_nodes + ) # noqa: F821 attn_output_weights = self.dropout(attn_output_weights) # 5) V' * %4: weight values V' by attention. @@ -573,7 +658,7 @@ def sparse_attn_forward(self, q_prime, k_prime, v_prime, num_nodes, edge_targets # then get back the old view v_prime = v_prime.transpose(0, 1) v_prime = v_prime.view(self.num_heads, head_dim, -1) - v_prime = v_prime.permute(2,0,1) # v_prime now: + v_prime = v_prime.permute(2, 0, 1) # v_prime now: attn_out_per_edge = v_prime * attn_output_weights.unsqueeze(2) attn_out_per_edge = attn_out_per_edge.view(-1, embed_dim) @@ -581,7 +666,9 @@ def sparse_attn_forward(self, q_prime, k_prime, v_prime, num_nodes, edge_targets # 6) Scatter Add: aggregate messages via index_add with index edge_target # to end up with # messages - attn_out = torch.zeros(num_nodes, embed_dim, dtype=torch.get_default_dtype(), device=q_prime.device) + attn_out = torch.zeros( + num_nodes, embed_dim, dtype=torch.get_default_dtype(), device=q_prime.device + ) attn_out.index_add_(0, edge_targets, attn_out_per_edge) # 5* b) Additionally project from the concatenation back to D. cf. vaswani et al. 2017 @@ -622,44 +709,56 @@ def __init__(self, config): # init as a module without running parent __init__. nn.Module.__init__(self) - self.edge_type_count = config.edge_type_count * 2 if config.backward_edges else config.edge_type_count + self.edge_type_count = ( + config.edge_type_count * 2 + if config.backward_edges + else config.edge_type_count + ) self.embed_dim = config.hidden_size self.bias = config.attn_bias self.num_heads = config.attn_num_heads self.dropout_p = config.attn_dropout head_dim = self.embed_dim // self.num_heads - assert head_dim * self.num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + assert ( + head_dim * self.num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" - self.position_embs = getattr(config, 'position_embeddings', False) - self.attn_v_pos = getattr(config, 'attn_v_pos', False) + self.position_embs = getattr(config, "position_embeddings", False) + self.attn_v_pos = getattr(config, "attn_v_pos", False) if not self.position_embs: - assert not self.attn_v_pos, "Use position_embeddings if you want attn_v_pos!" + assert ( + not self.attn_v_pos + ), "Use position_embeddings if you want attn_v_pos!" # projection from input to q, k, v # Myle Ott et al. apparently observed that initializing the qkv_projection (in one matrix) # with # nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2)) # to be much better than only xavier. 
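Steps 3-6 hinge on a softmax that normalizes edge scores per target node rather than over a dense attention row (`scatter_softmax` above is torch_geometric's `softmax`). A dependency-free sketch of the same edge-wise attention, single-head for brevity:

import torch

num_nodes, d = 4, 8
states = torch.randn(num_nodes, d)
src = torch.tensor([0, 1, 3])  # edge sources
tgt = torch.tensor([2, 2, 0])  # edge targets

q, k, v = states[tgt], states[src], states[src]  # one q/k/v triple per edge
scores = (q * k).sum(dim=1) * d ** -0.5          # scaled dot product per edge

# "scattered" softmax: normalize over all edges sharing a target node
e = (scores - scores.max()).exp()                # global shift cancels per group
denom = torch.zeros(num_nodes).index_add_(0, tgt, e)
attn = e / denom[tgt]

# aggregate: attention-weighted values summed into each target node
out = torch.zeros(num_nodes, d).index_add_(0, tgt, attn.unsqueeze(1) * v)
assert out.shape == (num_nodes, d)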
- #self.qkv_in_proj = LinearNet(self.embed_dim, self.embed_dim * 3, bias=self.bias, gain=1 / math.sqrt(2)) + # self.qkv_in_proj = LinearNet(self.embed_dim, self.embed_dim * 3, bias=self.bias, gain=1 / math.sqrt(2)) # in projection per edge type. self.q_proj = LinearNet(self.embed_dim, self.embed_dim, bias=self.bias) self.k_proj = nn.ModuleList() self.v_proj = nn.ModuleList() for i in range(self.edge_type_count): - self.k_proj.append(LinearNet(self.embed_dim, self.embed_dim, bias=self.bias)) - self.v_proj.append(LinearNet(self.embed_dim, self.embed_dim, bias=self.bias)) + self.k_proj.append( + LinearNet(self.embed_dim, self.embed_dim, bias=self.bias) + ) + self.v_proj.append( + LinearNet(self.embed_dim, self.embed_dim, bias=self.bias) + ) self.out_proj = LinearNet(self.embed_dim, self.embed_dim, bias=self.bias) self.dropout = nn.Dropout(p=self.dropout_p, inplace=True) def forward(self, edge_lists, node_states, pos_lists=None, need_weights=False): """Args: - edge_lists: list of edge_index tensors of size - node_states: - pos_lists: OBS: We expect these to be pos_emb_lists each of size - need_weights: optionally return avg attention weights per edge of size + edge_lists: list of edge_index tensors of size + node_states: + pos_lists: OBS: We expect these to be pos_emb_lists each of size + need_weights: optionally return avg attention weights per edge of size """ assert len(edge_lists) == self.edge_type_count @@ -667,7 +766,9 @@ def forward(self, edge_lists, node_states, pos_lists=None, need_weights=False): assert embed_dim == self.embed_dim head_dim = embed_dim // self.num_heads - assert head_dim * self.num_heads == embed_dim, "embed_dim must be divisible by num_heads" + assert ( + head_dim * self.num_heads == embed_dim + ), "embed_dim must be divisible by num_heads" # 1) get Q', K', V' \in from node_states # by index_select according to index @@ -713,8 +814,9 @@ def forward(self, edge_lists, node_states, pos_lists=None, need_weights=False): v_prime = torch.cat(v_primes, dim=0) # ~~~~ From here, we are back in the general sparse self-attention setting ~~~~~ - messages, attn_weights = self.sparse_attn_forward(q_prime, k_prime, v_prime, - num_nodes, edge_targets, need_weights) + messages, attn_weights = self.sparse_attn_forward( + q_prime, k_prime, v_prime, num_nodes, edge_targets, need_weights + ) if need_weights: return messages, attn_weights return messages @@ -748,20 +850,20 @@ def __init__(self, config): ) self.pos_transform = None - if getattr(config, 'position_embeddings', False): - self.selector_size = getattr(config, 'selector_size', 0) + if getattr(config, "position_embeddings", False): + self.selector_size = getattr(config, "selector_size", 0) self.emb_size = config.emb_size self.position_embs = PositionEmbeddings() - - #legacy - #self.register_buffer( + + # legacy + # self.register_buffer( # "position_embs", # PositionEmbeddings()( # torch.arange(512, dtype=torch.get_default_dtype()), # config.emb_size, # dpad=getattr(config, 'selector_size', 0), # ), - #) + # ) self.pos_transform = LinearNet( self.dim, self.dim, @@ -774,12 +876,14 @@ def forward(self, edge_lists, node_states, pos_lists=None): # all edge types are handled in one matrix, but we # let propagated_states[i] be equal to the case with only edge_type i - #propagated_states = ( + # propagated_states = ( # self.transform(node_states) # .transpose(0, 1) # .view(self.edge_type_count, self.dim, -1) - #) - propagated_states = self.transform(node_states).chunk(self.edge_type_count, dim=1) + # ) + propagated_states = 
self.transform(node_states).chunk( + self.edge_type_count, dim=1 + ) messages_by_targets = torch.zeros_like(node_states) if self.msg_mean_aggregation: @@ -792,19 +896,27 @@ def forward(self, edge_lists, node_states, pos_lists=None): edge_targets = edge_list[:, 1] edge_sources = edge_list[:, 0] - #messages_by_source = F.embedding( + # messages_by_source = F.embedding( # edge_sources, propagated_states[i].transpose(0, 1) - #) - messages_by_source = torch.index_select(propagated_states[i], dim=0, index=edge_sources) + # ) + messages_by_source = torch.index_select( + propagated_states[i], dim=0, index=edge_sources + ) if self.pos_transform: pos_list = pos_lists[i] # torch.index_select(pos_gating, dim=0, index=pos_list) - pos_by_source = self.position_embs(pos_list.to(dtype=torch.get_default_dtype()), self.emb_size, dpad=self.selector_size) - - pos_gating_by_source = 2 * torch.sigmoid(self.pos_transform(pos_by_source)) - - #messages_by_source.mul_(pos_by_source) + pos_by_source = self.position_embs( + pos_list.to(dtype=torch.get_default_dtype()), + self.emb_size, + dpad=self.selector_size, + ) + + pos_gating_by_source = 2 * torch.sigmoid( + self.pos_transform(pos_by_source) + ) + + # messages_by_source.mul_(pos_by_source) messages_by_source = messages_by_source * pos_gating_by_source messages_by_targets.index_add_(0, edge_targets, messages_by_source) @@ -816,8 +928,10 @@ def forward(self, edge_lists, node_states, pos_lists=None): if self.msg_mean_aggregation: divisor = bincount.float() divisor[bincount == 0] = 1.0 # avoid div by zero for lonely nodes - #messages_by_targets /= divisor.unsqueeze_(1) + SMALL_NUMBER - messages_by_targets = messages_by_targets / divisor.unsqueeze_(1) + SMALL_NUMBER + # messages_by_targets /= divisor.unsqueeze_(1) + SMALL_NUMBER + messages_by_targets = ( + messages_by_targets / divisor.unsqueeze_(1) + SMALL_NUMBER + ) return messages_by_targets @@ -828,16 +942,16 @@ def __init__(self): def forward(self, positions, demb, dpad: int = 0): """Transformer-like sinusoidal positional embeddings. - Args: - position: 1d long Tensor of positions, - demb: int size of embedding vector - """ - inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0, device=positions.device) / demb)) + Args: + position: 1d long Tensor of positions, + demb: int size of embedding vector + """ + inv_freq = 1 / ( + 10000 ** (torch.arange(0.0, demb, 2.0, device=positions.device) / demb) + ) sinusoid_inp = torch.ger(positions, inv_freq) - pos_emb = torch.cat( - (torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1 - ) + pos_emb = torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1) if dpad > 0: in_length = positions.size()[0] @@ -863,6 +977,7 @@ def forward(self, positions, demb, dpad: int = 0): ####### Update Layers + class GGNNUpdateLayer(nn.Module): """GRU update function of GGNN architecture, optionally distinguishing two kinds of node types. Args: @@ -872,6 +987,7 @@ class GGNNUpdateLayer(nn.Module): Returns: updated node_states """ + def __init__(self, config): super().__init__() self.dropout = config.graph_state_dropout @@ -884,7 +1000,7 @@ def __init__(self, config): ) # currently only admits node types 0 and 1 for statements and identifiers. 
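# Aside: note the operator precedence in the mean aggregation above:
# `messages_by_targets / divisor.unsqueeze_(1) + SMALL_NUMBER` adds
# SMALL_NUMBER to the quotient, not to the divisor (the commented-out
# in-place variant divided by `divisor + SMALL_NUMBER`). Since the divisor
# is clamped to >= 1 this stays harmless. A self-contained sketch of the
# sum-then-divide aggregation:

import torch

num_nodes, dim = 5, 4
messages_by_source = torch.randn(7, dim)             # one message per edge
edge_targets = torch.tensor([0, 0, 1, 3, 3, 3, 4])   # node 2 has no in-edges

messages_by_targets = torch.zeros(num_nodes, dim)
messages_by_targets.index_add_(0, edge_targets, messages_by_source)

bincount = torch.bincount(edge_targets, minlength=num_nodes)
divisor = bincount.float()
divisor[bincount == 0] = 1.0   # lonely nodes: avoid division by zero
mean_messages = messages_by_targets / divisor.unsqueeze(1)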
- self.use_node_types = getattr(config, 'use_node_types', False) + self.use_node_types = getattr(config, "use_node_types", False) if self.use_node_types: self.id_gru = nn.GRUCell( input_size=config.hidden_size, hidden_size=config.hidden_size @@ -892,7 +1008,9 @@ def __init__(self, config): def forward(self, messages, node_states, node_types=None): if self.use_node_types: - assert node_types is not None, "Need to provide node_types if config.use_node_types!" + assert ( + node_types is not None + ), "Need to provide node_types if config.use_node_types!" output = torch.zeros_like(node_states, device=node_states.device) stmt_mask = node_types == 0 output[stmt_mask] = self.gru(messages[stmt_mask], node_states[stmt_mask]) @@ -905,6 +1023,7 @@ def forward(self, messages, node_states, node_types=None): F.dropout(output, p=self.dropout, training=self.training, inplace=True) return output + class TransformerUpdateLayer(nn.Module): """Represents the residual MLP around the self-attention in the transformer encoder layer. The implementation is sparse for usage in GNNs. @@ -916,14 +1035,15 @@ class TransformerUpdateLayer(nn.Module): Returns: updated node_states """ + def __init__(self, config): super().__init__() - self.use_node_types = getattr(config, 'use_node_types', False) + self.use_node_types = getattr(config, "use_node_types", False) assert not self.use_node_types, "not implemented" - activation = config.tfmr_act # relu or gelu, default relu - dropout = config.tfmr_dropout # default 0.1 - dim_feedforward = config.tfmr_ff_sz # ~ 2.5 * model dim + activation = config.tfmr_act # relu or gelu, default relu + dropout = config.tfmr_dropout # default 0.1 + dim_feedforward = config.tfmr_ff_sz # ~ 2.5 * model dim # Implementation of Feedforward model self.linear1 = nn.Linear(config.hidden_size, dim_feedforward) @@ -948,14 +1068,16 @@ def get_activation_fn(self, activation): def forward(self, messages, node_states, node_types=None): # message layer is elsewhere! - #messages = self.self_attn(src, src, src)[0] + # messages = self.self_attn(src, src, src)[0] # 1st 'Add & Norm' block (cf. vaswani et al. 2017, fig. 
1) node_states = node_states + self.dropout1(messages) node_states = self.norm1(node_states) # 'Feed Forward' block - messages = self.linear2(self.dropout(self.activation(self.linear1(node_states)))) + messages = self.linear2( + self.dropout(self.activation(self.linear1(node_states))) + ) # 2nd 'Add & Norm' block node_states = node_states + self.dropout2(messages) @@ -976,17 +1098,28 @@ def __init__(self, config): super().__init__() self.has_graph_labels = config.has_graph_labels self.num_classes = config.num_classes - self.use_tanh_readout = getattr(config, 'use_tanh_readout', False) + self.use_tanh_readout = getattr(config, "use_tanh_readout", False) self.regression_gate = LinearNet( - 2 * config.hidden_size, self.num_classes, dropout=config.output_dropout, + 2 * config.hidden_size, + self.num_classes, + dropout=config.output_dropout, ) self.regression_transform = LinearNet( - config.hidden_size, self.num_classes, dropout=config.output_dropout, + config.hidden_size, + self.num_classes, + dropout=config.output_dropout, ) - def forward(self, raw_node_in, raw_node_out, graph_nodes_list=None, - num_graphs=None, auxiliary_features=None, readout_mask=None): + def forward( + self, + raw_node_in, + raw_node_out, + graph_nodes_list=None, + num_graphs=None, + auxiliary_features=None, + readout_mask=None, + ): if readout_mask is not None: # mask first to only process the stuff that goes into the loss function! raw_node_in = raw_node_in[readout_mask] @@ -999,11 +1132,15 @@ def forward(self, raw_node_in, raw_node_out, graph_nodes_list=None, if not self.use_tanh_readout: nodewise_readout = gating * self.regression_transform(raw_node_out) else: - nodewise_readout = gating * torch.tanh(self.regression_transform(raw_node_out)) - + nodewise_readout = gating * torch.tanh( + self.regression_transform(raw_node_out) + ) + graph_readout = None if self.has_graph_labels: - assert graph_nodes_list is not None and num_graphs is not None, 'has_graph_labels requires graph_nodes_list and num_graphs tensors.' + assert ( + graph_nodes_list is not None and num_graphs is not None + ), "has_graph_labels requires graph_nodes_list and num_graphs tensors." # aggregate via sums over graphs device = raw_node_out.device graph_readout = torch.zeros(num_graphs, self.num_classes, device=device) @@ -1063,13 +1200,17 @@ def forward(self, input): def extra_repr(self): return "in_features={}, out_features={}, bias={}, dropout={}".format( - self.in_features, self.out_features, self.bias is not None, self.dropout, + self.in_features, + self.out_features, + self.bias is not None, + self.dropout, ) ########################################### # Mixing in graph-level features to readout + class AuxiliaryReadout(nn.Module): """Produces per-graph predictions by combining the per-graph predictions with auxiliary features. 
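# Aside: a self-contained sketch of the gated readout in Readout.forward
# above: a sigmoid gate computed from [h_0; h_T] scales a linear transform
# of h_T (matching regression_gate's 2 * hidden_size input), and per-graph
# scores are the sums of nodewise scores within each graph. Dimensions are
# illustrative:

import torch

num_nodes, hidden, num_classes, num_graphs = 5, 8, 3, 2
raw_node_in = torch.randn(num_nodes, hidden)    # h_0: initial node states
raw_node_out = torch.randn(num_nodes, hidden)   # h_T: final node states
regression_gate = torch.nn.Linear(2 * hidden, num_classes)
regression_transform = torch.nn.Linear(hidden, num_classes)

gate_input = torch.cat((raw_node_in, raw_node_out), dim=-1)
gating = torch.sigmoid(regression_gate(gate_input))
nodewise_readout = gating * regression_transform(raw_node_out)

graph_nodes_list = torch.tensor([0, 0, 0, 1, 1])  # graph id per node
graph_readout = torch.zeros(num_graphs, num_classes)
graph_readout.index_add_(0, graph_nodes_list, nodewise_readout)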
@@ -1081,14 +1222,15 @@ def __init__(self, config): self.num_classes = config.num_classes self.aux_in_log1p = getattr(config, "aux_in_log1p", False) assert ( - config.has_graph_labels + config.has_graph_labels ), "We expect aux readout in combination with graph labels, not node labels" self.feed_forward = None self.batch_norm = nn.BatchNorm1d(config.num_classes + config.aux_in_size) self.feed_forward = nn.Sequential( nn.Linear( - config.num_classes + config.aux_in_size, config.aux_in_layer_size, + config.num_classes + config.aux_in_size, + config.aux_in_layer_size, ), nn.ReLU(), nn.Dropout(config.output_dropout), @@ -1097,7 +1239,7 @@ def __init__(self, config): def forward(self, graph_features, auxiliary_features): assert ( - graph_features.size()[0] == auxiliary_features.size()[0] + graph_features.size()[0] == auxiliary_features.size()[0] ), "every graph needs aux_features. Dimension mismatch." if self.aux_in_log1p: auxiliary_features.log1p_() @@ -1109,7 +1251,6 @@ def forward(self, graph_features, auxiliary_features): return out, graph_features - class BetterAuxiliaryReadout(nn.Module): """Produces per-graph predictions by combining the raw GNN Encoder output with auxiliary features. @@ -1122,8 +1263,9 @@ def __init__(self, config): super().__init__() self.aux_in_log1p = getattr(config, "aux_in_log1p", False) - assert config.has_graph_labels, \ - "We expect aux readout in combination with graph labels, not node labels" + assert ( + config.has_graph_labels + ), "We expect aux readout in combination with graph labels, not node labels" self.has_graph_labels = config.has_graph_labels self.num_classes = config.num_classes @@ -1131,22 +1273,36 @@ def __init__(self, config): # now with aux_in concat'ed and batchnorm self.regression_gate = nn.Sequential( nn.BatchNorm1d(2 * config.hidden_size + config.aux_in_size), - LinearNet(2 * config.hidden_size + config.aux_in_size, - self.num_classes, dropout=config.output_dropout, - ) + LinearNet( + 2 * config.hidden_size + config.aux_in_size, + self.num_classes, + dropout=config.output_dropout, + ), ) # now with aux_in concat'ed and with intermediate layer self.regression_transform = nn.Sequential( nn.BatchNorm1d(config.hidden_size + config.aux_in_size), - LinearNet(config.hidden_size + config.aux_in_size, - config.aux_in_layer_size, dropout=config.output_dropout, + LinearNet( + config.hidden_size + config.aux_in_size, + config.aux_in_layer_size, + dropout=config.output_dropout, ), nn.ReLU(), LinearNet(config.aux_in_layer_size, config.num_classes), ) - def forward(self, raw_node_in, raw_node_out, graph_nodes_list, num_graphs, auxiliary_features, readout_mask=None): - assert graph_nodes_list is not None and auxiliary_features is not None, 'need those' + def forward( + self, + raw_node_in, + raw_node_out, + graph_nodes_list, + num_graphs, + auxiliary_features, + readout_mask=None, + ): + assert ( + graph_nodes_list is not None and auxiliary_features is not None + ), "need those" if readout_mask is not None: # mask first to only process the stuff that goes into the loss function! raw_node_in = raw_node_in[readout_mask] @@ -1156,7 +1312,9 @@ def forward(self, raw_node_in, raw_node_out, graph_nodes_list, num_graphs, auxil if self.aux_in_log1p: auxiliary_features.log1p_() - aux_by_node = torch.index_select(auxiliary_features, dim=0, index=graph_nodes_list) + aux_by_node = torch.index_select( + auxiliary_features, dim=0, index=graph_nodes_list + ) # info: the gate and regression include batch norm inside! 
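# Aside: the aux-feature broadcast used just above in isolation: per-graph
# auxiliary features are repeated onto every node of their graph via
# index_select on the batch vector, ready to be concatenated with the node
# states (values illustrative):

import torch

auxiliary_features = torch.tensor([[0.1, 0.9], [0.5, 0.5]])  # <num_graphs x aux_in_size>
graph_nodes_list = torch.tensor([0, 0, 0, 1, 1])             # graph id per node
aux_by_node = torch.index_select(auxiliary_features, dim=0, index=graph_nodes_list)
# aux_by_node is <num_nodes x aux_in_size>, one row per node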
gate_input = torch.cat((raw_node_in, raw_node_out, aux_by_node), dim=-1) @@ -1166,7 +1324,9 @@ def forward(self, raw_node_in, raw_node_out, graph_nodes_list, num_graphs, auxil graph_readout = None if self.has_graph_labels: - assert graph_nodes_list is not None and num_graphs is not None, 'has_graph_labels requires graph_nodes_list and num_graphs tensors.' + assert ( + graph_nodes_list is not None and num_graphs is not None + ), "has_graph_labels requires graph_nodes_list and num_graphs tensors." # aggregate via sums over graphs device = raw_node_out.device graph_readout = torch.zeros(num_graphs, self.num_classes, device=device) @@ -1179,7 +1339,7 @@ def forward(self, raw_node_in, raw_node_out, graph_nodes_list, num_graphs, auxil ############################ # GNN Input: Embedding Layers ############################ -#class NodeEmbeddingsForPretraining(nn.Module): +# class NodeEmbeddingsForPretraining(nn.Module): # """NodeEmbeddings with added embedding for [MASK] token.""" # # def __init__(self, config): @@ -1220,7 +1380,9 @@ def __init__(self, config, pretrained_embeddings=None): if config.inst2vec_embeddings == "constant": print("Using pre-trained inst2vec embeddings frozen.") assert pretrained_embeddings is not None - assert pretrained_embeddings.size()[0] == 8568, "Wrong number of embs; don't come here with MLM models!" + assert ( + pretrained_embeddings.size()[0] == 8568 + ), "Wrong number of embs; don't come here with MLM models!" self.node_embs = nn.Embedding.from_pretrained( pretrained_embeddings, freeze=True ) @@ -1233,7 +1395,9 @@ def __init__(self, config, pretrained_embeddings=None): elif config.inst2vec_embeddings == "finetune": print("Fine-tuning inst2vec embeddings") assert pretrained_embeddings is not None - assert pretrained_embeddings.size()[0] == 8568, "Wrong number of embs; don't come here with MLM models!" + assert ( + pretrained_embeddings.size()[0] == 8568 + ), "Wrong number of embs; don't come here with MLM models!" self.node_embs = nn.Embedding.from_pretrained( pretrained_embeddings, freeze=False ) @@ -1246,9 +1410,8 @@ def __init__(self, config, pretrained_embeddings=None): else: raise NotImplementedError(config.inst2vec_embeddings) - def forward(self, vocab_ids, *ignored_args, **ignored_kwargs): - if self.inst2vec_embeddings == 'none': + if self.inst2vec_embeddings == "none": # map IDs to 1 and everything else to 0 ids = (vocab_ids == 8565).to(torch.long) # !IDENTIFIER token id embs = self.node_embs(ids) @@ -1273,11 +1436,14 @@ class NodeEmbeddingsWithSelectors(NodeEmbeddings): Returns: node_states: """ + def __init__(self, config, pretrained_embeddings=None): super().__init__(config, pretrained_embeddings) self.node_embs = super().forward - assert config.use_selector_embeddings, "This Module is for use with use_selector_embeddings!" + assert ( + config.use_selector_embeddings + ), "This Module is for use with use_selector_embeddings!" selector_init = torch.tensor( # TODO(github.com/ChrisCummins/ProGraML/issues/27): x50 is maybe a @@ -1285,9 +1451,7 @@ def __init__(self, config, pretrained_embeddings=None): [[0, 50.0], [50.0, 0]], dtype=torch.get_default_dtype(), ) - self.selector_embs = nn.Embedding.from_pretrained( - selector_init, freeze=True - ) + self.selector_embs = nn.Embedding.from_pretrained(selector_init, freeze=True) def forward(self, vocab_ids, selector_ids): node_embs = self.node_embs(vocab_ids) @@ -1317,13 +1481,13 @@ def __init__(self, config): # class labels '-1' don't contribute to the gradient! 
# however in most cases it will be more efficient to gather # the relevant data into a dense tensor - self.loss = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean') - #loss = F.nll_loss( + self.loss = nn.CrossEntropyLoss(ignore_index=-1, reduction="mean") + # loss = F.nll_loss( # F.log_softmax(logits, dim=-1, dtype=torch.float32), # targets, # reduction='mean', # ignore_index=-1, - #) + # ) def forward(self, logits, targets): """inputs: (logits) or (logits, intermediate_logits)""" @@ -1331,7 +1495,7 @@ def forward(self, logits, targets): l = torch.sigmoid(logits[0]) logits = (l, logits[1]) loss = self.loss(logits[0].squeeze(dim=1), targets) - if getattr(self.config, 'has_aux_input', False): + if getattr(self.config, "has_aux_input", False): loss = loss + self.config.intermediate_loss_weight * self.loss( logits[1], targets ) @@ -1355,8 +1519,9 @@ def forward(self, logits, labels, runtimes=None): elif len(labels.size()) == 1: targets = labels else: - raise ValueError(f"labels={labels.size()} tensor is is neither 1 nor 2-dimensional. :/") - + raise ValueError( + f"labels={labels.size()} tensor is is neither 1 nor 2-dimensional. :/" + ) pred_targets = logits.argmax(dim=1) correct_preds = targets.eq(pred_targets).float() @@ -1365,14 +1530,17 @@ def forward(self, logits, labels, runtimes=None): ret = accuracy, correct_preds, targets if runtimes is not None: - assert runtimes.size() == logits.size(), \ - f"We need to have a runtime for each sample and every possible label!" \ + assert runtimes.size() == logits.size(), ( + f"We need to have a runtime for each sample and every possible label!" f"runtimes={runtimes.size()}, logits={logits.size()}." - #actual = runtimes[pred#torch.index_select(runtimes, dim=1, index=pred_targets) - actual = torch.gather(runtimes, dim=1, index=pred_targets.view(-1, 1)).squeeze() - #actual = runtimes[:, pred_targets] + ) + # actual = runtimes[pred#torch.index_select(runtimes, dim=1, index=pred_targets) + actual = torch.gather( + runtimes, dim=1, index=pred_targets.view(-1, 1) + ).squeeze() + # actual = runtimes[:, pred_targets] optimal = torch.gather(runtimes, dim=1, index=targets.view(-1, 1)).squeeze() - #optimal = runtimes[:, targets] + # optimal = runtimes[:, targets] ret += (actual, optimal) return ret diff --git a/programl/task/graph_level_classification/run.py b/programl/task/graph_level_classification/run.py index 9c399718d..1369fc944 100644 --- a/programl/task/graph_level_classification/run.py +++ b/programl/task/graph_level_classification/run.py @@ -34,100 +34,118 @@ """ -import time, os, json, sys +import json +import os +import sys +import time from pathlib import Path -from docopt import docopt -import tqdm import numpy as np import torch -from torch_geometric.data import DataLoader # (see below) +import tqdm +from docopt import docopt +from torch_geometric.data import DataLoader # (see below) # make this file executable from anywhere full_path = os.path.realpath(__file__) print(full_path) -REPO_ROOT = full_path.rsplit('ProGraML', maxsplit=1)[0] + 'ProGraML' +REPO_ROOT = full_path.rsplit("ProGraML", maxsplit=1)[0] + "ProGraML" print(REPO_ROOT) -#insert at 1, 0 is the script path (or '' in REPL) +# insert at 1, 0 is the script path (or '' in REPL) sys.path.insert(1, REPO_ROOT) REPO_ROOT = Path(REPO_ROOT) -from .dataloader import NodeLimitedDataLoader - -from .modeling import ( - GGNNModel, - GraphTransformerModel, -) +# Importing twice like this enables restoring +from . 
import configs, modeling from .configs import ( - ProGraMLBaseConfig, - GGNN_POJ104_Config, - GGNN_ForPretraining_Config, + GGNN_BranchPrediction_Config, GGNN_Devmap_Config, + GGNN_ForPretraining_Config, + GGNN_POJ104_Config, GGNN_Threadcoarsening_Config, - GGNN_BranchPrediction_Config, - GraphTransformer_POJ104_Config, - GraphTransformer_Devmap_Config, - GraphTransformer_Threadcoarsening_Config, GraphTransformer_BranchPrediction_Config, + GraphTransformer_Devmap_Config, GraphTransformer_ForPretraining_Config, + GraphTransformer_POJ104_Config, + GraphTransformer_Threadcoarsening_Config, + ProGraMLBaseConfig, ) - +from .dataloader import NodeLimitedDataLoader from .dataset import ( - POJ104Dataset, + BranchPredictionDataset, + DevmapDataset, NCCDataset, + POJ104Dataset, ThreadcoarseningDataset, - DevmapDataset, - BranchPredictionDataset, ) - -# Importing twice like this enables restoring -from . import modeling -from . import configs - - - +from .modeling import GGNNModel, GraphTransformerModel # Slurm gives us among others: SLURM_JOBID, SLURM_JOB_NAME, # SLURM_JOB_DEPENDENCY (set to the value of the --dependency option) -if os.environ.get('SLURM_JOBID'): - print('SLURM_JOB_NAME', os.environ.get('SLURM_JOB_NAME', '')) - print('SLURM_JOBID', os.environ.get('SLURM_JOBID', '')) - RUN_ID = "_".join([os.environ.get('SLURM_JOB_NAME', ''), os.environ.get('SLURM_JOBID')]) +if os.environ.get("SLURM_JOBID"): + print("SLURM_JOB_NAME", os.environ.get("SLURM_JOB_NAME", "")) + print("SLURM_JOBID", os.environ.get("SLURM_JOBID", "")) + RUN_ID = "_".join( + [os.environ.get("SLURM_JOB_NAME", ""), os.environ.get("SLURM_JOBID")] + ) else: RUN_ID = str(os.getpid()) - - MODEL_CLASSES = { - 'ggnn_poj104': (GGNNModel, GGNN_POJ104_Config), - 'ggnn_devmap': (GGNNModel, GGNN_Devmap_Config), - 'ggnn_threadcoarsening': (GGNNModel, GGNN_Threadcoarsening_Config), - 'ggnn_branch_prediction': (GGNNModel, GGNN_BranchPrediction_Config), - 'ggnn_pretraining': (GGNNModel, GGNN_ForPretraining_Config), - 'transformer_poj104': (GraphTransformerModel, GraphTransformer_POJ104_Config), - 'transformer_devmap': (GraphTransformerModel, GraphTransformer_Devmap_Config), - 'transformer_threadcoarsening': (GraphTransformerModel, GraphTransformer_Threadcoarsening_Config), - 'transformer_branch_prediction': (GraphTransformerModel, GraphTransformer_BranchPrediction_Config), - 'transformer_pretraining': (GraphTransformerModel, GraphTransformer_ForPretraining_Config), + "ggnn_poj104": (GGNNModel, GGNN_POJ104_Config), + "ggnn_devmap": (GGNNModel, GGNN_Devmap_Config), + "ggnn_threadcoarsening": (GGNNModel, GGNN_Threadcoarsening_Config), + "ggnn_branch_prediction": (GGNNModel, GGNN_BranchPrediction_Config), + "ggnn_pretraining": (GGNNModel, GGNN_ForPretraining_Config), + "transformer_poj104": (GraphTransformerModel, GraphTransformer_POJ104_Config), + "transformer_devmap": (GraphTransformerModel, GraphTransformer_Devmap_Config), + "transformer_threadcoarsening": ( + GraphTransformerModel, + GraphTransformer_Threadcoarsening_Config, + ), + "transformer_branch_prediction": ( + GraphTransformerModel, + GraphTransformer_BranchPrediction_Config, + ), + "transformer_pretraining": ( + GraphTransformerModel, + GraphTransformer_ForPretraining_Config, + ), } -DATASET_CLASSES = { #DS, default data_dir, - 'poj104': (POJ104Dataset, 'deeplearning/ml4pl/poj104/classifyapp_data'), - 'ncc': (NCCDataset, 'deeplearning/ml4pl/poj104/ncc_data'), - 'devmap_amd': (DevmapDataset, 'deeplearning/ml4pl/poj104/devmap_data'), - 'devmap_nvidia': (DevmapDataset, 
'deeplearning/ml4pl/poj104/devmap_data'), - 'threadcoarsening_Cypress': (ThreadcoarseningDataset, 'deeplearning/ml4pl/poj104/threadcoarsening_data'), - 'threadcoarsening_Tahiti': (ThreadcoarseningDataset, 'deeplearning/ml4pl/poj104/threadcoarsening_data'), - 'threadcoarsening_Fermi': (ThreadcoarseningDataset, 'deeplearning/ml4pl/poj104/threadcoarsening_data'), - 'threadcoarsening_Kepler': (ThreadcoarseningDataset, 'deeplearning/ml4pl/poj104/threadcoarsening_data'), - 'branch_prediction': (BranchPredictionDataset, 'deeplearning/ml4pl/poj104/branch_prediction_data'), +DATASET_CLASSES = { # DS, default data_dir, + "poj104": (POJ104Dataset, "deeplearning/ml4pl/poj104/classifyapp_data"), + "ncc": (NCCDataset, "deeplearning/ml4pl/poj104/ncc_data"), + "devmap_amd": (DevmapDataset, "deeplearning/ml4pl/poj104/devmap_data"), + "devmap_nvidia": (DevmapDataset, "deeplearning/ml4pl/poj104/devmap_data"), + "threadcoarsening_Cypress": ( + ThreadcoarseningDataset, + "deeplearning/ml4pl/poj104/threadcoarsening_data", + ), + "threadcoarsening_Tahiti": ( + ThreadcoarseningDataset, + "deeplearning/ml4pl/poj104/threadcoarsening_data", + ), + "threadcoarsening_Fermi": ( + ThreadcoarseningDataset, + "deeplearning/ml4pl/poj104/threadcoarsening_data", + ), + "threadcoarsening_Kepler": ( + ThreadcoarseningDataset, + "deeplearning/ml4pl/poj104/threadcoarsening_data", + ), + "branch_prediction": ( + BranchPredictionDataset, + "deeplearning/ml4pl/poj104/branch_prediction_data", + ), } DEBUG = False if DEBUG: torch.autograd.set_detect_anomaly(True) + class Learner(object): def __init__(self, model, dataset, args=None, current_kfold_split=None): # Make class work without file being run as main @@ -138,23 +156,25 @@ def __init__(self, model, dataset, args=None, current_kfold_split=None): # prepare logging self.parent_run_id = None # for restored models self.run_id = f"{time.strftime('%Y-%m-%d_%H:%M:%S')}_{RUN_ID}" - if args['--kfold']: - self.run_id += f'_{current_kfold_split}' + if args["--kfold"]: + self.run_id += f"_{current_kfold_split}" - log_dir = REPO_ROOT / self.args.get("--log_dir", '.') + log_dir = REPO_ROOT / self.args.get("--log_dir", ".") log_dir.mkdir(parents=True, exist_ok=True) self.log_file = log_dir / f"{self.run_id}_log.json" self.best_model_file = log_dir / f"{self.run_id}_model_best.pickle" self.last_model_file = log_dir / f"{self.run_id}_model_last.pickle" # ~~~~~~~~~~ load model ~~~~~~~~~~~~~ - if self.args.get('--restore'): - self.model = self.restore_model(path=REPO_ROOT / self.args['--restore']) - elif self.args.get('--restore_by_pattern'): - self.model = self.restore_by_pattern(pattern=self.args['--restore_by_pattern'], - log_dir=log_dir, - current_kfold_split=current_kfold_split) - else: # initialize fresh model + if self.args.get("--restore"): + self.model = self.restore_model(path=REPO_ROOT / self.args["--restore"]) + elif self.args.get("--restore_by_pattern"): + self.model = self.restore_by_pattern( + pattern=self.args["--restore_by_pattern"], + log_dir=log_dir, + current_kfold_split=current_kfold_split, + ) + else: # initialize fresh model # get model and dataset assert model, "Need to provide --model to initialize freshly." 
Model, Config = MODEL_CLASSES[model] @@ -166,7 +186,7 @@ def __init__(self, model, dataset, args=None, current_kfold_split=None): params = self.parse_config_params(args) self.config = Config.from_dict(params=params) - test_only = self.args.get('--test', False) + test_only = self.args.get("--test", False) self.model = Model(self.config, test_only=test_only) # set seeds, NB: the NN on CUDA is partially non-deterministic! @@ -174,12 +194,11 @@ def __init__(self, model, dataset, args=None, current_kfold_split=None): np.random.seed(self.config.random_seed) # ~~~~~~~~~~ transfer model ~~~~~~~~ - if self.args['--transfer'] is not None: - self.transfer_model(self.args['--transfer'], self.args['--transfer_mode']) - + if self.args["--transfer"] is not None: + self.transfer_model(self.args["--transfer"], self.args["--transfer_mode"]) # ~~~~~~~~~~ load data ~~~~~~~~~~~~~ - self.load_data(dataset, args['--kfold'], current_kfold_split) + self.load_data(dataset, args["--kfold"], current_kfold_split) # log config to file config_dict = self.config.to_dict() @@ -189,11 +208,14 @@ def __init__(self, model, dataset, args=None, current_kfold_split=None): # log parent run to file if run was restored if self.parent_run_id: with open(log_dir / f"{self.run_id}_parent.json", "w") as f: - json.dump({ - "parent": self.parent_run_id, - "self": self.run_id, - "self_config": config_dict, - }, f) + json.dump( + { + "parent": self.parent_run_id, + "self": self.run_id, + "self_config": config_dict, + }, + f, + ) print( "Run %s starting with following parameters:\n%s" @@ -202,131 +224,184 @@ def __init__(self, model, dataset, args=None, current_kfold_split=None): def load_data(self, dataset, kfold, current_kfold_split): """Set self.train_data, self.test_data, self.valid_data depending on the dataset used.""" - if not kfold: assert current_kfold_split is None - if '_' in dataset: - split = dataset.rsplit('_', maxsplit=1)[-1] + if not kfold: + assert current_kfold_split is None + if "_" in dataset: + split = dataset.rsplit("_", maxsplit=1)[-1] Dataset, data_dir = DATASET_CLASSES[dataset] - if self.args.get('--data_dir', '.'): - self.data_dir = REPO_ROOT / self.args.get('--data_dir', '.') + if self.args.get("--data_dir", "."): + self.data_dir = REPO_ROOT / self.args.get("--data_dir", ".") else: self.data_dir = REPO_ROOT / data_dir # Switch cases by dataset # ~~~~~~~~~~ NCC ~~~~~~~~~~~~~~~~~~~~~ - if dataset == 'ncc': + if dataset == "ncc": # train set - if not self.args.get('--test'): + if not self.args.get("--test"): # take train_subset=[90,100] as validation data if self.config.train_subset == [0, 100]: print(f"!!!!!!!! WARNING !!!!!!!!!!!!") print(f"SETTING TRAIN_SUBSET FROM [0,100] TO [0, 90]") print(f"!!!!!!!! 
WARNING !!!!!!!!!!!!") - self.config.train_subset = [0,90] - train_dataset = Dataset(root=self.data_dir, split='train', train_subset=self.config.train_subset) - train_dataset = train_dataset.filter_max_num_nodes(self.config.max_num_nodes) - self.train_data = NodeLimitedDataLoader(train_dataset, - batch_size=self.config.batch_size, - shuffle=True, - max_num_nodes=self.config.max_num_nodes, - warn_on_limit=True, - ) + self.config.train_subset = [0, 90] + train_dataset = Dataset( + root=self.data_dir, + split="train", + train_subset=self.config.train_subset, + ) + train_dataset = train_dataset.filter_max_num_nodes( + self.config.max_num_nodes + ) + self.train_data = NodeLimitedDataLoader( + train_dataset, + batch_size=self.config.batch_size, + shuffle=True, + max_num_nodes=self.config.max_num_nodes, + warn_on_limit=True, + ) # valid set (and test set) - valid_dataset = Dataset(root=self.data_dir, split='train', train_subset=[90,100]) - valid_dataset = valid_dataset.filter_max_num_nodes(self.config.max_num_nodes) - self.valid_data = DataLoader(valid_dataset, batch_size=self.config.batch_size * 2, shuffle=False) + valid_dataset = Dataset( + root=self.data_dir, split="train", train_subset=[90, 100] + ) + valid_dataset = valid_dataset.filter_max_num_nodes( + self.config.max_num_nodes + ) + self.valid_data = DataLoader( + valid_dataset, batch_size=self.config.batch_size * 2, shuffle=False + ) self.test_data = None # ~~~~~~~~~~ POJ 104 ~~~~~~~~~~~~~~~~~~~~~ - elif dataset == 'poj104': - if not self.args.get('--test'): - train_dataset = Dataset(root=self.data_dir, - split='train', - train_subset=self.config.train_subset, - cdfg=self.config.cdfg_vocab, - ablation_vocab=self.config.ablation_vocab) - self.train_data = DataLoader(train_dataset, - batch_size=self.config.batch_size, - shuffle=True, - #max_num_nodes=self.config.max_num_nodes - ) - - self.valid_data = DataLoader(Dataset(root=self.data_dir, - split='val', - cdfg=self.config.cdfg_vocab, - ablation_vocab=self.config.ablation_vocab), - batch_size=self.config.batch_size * 2, - shuffle=False - ) - self.test_data = DataLoader(Dataset(root=self.data_dir, - split='test', - cdfg=self.config.cdfg_vocab, - ablation_vocab=self.config.ablation_vocab), - batch_size=self.config.batch_size * 2, - shuffle=False - ) + elif dataset == "poj104": + if not self.args.get("--test"): + train_dataset = Dataset( + root=self.data_dir, + split="train", + train_subset=self.config.train_subset, + cdfg=self.config.cdfg_vocab, + ablation_vocab=self.config.ablation_vocab, + ) + self.train_data = DataLoader( + train_dataset, + batch_size=self.config.batch_size, + shuffle=True, + # max_num_nodes=self.config.max_num_nodes + ) + + self.valid_data = DataLoader( + Dataset( + root=self.data_dir, + split="val", + cdfg=self.config.cdfg_vocab, + ablation_vocab=self.config.ablation_vocab, + ), + batch_size=self.config.batch_size * 2, + shuffle=False, + ) + self.test_data = DataLoader( + Dataset( + root=self.data_dir, + split="test", + cdfg=self.config.cdfg_vocab, + ablation_vocab=self.config.ablation_vocab, + ), + batch_size=self.config.batch_size * 2, + shuffle=False, + ) # ~~~~~~~~~~ DEVMAP ~~~~~~~~~~~~~~~~~~~~~ - elif dataset in ['devmap_amd', 'devmap_nvidia']: - assert kfold and current_kfold_split is not None, "Devmap only supported with kfold flag!" + elif dataset in ["devmap_amd", "devmap_nvidia"]: + assert ( + kfold and current_kfold_split is not None + ), "Devmap only supported with kfold flag!" 
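# Aside: Dataset.return_cross_validation_splits, used just below, lives in
# dataset.py and is not part of this patch; a hedged sketch of the k-fold
# behaviour it must provide (the real fold assignment may differ):

import torch

def cross_validation_splits_sketch(dataset, split, num_splits=10):
    """Fold `split` becomes validation, the rest training. Assumes a
    torch_geometric dataset indexable by a LongTensor."""
    fold = torch.arange(len(dataset)) % num_splits
    train_idx = (fold != split).nonzero(as_tuple=True)[0]
    valid_idx = (fold == split).nonzero(as_tuple=True)[0]
    return dataset[train_idx], dataset[valid_idx]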
assert current_kfold_split < 10 # get the whole dataset then get the correct split - ds = Dataset(root=self.data_dir, - split=split, - train_subset=self.config.train_subset, - cdfg=self.config.cdfg, - ablation_vocab=self.config.ablation_vocab) - train_dataset, valid_dataset = ds.return_cross_validation_splits(current_kfold_split) + ds = Dataset( + root=self.data_dir, + split=split, + train_subset=self.config.train_subset, + cdfg=self.config.cdfg, + ablation_vocab=self.config.ablation_vocab, + ) + train_dataset, valid_dataset = ds.return_cross_validation_splits( + current_kfold_split + ) self.train_data = None - self.valid_data = DataLoader(valid_dataset, batch_size=self.config.batch_size * 2, shuffle=False) + self.valid_data = DataLoader( + valid_dataset, batch_size=self.config.batch_size * 2, shuffle=False + ) # only maybe set train_data. - if not self.args.get('--test'): - self.train_data = DataLoader(train_dataset, - batch_size=self.config.batch_size, - shuffle=True, - ) + if not self.args.get("--test"): + self.train_data = DataLoader( + train_dataset, + batch_size=self.config.batch_size, + shuffle=True, + ) self.test_data = None # ~~~~~~~~~~ THREADCOARSENING ~~~~~~~~~~~~~~~~~~~~~ - elif dataset in ['threadcoarsening' + '_' + s for s in ['Cypress', 'Tahiti', 'Fermi', 'Kepler']]: - assert kfold and current_kfold_split is not None, "Threadcoarsening only supported with kfold flag!" + elif dataset in [ + "threadcoarsening" + "_" + s + for s in ["Cypress", "Tahiti", "Fermi", "Kepler"] + ]: + assert ( + kfold and current_kfold_split is not None + ), "Threadcoarsening only supported with kfold flag!" assert current_kfold_split < 17 and current_kfold_split >= 0 - if not self.args.get('--test'): + if not self.args.get("--test"): pass # get the whole dataset then get the correct split - ds = Dataset(root=self.data_dir, split=split, train_subset=self.config.train_subset) - train_dataset, valid_dataset = ds.return_cross_validation_splits(current_kfold_split) + ds = Dataset( + root=self.data_dir, split=split, train_subset=self.config.train_subset + ) + train_dataset, valid_dataset = ds.return_cross_validation_splits( + current_kfold_split + ) self.train_data = None - self.valid_data = DataLoader(valid_dataset, batch_size=self.config.batch_size * 2, shuffle=False) + self.valid_data = DataLoader( + valid_dataset, batch_size=self.config.batch_size * 2, shuffle=False + ) # only maybe set train_data. - if not self.args.get('--test'): - self.train_data = DataLoader(train_dataset, - batch_size=self.config.batch_size, - shuffle=True, - ) + if not self.args.get("--test"): + self.train_data = DataLoader( + train_dataset, + batch_size=self.config.batch_size, + shuffle=True, + ) self.test_data = None # ~~~~~~~~~~~~ Branch Prediction ~~~~~~~~~~~~~~~~~~~~ - elif dataset in ['branch_prediction']: - assert kfold and current_kfold_split is not None, "Branch Prediction only supported with kfold flag!" + elif dataset in ["branch_prediction"]: + assert ( + kfold and current_kfold_split is not None + ), "Branch Prediction only supported with kfold flag!" 
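# Aside: NodeLimitedDataLoader comes from dataloader.py and is not shown in
# this patch; a hedged sketch of the batching idea it implements for the
# loader used just below (the real class wraps torch_geometric's DataLoader
# and may differ in detail):

def limit_batch_sketch(graphs, max_num_nodes, warn_on_limit=False):
    """Drop graphs from a batch once the running node count would exceed
    max_num_nodes, so a few huge graphs cannot exhaust GPU memory."""
    kept, total = [], 0
    for g in graphs:
        if total + g.num_nodes > max_num_nodes:
            if warn_on_limit:
                print(f"dropping graph with {g.num_nodes} nodes from batch")
            continue
        kept.append(g)
        total += g.num_nodes
    return kept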
assert current_kfold_split < 10 # train set - ds = Dataset(root=self.data_dir, split='train', train_subset=self.config.train_subset) + ds = Dataset( + root=self.data_dir, split="train", train_subset=self.config.train_subset + ) ds = ds.filter_max_num_nodes(self.config.max_num_nodes) - - train_dataset, valid_dataset = ds.return_cross_validation_splits(current_kfold_split) - #train_dataset.filter_max_num_nodes(self.config.max_num_nodes) - #valid_dataset.filter_max_num_nodes(self.config.max_num_nodes) - self.train_data = NodeLimitedDataLoader(train_dataset, - batch_size=self.config.batch_size, - shuffle=True, - max_num_nodes=self.config.max_num_nodes, - warn_on_limit=False, + + train_dataset, valid_dataset = ds.return_cross_validation_splits( + current_kfold_split ) - self.valid_data = DataLoader(valid_dataset, batch_size=self.config.batch_size, shuffle=False) - + # train_dataset.filter_max_num_nodes(self.config.max_num_nodes) + # valid_dataset.filter_max_num_nodes(self.config.max_num_nodes) + self.train_data = NodeLimitedDataLoader( + train_dataset, + batch_size=self.config.batch_size, + shuffle=True, + max_num_nodes=self.config.max_num_nodes, + warn_on_limit=False, + ) + self.valid_data = DataLoader( + valid_dataset, batch_size=self.config.batch_size, shuffle=False + ) + self.test_data = None # ~~~~~~~~~~~ Unknow Dataset ~~~~~~~~~~~~~~~~~ else: @@ -335,13 +410,18 @@ def load_data(self, dataset, kfold, current_kfold_split): def parse_config_params(self, args): """Accesses self.args to parse config params from various flags.""" params = None - if args.get('--config'): - with open(REPO_ROOT / args['--config'], 'r') as f: + if args.get("--config"): + with open(REPO_ROOT / args["--config"], "r") as f: params = json.load(f) - elif args.get('--config_json'): - config_string = args['--config_json'] + elif args.get("--config_json"): + config_string = args["--config_json"] # accept single quoted 'json'. This only works bc our json strings are simple enough. 
- config_string = config_string.replace("\\'", "'").replace("'", '"').replace('True', 'true').replace('False', 'false') + config_string = ( + config_string.replace("\\'", "'") + .replace("'", '"') + .replace("True", "true") + .replace("False", "false") + ) params = json.loads(config_string) return params @@ -350,55 +430,55 @@ def data2input(self, batch): num_graphs = batch.batch[-1].item() + 1 edge_lists = [] - edge_positions = [] if getattr(self.config, 'position_embeddings', False) else None + edge_positions = ( + [] if getattr(self.config, "position_embeddings", False) else None + ) edge_indices = list(range(3)) if self.config.ablate_structure: - if self.config.ablate_structure == 'control': + if self.config.ablate_structure == "control": edge_indices[0] = -1 - elif self.config.ablate_structure == 'data': + elif self.config.ablate_structure == "data": edge_indices[1] = -1 - elif self.config.ablate_structure == 'call': + elif self.config.ablate_structure == "call": edge_indices[2] = -1 else: raise ValueError("unreachable") for i in edge_indices: # mask by edge type - mask = batch.edge_attr[:, 0] == i # + mask = batch.edge_attr[:, 0] == i # edge_list = batch.edge_index[:, mask].t() # edge_lists.append(edge_list) - if getattr(self.config, 'position_embeddings', False): - edge_pos = batch.edge_attr[mask, 1] # + if getattr(self.config, "position_embeddings", False): + edge_pos = batch.edge_attr[mask, 1] # edge_positions.append(edge_pos) inputs = { - "vocab_ids": batch.x[:,0], + "vocab_ids": batch.x[:, 0], "edge_lists": edge_lists, "pos_lists": edge_positions, "num_graphs": num_graphs, "graph_nodes_list": batch.batch, - "node_types": batch.x[:,1], + "node_types": batch.x[:, 1], } # maybe add labels if batch.y is not None: - inputs.update({ - "labels": batch.y, - }) + inputs.update( + { + "labels": batch.y, + } + ) # add other stuff - if hasattr(batch, 'aux_in'): - inputs.update({ - "aux_in": batch.aux_in.to(dtype=torch.float) - }) - if hasattr(batch, 'runtimes'): - inputs.update({ - "runtimes": batch.runtimes.to(dtype=torch.float) - }) + if hasattr(batch, "aux_in"): + inputs.update({"aux_in": batch.aux_in.to(dtype=torch.float)}) + if hasattr(batch, "runtimes"): + inputs.update({"runtimes": batch.runtimes.to(dtype=torch.float)}) return inputs - + def make_branch_labels(self, batch): """takes a batch and maps the profile info to branch labels for regression: a branch has (true_weight+1, false_weight+1, total_weight+2) and we map to [0, 1] as @@ -407,9 +487,13 @@ def make_branch_labels(self, batch): """ mask = batch.profile_info[:, 0].bool() # clamp to be robust against 0 counts from problems with the data - yes = torch.clamp(batch.profile_info[:,1].to(dtype=torch.get_default_dtype()) - 1, min=0.0) - total = 1e-7 + torch.clamp(batch.profile_info[:,3].to(torch.get_default_dtype()) - 2, min=0.0) - p_yes = yes / total# true / total + yes = torch.clamp( + batch.profile_info[:, 1].to(dtype=torch.get_default_dtype()) - 1, min=0.0 + ) + total = 1e-7 + torch.clamp( + batch.profile_info[:, 3].to(torch.get_default_dtype()) - 2, min=0.0 + ) + p_yes = yes / total # true / total p_yes = torch.clamp(p_yes, min=0.0, max=1.0) # print([str(a) for a in p_yes[mask].clone().detach().to('cpu').numpy()]) return p_yes, mask @@ -423,7 +507,9 @@ def bertify_batch(self, batch, config): device = vocab_ids.device # we create a tensor that carries the probability of being masked for each node - probabilities = torch.full(vocab_ids.size(), config.mlm_probability, device=device) + probabilities = torch.full( + 
vocab_ids.size(), config.mlm_probability, device=device + ) # set to 0.0 where nodes are !IDENTIFIERS, i.e. node_types == 1 if config.mlm_statements_only: probabilities.masked_fill_(node_types.bool(), 0.0) @@ -434,15 +520,24 @@ def bertify_batch(self, batch, config): # get the node mask that determines the nodes we use as targets mlm_target_mask = torch.bernoulli(probabilities).bool() # of those, get the 80% where the input is masked - masked_out_nodes = torch.bernoulli(torch.full(vocab_ids.size(), 0.8, device=device)).bool() & mlm_target_mask + masked_out_nodes = ( + torch.bernoulli(torch.full(vocab_ids.size(), 0.8, device=device)).bool() + & mlm_target_mask + ) # the 10% where it's set to a random token # (as 50% of the target nodes that are not masked out) - random_nodes = torch.bernoulli(torch.full(vocab_ids.size(), 0.5, device=device)).bool() & mlm_target_mask & ~masked_out_nodes + random_nodes = ( + torch.bernoulli(torch.full(vocab_ids.size(), 0.5, device=device)).bool() + & mlm_target_mask + & ~masked_out_nodes + ) # and the 10% where it's the original id, we just leave alone. # apply the changes - random_ids = torch.randint(config.vocab_size, vocab_ids.shape, dtype=torch.long, device=device) + random_ids = torch.randint( + config.vocab_size, vocab_ids.shape, dtype=torch.long, device=device + ) vocab_ids[masked_out_nodes] = config.mlm_mask_token_id vocab_ids[random_nodes] = random_ids[random_nodes] # the loss function can ignore -1 labels for gradients, @@ -460,7 +555,7 @@ def run_epoch(self, loader, epoch_type, analysis_mode=False): loss, accuracy, instance_per_second """ - bar = tqdm.tqdm(total=len(loader.dataset), smoothing=0.01, unit='inst') + bar = tqdm.tqdm(total=len(loader.dataset), smoothing=0.01, unit="inst") if analysis_mode: saved_outputs = [] @@ -476,32 +571,46 @@ def run_epoch(self, loader, epoch_type, analysis_mode=False): batch.to(self.model.dev) inputs = self.data2input(batch) - num_graphs = inputs['num_graphs'] - - # only implemented nodewise model are for pretraining currently - if self.config.name in ['GGNN_ForPretraining_Config', 'GraphTransformer_ForPretraining_Config']: - mlm_vocab_ids, mlm_labels, mlm_target_mask = self.bertify_batch(batch, self.config) - inputs.update({ - 'vocab_ids': mlm_vocab_ids, - 'labels': mlm_labels, - 'readout_mask': mlm_target_mask, - }) + num_graphs = inputs["num_graphs"] + + # only implemented nodewise model are for pretraining currently + if self.config.name in [ + "GGNN_ForPretraining_Config", + "GraphTransformer_ForPretraining_Config", + ]: + mlm_vocab_ids, mlm_labels, mlm_target_mask = self.bertify_batch( + batch, self.config + ) + inputs.update( + { + "vocab_ids": mlm_vocab_ids, + "labels": mlm_labels, + "readout_mask": mlm_target_mask, + } + ) num_targets = torch.sum(mlm_target_mask.to(torch.long)).item() - elif self.config.name in ['GGNN_BranchPrediction_Config', 'GraphTransformer_BranchPrediction_Config']: + elif self.config.name in [ + "GGNN_BranchPrediction_Config", + "GraphTransformer_BranchPrediction_Config", + ]: y, mask = self.make_branch_labels(batch) - inputs.update({ - 'labels': y, - 'readout_mask': mask, - }) + inputs.update( + { + "labels": y, + "readout_mask": mask, + } + ) if not torch.any(mask): - print('Warning: batch has no labels! skipping.......') + print("Warning: batch has no labels! skipping.......") continue num_targets = torch.sum(mask.to(torch.long)).item() # elif: other nodewise configs go here! 
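# Aside: a quick sanity check of the 80/10/10 split produced by the
# bernoulli masks in bertify_batch above, with an illustrative
# mlm_probability of 0.15 over 10k nodes:

import torch

torch.manual_seed(0)
n = 10000
target = torch.bernoulli(torch.full((n,), 0.15)).bool()
masked = torch.bernoulli(torch.full((n,), 0.8)).bool() & target
random_ = torch.bernoulli(torch.full((n,), 0.5)).bool() & target & ~masked
kept = target & ~masked & ~random_
# of the ~15% target nodes: ~80% masked out, ~10% randomized, ~10% left
# unchanged (50% of the non-masked 20%); all three still receive a label.
print(masked.sum().item(), random_.sum().item(), kept.sum().item())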
- elif getattr(self.config, 'has_graph_labels', False): # all graph models + elif getattr(self.config, "has_graph_labels", False): # all graph models num_targets = num_graphs else: - raise NotImplementedError("We don't have other nodewise models currently.") + raise NotImplementedError( + "We don't have other nodewise models currently." + ) predicted_targets += num_targets processed_graphs += num_graphs @@ -526,13 +635,27 @@ def run_epoch(self, loader, epoch_type, analysis_mode=False): # TODO I don't know whether the outputs are properly cloned, moved to cpu and detached or not. saved_outputs.append(outputs) - if hasattr(batch, 'runtimes'): - (logits, accuracy, correct, targets, actual_rt, optimal_rt, graph_features, *unroll_stats, + if hasattr(batch, "runtimes"): + ( + logits, + accuracy, + correct, + targets, + actual_rt, + optimal_rt, + graph_features, + *unroll_stats, ) = outputs epoch_actual_rt += torch.sum(actual_rt).item() epoch_optimal_rt += torch.sum(optimal_rt).item() else: - (logits, accuracy, correct, targets, graph_features, *unroll_stats, + ( + logits, + accuracy, + correct, + targets, + graph_features, + *unroll_stats, ) = outputs loss = self.model.loss((logits, graph_features), targets) @@ -563,7 +686,14 @@ def run_epoch(self, loader, epoch_type, analysis_mode=False): instance_per_sec = processed_graphs / (time.time() - start_time) epoch_perplexity = np.exp(mean_loss) - returns = (mean_loss, mean_accuracy, instance_per_sec, epoch_perplexity, epoch_actual_rt, epoch_optimal_rt) + returns = ( + mean_loss, + mean_accuracy, + instance_per_sec, + epoch_perplexity, + epoch_actual_rt, + epoch_optimal_rt, + ) if analysis_mode: returns += (saved_outputs,) @@ -592,20 +722,44 @@ def train(self): for epoch in range(self.current_epoch, target_epoch): print(f"== Epoch {epoch}/{target_epoch}") - train_loss, train_acc, train_speed, train_ppl, train_art, train_ort = self.run_epoch( - self.train_data, "train" - ) + ( + train_loss, + train_acc, + train_speed, + train_ppl, + train_art, + train_ort, + ) = self.run_epoch(self.train_data, "train") print( "\r\x1b[K Train: loss: %.5f | acc: %s | ppl: %s | instances/sec: %.2f | runtime: %.1f opt: %.1f" - % (train_loss, f"{train_acc:.5f}", train_ppl, train_speed, train_art, train_ort) + % ( + train_loss, + f"{train_acc:.5f}", + train_ppl, + train_speed, + train_art, + train_ort, + ) ) - valid_loss, valid_acc, valid_speed, valid_ppl, valid_art, valid_ort = self.run_epoch( - self.valid_data, "eval" - ) + ( + valid_loss, + valid_acc, + valid_speed, + valid_ppl, + valid_art, + valid_ort, + ) = self.run_epoch(self.valid_data, "eval") print( "\r\x1b[K Valid: loss: %.5f | acc: %s | ppl: %s | instances/sec: %.2f | runtime: %.1f opt: %.1f" - % (valid_loss, f"{valid_acc:.5f}", valid_ppl, valid_speed, valid_art, valid_ort) + % ( + valid_loss, + f"{valid_acc:.5f}", + valid_ppl, + valid_speed, + valid_art, + valid_ort, + ) ) # maybe run test epoch @@ -624,12 +778,28 @@ def train(self): log_entry = { "epoch": epoch, "time": epoch_time, - "train_results": (train_loss, train_acc, train_speed, train_ppl, train_art, train_ort), - "valid_results": (valid_loss, valid_acc, valid_speed, valid_ppl, valid_art, valid_ort), + "train_results": ( + train_loss, + train_acc, + train_speed, + train_ppl, + train_art, + train_ort, + ), + "valid_results": ( + valid_loss, + valid_acc, + valid_speed, + valid_ppl, + valid_art, + valid_ort, + ), } if self.test_data is not None: - log_entry.update({"test_results": (test_loss, test_acc, test_speed, test_ppl)}) + log_entry.update( + 
{"test_results": (test_loss, test_acc, test_speed, test_ppl)} + ) log_to_save.append(log_entry) @@ -652,7 +822,7 @@ def train(self): % self.config.patience ) break - if not self.args['--skip_save_every_epoch']: + if not self.args["--skip_save_every_epoch"]: self.save_model(epoch, self.last_model_file) # save last model on finish of training self.save_model(epoch, self.last_model_file) @@ -663,12 +833,24 @@ def test(self): print(f"== Epoch: Test only run.") - valid_loss, valid_acc, valid_speed, valid_ppl, valid_art, valid_ort = self.run_epoch( - self.valid_data, "eval" - ) + ( + valid_loss, + valid_acc, + valid_speed, + valid_ppl, + valid_art, + valid_ort, + ) = self.run_epoch(self.valid_data, "eval") print( "\r\x1b[K Valid: loss: %.5f | acc: %s | ppl: %s | instances/sec: %.2f | runtime: %.1f opt: %.1f" - % (valid_loss, f"{valid_acc:.5f}", valid_ppl, valid_speed, valid_art, valid_ort) + % ( + valid_loss, + f"{valid_acc:.5f}", + valid_ppl, + valid_speed, + valid_art, + valid_ort, + ) ) if self.test_data is not None: @@ -683,12 +865,21 @@ def test(self): epoch_time = time.time() - total_time_start log_entry = { - "epoch": 'test_only', + "epoch": "test_only", "time": epoch_time, - "valid_results": (valid_loss, valid_acc, valid_speed, valid_ppl, valid_art, valid_ort), + "valid_results": ( + valid_loss, + valid_acc, + valid_speed, + valid_ppl, + valid_art, + valid_ort, + ), } if self.test_data is not None: - log_entry.update({"test_results": (test_loss, test_acc, test_speed, test_ppl)}) + log_entry.update( + {"test_results": (test_loss, test_acc, test_speed, test_ppl)} + ) log_to_save.append(log_entry) with open(self.log_file, "w") as f: @@ -696,13 +887,13 @@ def test(self): def save_model(self, epoch, path): checkpoint = { - 'run_id': self.run_id, - 'global_training_step': self.global_training_step, - 'epoch': epoch, - 'config': self.config.to_dict(), - 'model_name': self.model.__class__.__name__, - 'model_state_dict': self.model.state_dict(), - 'optimizer_state_dict': self.model.opt.state_dict(), + "run_id": self.run_id, + "global_training_step": self.global_training_step, + "epoch": epoch, + "config": self.config.to_dict(), + "model_name": self.model.__class__.__name__, + "model_state_dict": self.model.state_dict(), + "optimizer_state_dict": self.model.opt.state_dict(), } torch.save(checkpoint, path) @@ -713,48 +904,67 @@ def restore_by_pattern(self, pattern, log_dir, current_kfold_split=None): Therefore the split should not be part of the pattern. """ if current_kfold_split is not None: - checkpoints = list(log_dir.glob(f"*{pattern}*_{current_kfold_split}_model_*.p*")) + checkpoints = list( + log_dir.glob(f"*{pattern}*_{current_kfold_split}_model_*.p*") + ) else: checkpoints = list(log_dir.glob(f"*{pattern}*_model_*.p*")) last_mod_checkpoint = sorted(checkpoints, key=os.path.getmtime)[-1] - assert last_mod_checkpoint.is_file(), f"Couldn't restore by jobname: No model files matching <{pattern}> found." + assert ( + last_mod_checkpoint.is_file() + ), f"Couldn't restore by jobname: No model files matching <{pattern}> found." 
return self.restore_model(last_mod_checkpoint) def restore_model(self, path): """loads and restores a model from file.""" checkpoint = torch.load(path) - self.parent_run_id = checkpoint['run_id'] - self.global_training_step = checkpoint['global_training_step'] - self.current_epoch = checkpoint['epoch'] - - config_dict = checkpoint['config'] if isinstance(checkpoint['config'], dict) else checkpoint['config'].to_dict() + self.parent_run_id = checkpoint["run_id"] + self.global_training_step = checkpoint["global_training_step"] + self.current_epoch = checkpoint["epoch"] + + config_dict = ( + checkpoint["config"] + if isinstance(checkpoint["config"], dict) + else checkpoint["config"].to_dict() + ) - if not self.args.get('--skip_restore_config'): + if not self.args.get("--skip_restore_config"): # maybe zero out dropout attributes - if self.args['--transfer'] is not None and self.args['--transfer_mode'] == 'frozen': + if ( + self.args["--transfer"] is not None + and self.args["--transfer_mode"] == "frozen" + ): for key, value in config_dict.items(): - if 'dropout' in key: + if "dropout" in key: config_dict[key] = 0.0 - print(f"*Restoring Config* Setting {key} from {value} to 0.0 while restoring config from checkpoint for transfer.") - config = getattr(configs, config_dict['name']).from_dict(config_dict) + print( + f"*Restoring Config* Setting {key} from {value} to 0.0 while restoring config from checkpoint for transfer." + ) + config = getattr(configs, config_dict["name"]).from_dict(config_dict) self.config = config - print(f'*RESTORED* self.config = {config.name} from checkpoint {str(path)}.') + print( + f"*RESTORED* self.config = {config.name} from checkpoint {str(path)}." + ) else: - print(f'Skipped restoring self.config from checkpoint!') - assert self.args.get('--model') is not None, "Can only use --skip_restore_config if --model is given." + print(f"Skipped restoring self.config from checkpoint!") + assert ( + self.args.get("--model") is not None + ), "Can only use --skip_restore_config if --model is given." # initialize config from --model and compare to skipped config from restore. - _, Config = MODEL_CLASSES[self.args['--model']] + _, Config = MODEL_CLASSES[self.args["--model"]] self.config = Config.from_dict(self.parse_config_params(args)) self.config.check_equal(config_dict) - test_only = self.args.get('--test', False) - Model = getattr(modeling, checkpoint['model_name']) + test_only = self.args.get("--test", False) + Model = getattr(modeling, checkpoint["model_name"]) model = Model(self.config, test_only=test_only) - model.load_state_dict(checkpoint['model_state_dict']) - print(f'*RESTORED* model parameters from checkpoint {str(path)}.') - if not self.args.get('--test', None): # only restore opt if needed. opt should be None o/w. - model.opt.load_state_dict(checkpoint['optimizer_state_dict']) - print(f'*RESTORED* optimizer parameters from checkpoint as well.') + model.load_state_dict(checkpoint["model_state_dict"]) + print(f"*RESTORED* model parameters from checkpoint {str(path)}.") + if not self.args.get( + "--test", None + ): # only restore opt if needed. opt should be None o/w. 
+ model.opt.load_state_dict(checkpoint["optimizer_state_dict"]) + print(f"*RESTORED* optimizer parameters from checkpoint as well.") return model def transfer_model(self, transfer_model_class, mode): @@ -770,7 +980,7 @@ def transfer_model(self, transfer_model_class, mode): self.current_epoch = 1 # freeze layers - if mode == 'frozen': + if mode == "frozen": for param in self.model.parameters(): param.requires_grad = False @@ -780,52 +990,78 @@ def transfer_model(self, transfer_model_class, mode): self.config = Config.from_dict(params=params) # replace readout - if getattr(self.config, 'has_aux_input', False) and getattr(self.config, 'aux_use_better', False): + if getattr(self.config, "has_aux_input", False) and getattr( + self.config, "aux_use_better", False + ): self.model.readout = modeling.BetterAuxiliaryReadout(self.config) - elif getattr(self.config, 'has_aux_input', False): + elif getattr(self.config, "has_aux_input", False): self.model.readout = modeling.Readout(self.config) self.model.aux_readout = modeling.AuxiliaryReadout(self.config) else: - assert not getattr(self.config, 'aux_use_better', False), 'aux_use_better only with has_aux_input!' + assert not getattr( + self.config, "aux_use_better", False + ), "aux_use_better only with has_aux_input!" self.model.readout = modeling.Readout(self.config) # assign config to model self.model.config = self.config # re-setup model - test_only = self.args.get('--test', False) - assert not test_only, "Why transfer if you don't train? Here is not restoring a transferred model!!!" + test_only = self.args.get("--test", False) + assert ( + not test_only + ), "Why transfer if you don't train? Here is not restoring a transferred model!!!" self.model.setup(self.config, test_only) # print info print(self.model) - print(f"Number of trainable params in transferred model: {self.model.num_parameters()}") + print( + f"Number of trainable params in transferred model: {self.model.num_parameters()}" + ) -if __name__ == '__main__': +if __name__ == "__main__": args = docopt(__doc__) print(args) - assert not (args['--config'] and args['--config_json']), "Can't decide which config to use!" - if args.get('--model'): - assert args.get('--model') in MODEL_CLASSES, f'Unknown model.' - if args.get('--dataset'): - assert args.get('--dataset') in DATASET_CLASSES, f'Unknown dataset.' - - if not args['--kfold']: - learner = Learner(model=args['--model'], dataset=args['--dataset'], args=args) - learner.test() if args.get('--test') else learner.train() + assert not ( + args["--config"] and args["--config_json"] + ), "Can't decide which config to use!" + if args.get("--model"): + assert args.get("--model") in MODEL_CLASSES, f"Unknown model." + if args.get("--dataset"): + assert args.get("--dataset") in DATASET_CLASSES, f"Unknown dataset." 
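# Aside: a minimal, self-contained sketch of the frozen-transfer pattern in
# transfer_model above: freeze every pretrained parameter, then swap in a
# fresh readout head whose new parameters are trainable by default:

import torch.nn as nn

class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(8, 8)   # stands in for the GNN encoder
        self.readout = nn.Linear(8, 2)   # stands in for the readout head

model = TinyModel()
for param in model.parameters():
    param.requires_grad = False          # freeze pretrained weights
model.readout = nn.Linear(8, 4)          # new head: requires_grad=True again
print([n for n, p in model.named_parameters() if p.requires_grad])
# -> ['readout.weight', 'readout.bias']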
+ + if not args["--kfold"]: + learner = Learner(model=args["--model"], dataset=args["--dataset"], args=args) + learner.test() if args.get("--test") else learner.train() else: # kfold - if args['--dataset'] in ['devmap_amd', 'devmap_nvidia']: num_splits = 10 - elif args['--dataset'] in ['threadcoarsening_Cypress', 'threadcoarsening_Kepler', 'threadcoarsening_Fermi', 'threadcoarsening_Tahiti']: num_splits = 17 - elif args['--dataset'] in ['branch_prediction']: num_splits = 10 - else: raise NotImplementedError("kfold not implemented for this dataset.") + if args["--dataset"] in ["devmap_amd", "devmap_nvidia"]: + num_splits = 10 + elif args["--dataset"] in [ + "threadcoarsening_Cypress", + "threadcoarsening_Kepler", + "threadcoarsening_Fermi", + "threadcoarsening_Tahiti", + ]: + num_splits = 17 + elif args["--dataset"] in ["branch_prediction"]: + num_splits = 10 + else: + raise NotImplementedError("kfold not implemented for this dataset.") for split in range(num_splits): print(f"#######################################") print(f"CURRENT SPLIT: {split} + 1/{num_splits}") print(f"#######################################") - learner = Learner(model=args['--model'], dataset=args['--dataset'], args=args, current_kfold_split=split) + learner = Learner( + model=args["--model"], + dataset=args["--dataset"], + args=args, + current_kfold_split=split, + ) if len(learner.valid_data) == 0: - print('***'*20) - print(f'Validation Split is empty! Skipping split {split} + 1 / {num_splits}.') - print('***'*20) - learner.test() if args.get('--test') else learner.train() + print("***" * 20) + print( + f"Validation Split is empty! Skipping split {split} + 1 / {num_splits}." + ) + print("***" * 20) + learner.test() if args.get("--test") else learner.train() From 385b2be85f058c62098a406a57fbdb1603b765ad Mon Sep 17 00:00:00 2001 From: Chris Cummins Date: Thu, 27 Aug 2020 12:11:21 +0100 Subject: [PATCH 4/5] Remove superfluous gitignore. We write logs into the dataset directory, not in the source tree. 
github.com/ChrisCummins/ProGraML/issues/81 --- programl/task/graph_level_classification/.gitignore | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 programl/task/graph_level_classification/.gitignore diff --git a/programl/task/graph_level_classification/.gitignore b/programl/task/graph_level_classification/.gitignore deleted file mode 100644 index 9aa91af40..000000000 --- a/programl/task/graph_level_classification/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# don't track logs folder -logs/ \ No newline at end of file From 2876ad5cf0cba2ab52c8b34fafa68a0af50c9d0d Mon Sep 17 00:00:00 2001 From: Zacharias Fisches Date: Thu, 27 Aug 2020 17:49:13 +0200 Subject: [PATCH 5/5] example run command --- programl/task/graph_level_classification/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/programl/task/graph_level_classification/README.md b/programl/task/graph_level_classification/README.md index 2c5d804e3..5bdd6d7e5 100644 --- a/programl/task/graph_level_classification/README.md +++ b/programl/task/graph_level_classification/README.md @@ -39,7 +39,9 @@ Options: ``` Therefore, an exemplary command could look like this: ``` -[Example Run command] +Reproduce the Transformer result for the rebuttal: + +python run.py --model transformer_poj104 --dataset poj104 --data_dir ~/rebuttal_datasets/classifyapp/ --log_dir logs/classifyapp_logs/rebuttal_transformer_poj104/ --config_json="{'train_subset': [0, 100], 'batch_size': 48, 'max_num_nodes': 40000, 'num_epochs': 70, 'vocab_size': 2231, 'message_weight_sharing': 2, 'update_weight_sharing': 2, 'lr': 1e-4, 'gnn_layers': 10}" ``` NB: You can pass a double quoted string of config options in json format, except that you may use single quotes (they will be parsed as double quotes to transform this almost-json format into valid json)
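For reference, the single-quoted almost-JSON accepted by `--config_json` is normalized by `parse_config_params` in `run.py` (see the patch above) before being handed to `json.loads`. A minimal sketch of that round trip, using an illustrative config string:
```
import json

config_string = "{'train_subset': [0, 100], 'batch_size': 48, 'lr': 1e-4}"
config_string = (
    config_string.replace("\\'", "'")
    .replace("'", '"')
    .replace("True", "true")
    .replace("False", "false")
)
params = json.loads(config_string)
print(params["batch_size"])  # 48
```
As the comment in `parse_config_params` notes, this only works because the config strings are simple enough that no quotes, `True`, or `False` occur inside string values.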