Merge pull request #54 from ChEB-AI/refactor_chebiOverXPartial

sfluegel05 · web-flow · commit bf9e642918f9 · 2024-10-01T13:31:57.000+02:00
Refactor ChEBIOverXPartial
diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py
@@ -14,6 +14,7 @@
 )
 from lightning.pytorch.core.datamodule import LightningDataModule
 from lightning_utilities.core.rank_zero import rank_zero_info
+from sklearn.model_selection import StratifiedShuffleSplit
 from torch.utils.data import DataLoader
 
 from chebai.preprocessing import reader as dr
@@ -929,11 +930,17 @@ def get_test_split(
         labels_list = df["labels"].tolist()
 
         test_size = 1 - self.train_split - (1 - self.train_split) ** 2
-        msss = MultilabelStratifiedShuffleSplit(
-            n_splits=1, test_size=test_size, random_state=seed
-        )
 
-        train_indices, test_indices = next(msss.split(labels_list, labels_list))
+        if len(labels_list[0]) > 1:
+            splitter = MultilabelStratifiedShuffleSplit(
+                n_splits=1, test_size=test_size, random_state=seed
+            )
+        else:
+            splitter = StratifiedShuffleSplit(
+                n_splits=1, test_size=test_size, random_state=seed
+            )
+
+        train_indices, test_indices = next(splitter.split(labels_list, labels_list))
 
         df_train = df.iloc[train_indices]
         df_test = df.iloc[test_indices]
@@ -985,12 +992,18 @@ def get_train_val_splits_given_test(
 
         # scale val set size by 1/self.train_split to compensate for (hypothetical) test set size (1-self.train_split)
         test_size = ((1 - self.train_split) ** 2) / self.train_split
-        msss = MultilabelStratifiedShuffleSplit(
-            n_splits=1, test_size=test_size, random_state=seed
-        )
+
+        if len(labels_list_trainval[0]) > 1:
+            splitter = MultilabelStratifiedShuffleSplit(
+                n_splits=1, test_size=test_size, random_state=seed
+            )
+        else:
+            splitter = StratifiedShuffleSplit(
+                n_splits=1, test_size=test_size, random_state=seed
+            )
 
         train_indices, validation_indices = next(
-            msss.split(labels_list_trainval, labels_list_trainval)
+            splitter.split(labels_list_trainval, labels_list_trainval)
         )
 
         df_validation = df_trainval.iloc[validation_indices]
diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
@@ -736,6 +736,9 @@ def __init__(self, top_class_id: int, **kwargs):
             top_class_id (int): The ID of the top class from which to extract subclasses.
             **kwargs: Additional keyword arguments passed to the superclass initializer.
         """
+        if "top_class_id" not in kwargs:
+            kwargs["top_class_id"] = top_class_id
+
         self.top_class_id: int = top_class_id
         super().__init__(**kwargs)
 
@@ -758,27 +761,18 @@ def _extract_class_hierarchy(self, chebi_path: str) -> nx.DiGraph:
         """
         Extracts a subset of ChEBI based on subclasses of the top class ID.
 
+        This method calls the superclass method to extract the full class hierarchy, then keeps
+        the top class and its successors (all descendants if that graph is transitively closed).
+
         Args:
             chebi_path (str): The file path to the ChEBI ontology file.
 
         Returns:
-            nx.DiGraph: The extracted class hierarchy as a directed graph.
+            nx.DiGraph: The extracted class hierarchy as a directed graph, limited to the
+            top class ID and its descendants.
         """
-        with open(chebi_path, encoding="utf-8") as chebi:
-            chebi = "\n".join(l for l in chebi if not l.startswith("xref:"))
-        elements = [
-            term_callback(clause)
-            for clause in fastobo.loads(chebi)
-            if clause and ":" in str(clause.id)
-        ]
-        g = nx.DiGraph()
-        for n in elements:
-            g.add_node(n["id"], **n)
-        g.add_edges_from([(p, q["id"]) for q in elements for p in q["parents"]])
-
-        g = nx.transitive_closure_dag(g)
-        g = g.subgraph(list(nx.descendants(g, self.top_class_id)) + [self.top_class_id])
-        print("Compute transitive closure")
+        g = super()._extract_class_hierarchy(chebi_path)
+        g = g.subgraph(list(g.successors(self.top_class_id)) + [self.top_class_id])
         return g