Skip to content

Commit e17a9c0

Browse files
authored
Merge pull request #55 from ChEB-AI/refactor_term_callback
Refactor Chebi Term Callback
2 parents bf9e642 + 582b528 commit e17a9c0

File tree

1 file changed

+24
-8
lines changed

1 file changed

+24
-8
lines changed

chebai/preprocessing/datasets/chebi.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import pickle
1414
from abc import ABC
1515
from collections import OrderedDict
16-
from typing import Any, Dict, Generator, List, Optional, Tuple
16+
from typing import Any, Dict, Generator, List, Optional, Tuple, Union
1717

1818
import fastobo
1919
import networkx as nx
@@ -244,16 +244,26 @@ def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph:
244244
with open(data_path, encoding="utf-8") as chebi:
245245
chebi = "\n".join(l for l in chebi if not l.startswith("xref:"))
246246

247-
elements = [
248-
term_callback(clause)
249-
for clause in fastobo.loads(chebi)
250-
if clause and ":" in str(clause.id)
251-
]
247+
elements = []
248+
for term_doc in fastobo.loads(chebi):
249+
if (
250+
term_doc
251+
and isinstance(term_doc.id, fastobo.id.PrefixedIdent)
252+
and term_doc.id.prefix == "CHEBI"
253+
):
254+
term_dict = term_callback(term_doc)
255+
if term_dict:
256+
elements.append(term_dict)
252257

253258
g = nx.DiGraph()
254259
for n in elements:
255260
g.add_node(n["id"], **n)
256-
g.add_edges_from([(p, q["id"]) for q in elements for p in q["parents"]])
261+
262+
# Only add edges whose parent node already exists in the graph; otherwise the graph would implicitly create nodes for obsolete terms
263+
# https://github.com/ChEB-AI/python-chebai/pull/55#issuecomment-2386654142
264+
g.add_edges_from(
265+
[(p, q["id"]) for q in elements for p in q["parents"] if g.has_node(p)]
266+
)
257267

258268
print("Compute transitive closure")
259269
return nx.transitive_closure_dag(g)
@@ -812,7 +822,7 @@ def chebi_to_int(s: str) -> int:
812822
return int(s[s.index(":") + 1 :])
813823

814824

815-
def term_callback(doc) -> dict:
825+
def term_callback(doc: fastobo.term.TermFrame) -> Union[Dict, bool]:
816826
"""
817827
Extracts information from a ChEBI term document.
818828
This function takes a ChEBI term document as input and extracts relevant information such as the term ID, parents,
@@ -852,6 +862,12 @@ def term_callback(doc) -> dict:
852862
parents.append(chebi_to_int(str(clause.term)))
853863
elif isinstance(clause, fastobo.term.NameClause):
854864
name = str(clause.name)
865+
866+
if isinstance(clause, fastobo.term.IsObsoleteClause):
867+
if clause.obsolete:
868+
# If the term document has an is_obsolete clause set to true, skip this document.
869+
return False
870+
855871
return {
856872
"id": chebi_to_int(str(doc.id)),
857873
"parents": parents,

0 commit comments

Comments
 (0)