|
13 | 13 | import pickle |
14 | 14 | from abc import ABC |
15 | 15 | from collections import OrderedDict |
16 | | -from typing import Any, Dict, Generator, List, Optional, Tuple |
| 16 | +from typing import Any, Dict, Generator, List, Optional, Tuple, Union |
17 | 17 |
|
18 | 18 | import fastobo |
19 | 19 | import networkx as nx |
@@ -244,16 +244,26 @@ def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph: |
244 | 244 | with open(data_path, encoding="utf-8") as chebi: |
245 | 245 | chebi = "\n".join(l for l in chebi if not l.startswith("xref:")) |
246 | 246 |
|
247 | | - elements = [ |
248 | | - term_callback(clause) |
249 | | - for clause in fastobo.loads(chebi) |
250 | | - if clause and ":" in str(clause.id) |
251 | | - ] |
| 247 | + elements = [] |
| 248 | + for term_doc in fastobo.loads(chebi): |
| 249 | + if ( |
| 250 | + term_doc |
| 251 | + and isinstance(term_doc.id, fastobo.id.PrefixedIdent) |
| 252 | + and term_doc.id.prefix == "CHEBI" |
| 253 | + ): |
| 254 | + term_dict = term_callback(term_doc) |
| 255 | + if term_dict: |
| 256 | + elements.append(term_dict) |
252 | 257 |
|
253 | 258 | g = nx.DiGraph() |
254 | 259 | for n in elements: |
255 | 260 | g.add_node(n["id"], **n) |
256 | | - g.add_edges_from([(p, q["id"]) for q in elements for p in q["parents"]]) |
| 261 | + |
| 262 | + # Only add edges that connect existing nodes, so that no nodes are implicitly created for obsolete terms |
| 263 | + # https://github.com/ChEB-AI/python-chebai/pull/55#issuecomment-2386654142 |
| 264 | + g.add_edges_from( |
| 265 | + [(p, q["id"]) for q in elements for p in q["parents"] if g.has_node(p)] |
| 266 | + ) |
257 | 267 |
|
258 | 268 | print("Compute transitive closure") |
259 | 269 | return nx.transitive_closure_dag(g) |
@@ -812,7 +822,7 @@ def chebi_to_int(s: str) -> int: |
812 | 822 | return int(s[s.index(":") + 1 :]) |
813 | 823 |
|
814 | 824 |
|
815 | | -def term_callback(doc) -> dict: |
| 825 | +def term_callback(doc: fastobo.term.TermFrame) -> Union[Dict, bool]: |
816 | 826 | """ |
817 | 827 | Extracts information from a ChEBI term document. |
818 | 828 | This function takes a ChEBI term document as input and extracts relevant information such as the term ID, parents, |
@@ -852,6 +862,12 @@ def term_callback(doc) -> dict: |
852 | 862 | parents.append(chebi_to_int(str(clause.term))) |
853 | 863 | elif isinstance(clause, fastobo.term.NameClause): |
854 | 864 | name = str(clause.name) |
| 865 | + |
| 866 | + if isinstance(clause, fastobo.term.IsObsoleteClause): |
| 867 | + if clause.obsolete: |
| 868 | + # If the term document has an obsolete clause set to true, skip this document. |
| 869 | + return False |
| 870 | + |
855 | 871 | return { |
856 | 872 | "id": chebi_to_int(str(doc.id)), |
857 | 873 | "parents": parents, |
|
0 commit comments