Skip to content

Commit 0e7cef7

Browse files
committed
Optimize split_dataframe_by_prefix
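The previous implementation of the function iterated over the entire MappingSetDataFrame many more times than necessary: it filtered the whole data frame once for every combination of subject prefix, object prefix, and relation. This change groups the mappings in a single pass over the data frame instead. All logging and output remain the same.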
1 parent 3721439 commit 0e7cef7

File tree

1 file changed

+58
-25
lines changed

1 file changed

+58
-25
lines changed

src/sssom/parsers.py

Lines changed: 58 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,22 @@
77
import logging as _logging
88
import os.path
99
import typing
10-
from collections import ChainMap, Counter
10+
from collections import ChainMap, Counter, defaultdict
1111
from pathlib import Path
12-
from typing import Any, Callable, Dict, Iterable, List, Optional, TextIO, Tuple, Union, cast
12+
from typing import (
13+
Any,
14+
Callable,
15+
DefaultDict,
16+
Dict,
17+
Iterable,
18+
List,
19+
NamedTuple,
20+
Optional,
21+
TextIO,
22+
Tuple,
23+
Union,
24+
cast,
25+
)
1326
from xml.dom import Node, minidom
1427
from xml.dom.minidom import Document
1528

@@ -18,7 +31,7 @@
1831
import pandas as pd
1932
import requests
2033
import yaml
21-
from curies import Converter
34+
from curies import Converter, ReferenceTuple
2235
from linkml_runtime.loaders.json_loader import JSONLoader
2336
from linkml_runtime.loaders.rdflib_loader import RDFLibLoader
2437
from pandas.errors import EmptyDataError
@@ -977,9 +990,12 @@ def split_dataframe(
977990
)
978991

979992

980-
def _get_split_key(subject_prefix: str, relation_luid: str, object_prefix: str) -> str:
981-
split = f"{subject_prefix.lower()}_{relation_luid.lower()}_{object_prefix.lower()}"
982-
return split
993+
class SSSOMSplitGroup(NamedTuple):
994+
"""The key of a group of mappings in a split MappingSetDataFrame."""
995+
996+
subject_prefix: str
997+
object_prefix: str
998+
relation_tup: ReferenceTuple
983999

9841000

9851001
def split_dataframe_by_prefix(
@@ -996,34 +1012,51 @@ def split_dataframe_by_prefix(
9961012
:param relations: a list of relations of interest
9971013
:return: a dict of SSSOM data frame names to MappingSetDataFrame
9981014
"""
999-
df = msdf.df
10001015
meta = msdf.metadata
10011016
split_to_msdf: Dict[str, MappingSetDataFrame] = {}
1002-
for subject_prefix, object_prefix, relation in itt.product(
1003-
subject_prefixes, object_prefixes, relations
1004-
):
1005-
relation_prefix, relation_id = relation.split(":")
1006-
split = _get_split_key(subject_prefix, relation_id, object_prefix)
1007-
if subject_prefix not in msdf.converter.bimap:
1008-
logging.warning(f"{split} - missing subject prefix - {subject_prefix}")
1017+
mappings_by_group: DefaultDict[SSSOMSplitGroup, List[object]] = defaultdict(list)
1018+
parse_curie = msdf.converter.parse_curie
1019+
1020+
expected_split_groups = [
1021+
SSSOMSplitGroup(
1022+
subject_prefix,
1023+
object_prefix,
1024+
parse_curie(relation, strict=True),
1025+
)
1026+
for subject_prefix, relation, object_prefix in itt.product(
1027+
subject_prefixes, relations, object_prefixes
1028+
)
1029+
]
1030+
1031+
for mapping in msdf.df.itertuples(index=False):
1032+
group = SSSOMSplitGroup(
1033+
parse_curie(getattr(mapping, SUBJECT_ID), strict=True).prefix,
1034+
parse_curie(getattr(mapping, OBJECT_ID), strict=True).prefix,
1035+
parse_curie(getattr(mapping, PREDICATE_ID), strict=True),
1036+
)
1037+
mappings_by_group[group].append(mapping)
1038+
1039+
for group in expected_split_groups:
1040+
split = f"{group.subject_prefix.lower()}_{group.relation_tup.identifier.lower()}_{group.object_prefix.lower()}"
1041+
mappings = mappings_by_group.get(group, None)
1042+
1043+
if group.subject_prefix not in msdf.converter.bimap:
1044+
logging.warning(f"{split} - missing subject prefix - {group.subject_prefix}")
10091045
continue
1010-
if object_prefix not in msdf.converter.bimap:
1011-
logging.warning(f"{split} - missing object prefix - {object_prefix}")
1046+
elif group.object_prefix not in msdf.converter.bimap:
1047+
logging.warning(f"{split} - missing object prefix - {group.object_prefix}")
10121048
continue
1013-
df_subset = df[
1014-
(df[SUBJECT_ID].str.startswith(subject_prefix + ":"))
1015-
& (df[PREDICATE_ID] == relation)
1016-
& (df[OBJECT_ID].str.startswith(object_prefix + ":"))
1017-
]
1018-
if 0 == len(df_subset):
1019-
logging.debug(f"No matches ({len(df_subset)} matches found)")
1049+
elif mappings is None:
1050+
logging.debug(f"{split} - No matches matches found")
10201051
continue
1052+
10211053
subconverter = msdf.converter.get_subconverter(
1022-
[subject_prefix, object_prefix, relation_prefix]
1054+
[group.subject_prefix, group.object_prefix, group.relation_tup.prefix]
10231055
)
10241056
split_to_msdf[split] = from_sssom_dataframe(
1025-
df_subset, prefix_map=dict(subconverter.bimap), meta=meta
1057+
pd.DataFrame(mappings), prefix_map=dict(subconverter.bimap), meta=meta
10261058
)
1059+
10271060
return split_to_msdf
10281061

10291062

0 commit comments

Comments
 (0)