77import logging as _logging
88import os .path
99import typing
10- from collections import ChainMap , Counter
10+ from collections import ChainMap , Counter , defaultdict
1111from pathlib import Path
12- from typing import Any , Callable , Dict , Iterable , List , Optional , TextIO , Tuple , Union , cast
12+ from typing import (
13+ Any ,
14+ Callable ,
15+ DefaultDict ,
16+ Dict ,
17+ Iterable ,
18+ List ,
19+ NamedTuple ,
20+ Optional ,
21+ TextIO ,
22+ Tuple ,
23+ Union ,
24+ cast ,
25+ )
1326from xml .dom import Node , minidom
1427from xml .dom .minidom import Document
1528
1831import pandas as pd
1932import requests
2033import yaml
21- from curies import Converter
34+ from curies import Converter , ReferenceTuple
2235from linkml_runtime .loaders .json_loader import JSONLoader
2336from linkml_runtime .loaders .rdflib_loader import RDFLibLoader
2437from pandas .errors import EmptyDataError
@@ -977,9 +990,12 @@ def split_dataframe(
977990 )
978991
979992
980- def _get_split_key (subject_prefix : str , relation_luid : str , object_prefix : str ) -> str :
981- split = f"{ subject_prefix .lower ()} _{ relation_luid .lower ()} _{ object_prefix .lower ()} "
982- return split
993+ class SSSOMSplitGroup (NamedTuple ):
994+ """The key of a group of mappings in a split MappingSetDataFrame."""
995+
996+ subject_prefix : str  # CURIE prefix on the subject side of the group
997+ object_prefix : str  # CURIE prefix on the object side of the group
998+ relation_tup : ReferenceTuple  # parsed predicate CURIE; its .prefix and .identifier are read when building the split
983999
9841000
9851001def split_dataframe_by_prefix (
@@ -996,34 +1012,51 @@ def split_dataframe_by_prefix(
9961012 :param relations: a list of relations of interest
9971013 :return: a dict of SSSOM data frame names to MappingSetDataFrame
9981014 """
999- df = msdf .df
10001015 meta = msdf .metadata
10011016 split_to_msdf : Dict [str , MappingSetDataFrame ] = {}
1002- for subject_prefix , object_prefix , relation in itt .product (
1003- subject_prefixes , object_prefixes , relations
1004- ):
1005- relation_prefix , relation_id = relation .split (":" )
1006- split = _get_split_key (subject_prefix , relation_id , object_prefix )
1007- if subject_prefix not in msdf .converter .bimap :
1008- logging .warning (f"{ split } - missing subject prefix - { subject_prefix } " )
1017+ mappings_by_group : DefaultDict [SSSOMSplitGroup , List [object ]] = defaultdict (list )
1018+ parse_curie = msdf .converter .parse_curie
1019+
1020+ expected_split_groups = [
1021+ SSSOMSplitGroup (
1022+ subject_prefix ,
1023+ object_prefix ,
1024+ parse_curie (relation , strict = True ),
1025+ )
1026+ for subject_prefix , relation , object_prefix in itt .product (
1027+ subject_prefixes , relations , object_prefixes
1028+ )
1029+ ]
1030+
1031+ for mapping in msdf .df .itertuples (index = False ):
1032+ group = SSSOMSplitGroup (
1033+ parse_curie (getattr (mapping , SUBJECT_ID ), strict = True ).prefix ,
1034+ parse_curie (getattr (mapping , OBJECT_ID ), strict = True ).prefix ,
1035+ parse_curie (getattr (mapping , PREDICATE_ID ), strict = True ),
1036+ )
1037+ mappings_by_group [group ].append (mapping )
1038+
1039+ for group in expected_split_groups :
1040+ split = f"{ group .subject_prefix .lower ()} _{ group .relation_tup .identifier .lower ()} _{ group .object_prefix .lower ()} "
1041+ mappings = mappings_by_group .get (group , None )
1042+
1043+ if group .subject_prefix not in msdf .converter .bimap :
1044+ logging .warning (f"{ split } - missing subject prefix - { group .subject_prefix } " )
10091045 continue
1010- if object_prefix not in msdf .converter .bimap :
1011- logging .warning (f"{ split } - missing object prefix - { object_prefix } " )
1046+ elif group . object_prefix not in msdf .converter .bimap :
1047+ logging .warning (f"{ split } - missing object prefix - { group . object_prefix } " )
10121048 continue
1013- df_subset = df [
1014- (df [SUBJECT_ID ].str .startswith (subject_prefix + ":" ))
1015- & (df [PREDICATE_ID ] == relation )
1016- & (df [OBJECT_ID ].str .startswith (object_prefix + ":" ))
1017- ]
1018- if 0 == len (df_subset ):
1019- logging .debug (f"No matches ({ len (df_subset )} matches found)" )
1049+ elif mappings is None :
1050+ logging .debug (f"{ split } - No matches matches found" )
10201051 continue
1052+
10211053 subconverter = msdf .converter .get_subconverter (
1022- [subject_prefix , object_prefix , relation_prefix ]
1054+ [group . subject_prefix , group . object_prefix , group . relation_tup . prefix ]
10231055 )
10241056 split_to_msdf [split ] = from_sssom_dataframe (
1025- df_subset , prefix_map = dict (subconverter .bimap ), meta = meta
1057+ pd . DataFrame ( mappings ) , prefix_map = dict (subconverter .bimap ), meta = meta
10261058 )
1059+
10271060 return split_to_msdf
10281061
10291062
0 commit comments