|
73 | 73 | SUBJECT_LABEL, |
74 | 74 | SUBJECT_SOURCE, |
75 | 75 | ) |
76 | | -from .context import SSSOM_URI_PREFIX, get_default_metadata, get_jsonld_context |
| 76 | +from .context import ( |
| 77 | + SSSOM_BUILT_IN_PREFIXES, |
| 78 | + SSSOM_URI_PREFIX, |
| 79 | + get_default_metadata, |
| 80 | + get_jsonld_context, |
| 81 | +) |
77 | 82 | from .sssom_document import MappingSetDocument |
78 | 83 | from .typehints import Metadata, MetadataType, PrefixMap |
79 | 84 |
|
@@ -145,15 +150,22 @@ def __str__(self) -> str: # noqa:D105 |
145 | 150 |
|
146 | 151 | def clean_prefix_map(self) -> None: |
147 | 152 | """Remove unused prefixes from the internal prefix map based on the internal dataframe.""" |
148 | | - prefixes_in_map = get_prefixes_used_in_table(self.df) |
| 153 | + all_prefixes = [] |
| 154 | + prefixes_in_table = get_prefixes_used_in_table(self.df) |
| 155 | + if self.metadata: |
| 156 | + prefixes_in_metadata = get_prefixes_used_in_metadata(self.metadata) |
| 157 | + all_prefixes = list(set(prefixes_in_table + prefixes_in_metadata)) |
| 158 | + else: |
| 159 | + all_prefixes = prefixes_in_table |
| 160 | + |
149 | 161 | new_prefixes: PrefixMap = dict() |
150 | 162 | missing_prefixes = [] |
151 | | - for prefix in prefixes_in_map: |
| 163 | + for prefix in all_prefixes: |
152 | 164 | if prefix in self.prefix_map: |
153 | 165 | new_prefixes[prefix] = self.prefix_map[prefix] |
154 | 166 | else: |
155 | 167 | logging.warning( |
156 | | - f"{prefix} is used in the data frame but does not exist in prefix map" |
| 168 | + f"{prefix} is used in the SSSOM mapping set but it does not exist in the prefix map" |
157 | 169 | ) |
158 | 170 | missing_prefixes.append(prefix) |
159 | 171 | if missing_prefixes: |
@@ -1086,11 +1098,34 @@ def curie_from_uri(uri: str, prefix_map: Mapping[str, str]) -> str: |
1086 | 1098 |
|
def get_prefixes_used_in_table(df: pd.DataFrame) -> List[str]:
    """Get a list of prefixes used in CURIEs in the entity reference columns of a dataframe.

    The SSSOM built-in prefixes are always part of the result, even when the
    dataframe is empty.

    :param df: Dataframe whose entity reference columns are scanned for CURIEs.
    :return: De-duplicated list of prefixes, in no particular order.
    """
    # Start from a fresh set: binding the module-level SSSOM_BUILT_IN_PREFIXES
    # directly and appending to it would grow that shared list on every call.
    prefixes = set(SSSOM_BUILT_IN_PREFIXES)
    if not df.empty:
        for col in ENTITY_REFERENCE_SLOTS:
            # Not every entity reference slot has to be present as a column.
            if col not in df.columns:
                continue
            for value in df[col].values:
                prefix = get_prefix_from_curie(str(value))
                # Skip values that yield no prefix (empty string or None).
                if prefix:
                    prefixes.add(prefix)
    return list(prefixes)
| 1110 | + |
| 1111 | + |
def get_prefixes_used_in_metadata(meta: MetadataType) -> List[str]:
    """Get a list of prefixes used in CURIEs in the metadata.

    The SSSOM built-in prefixes are always part of the result, even when the
    metadata is empty.

    :param meta: Metadata mapping; each value is either a single value or a
        list of values, and every value is inspected as a potential CURIE.
    :return: De-duplicated list of prefixes, in no particular order.
    """
    # Start from a fresh set: binding the module-level SSSOM_BUILT_IN_PREFIXES
    # directly and extending it would grow that shared list on every call.
    prefixes = set(SSSOM_BUILT_IN_PREFIXES)
    if meta:
        for value in meta.values():
            # Treat scalar and list-valued entries uniformly.
            candidates = value if isinstance(value, list) else [value]
            for candidate in candidates:
                # Coerce to str so non-string metadata values (numbers, dates)
                # are handled the same way in lists as they are as scalars.
                prefix = get_prefix_from_curie(str(candidate))
                # Skip values that yield no prefix (empty string or None).
                if prefix:
                    prefixes.add(prefix)
    return list(prefixes)
1095 | 1130 |
|
1096 | 1131 |
|
@@ -1374,6 +1409,7 @@ def get_all_prefixes(msdf: MappingSetDataFrame) -> list: |
1374 | 1409 | [ |
1375 | 1410 | get_prefix_from_curie(s) |
1376 | 1411 | for s in list(set(msdf.df[slot].to_list())) # type: ignore |
| 1412 | + if get_prefix_from_curie(s) != "" |
1377 | 1413 | ] |
1378 | 1414 | ) |
1379 | 1415 | ) |
|
0 commit comments