Skip to content

Commit 07f05cd

Browse files
authored
Fix msdf_merge util function. (#627)
The `msdf_merge` function attempts to "propagate" slots before merging the sets, but is doing so without any regard for which slots should actually be propagated. In addition, it attempts to inject in every individual mapping a `mapping_set_source` slot pointing to the ID of the original set that contained the mapping, but this is invalid as there is _no_ `mapping_set_source` slot on indivdual mapping records -- the slot intended to capture the set from which a record came from is `mapping_source`. Lastly, the function also attempts to drop duplicates after the sets have been merged, but the detection of duplicates is prevented by (1) the incorrect propagation of non-propagatable slots (which can cause two otherwise identical records in two different sets to appear different, if the metadata of the sets contain different wrongly propagated slots), and (2) the injection of the `mapping_set_source` slot. This commit fixes all those issues by deleting the bogus `inject_metadata_into_df` function and replacing by a call to `msdf.propagate()`, which implements propagation correctly. It then manually inject the correct `mapping_source` slot if possible, and if so ignore the injected slot when attempting to drop duplicates.
1 parent 76165d5 commit 07f05cd

File tree

4 files changed

+26
-46
lines changed

4 files changed

+26
-46
lines changed

src/sssom/util.py

Lines changed: 24 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
MAPPING_CARDINALITY,
4848
MAPPING_JUSTIFICATION,
4949
MAPPING_SET_ID,
50-
MAPPING_SET_SOURCE,
50+
MAPPING_SOURCE,
5151
NO_TERM_FOUND,
5252
OBJECT_CATEGORY,
5353
OBJECT_ID,
@@ -1118,25 +1118,35 @@ def merge_msdf(
11181118
:param reconcile: If reconcile=True, then dedupe(remove redundant lower confidence mappings) and
11191119
reconcile (if msdf contains a higher confidence _negative_ mapping, then remove lower
11201120
confidence positive one. If confidence is the same, prefer HumanCurated. If both
1121-
HumanCurated, prefer negative mapping). Defaults to True.
1121+
HumanCurated, prefer negative mapping). Defaults to False.
11221122
11231123
:returns: Merged MappingSetDataFrame.
11241124
"""
1125-
# Inject metadata of msdf into df
1126-
msdf_with_meta = [inject_metadata_into_df(msdf) for msdf in msdfs]
1127-
1128-
# merge df [# 'outer' join in pandas == FULL JOIN in SQL]
1129-
# df_merged = reduce(
1130-
# lambda left, right: left.merge(right, how="outer", on=list(left.columns)),
1131-
# [msdf.df for msdf in msdf_with_meta],
1132-
# )
1133-
# Concat is an alternative to merge when columns are not the same.
1125+
# Propagate slots, inject source if possible
1126+
source_injected = 0
1127+
for msdf in msdfs:
1128+
msdf.propagate()
1129+
if MAPPING_SET_ID in msdf.metadata and MAPPING_SOURCE not in msdf.df.columns:
1130+
msdf.df[MAPPING_SOURCE] = msdf.metadata[MAPPING_SET_ID]
1131+
source_injected += 1
1132+
1133+
columns = set([c for msdf in msdfs for c in msdf.df.columns])
1134+
if source_injected > 1:
1135+
# If we injected a mapping_source slot into each individual
1136+
# record for at least two of the input sets, then we must ignore
1137+
# that slot when attempting to remove duplicates below, because
1138+
# the mere presence of that slot would cause two identical
1139+
# records to appear different just because they come from
1140+
# different sources (which they would not do if we had not
1141+
# injected the mapping_source above).
1142+
columns.remove(MAPPING_SOURCE)
1143+
11341144
df_merged = reduce(
11351145
lambda left, right: pd.concat([left, right], axis=0, ignore_index=True),
1136-
[msdf.df for msdf in msdf_with_meta],
1137-
).drop_duplicates(ignore_index=True)
1146+
[msdf.df for msdf in msdfs],
1147+
).drop_duplicates(ignore_index=True, subset=columns)
11381148

1139-
converter = curies.chain([msdf.converter for msdf in msdf_with_meta])
1149+
converter = curies.chain([msdf.converter for msdf in msdfs])
11401150
merged_msdf = MappingSetDataFrame.with_converter(df=df_merged, converter=converter)
11411151
if reconcile:
11421152
merged_msdf.df = filter_redundant_rows(merged_msdf.df)
@@ -1296,27 +1306,6 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
12961306
return return_df
12971307

12981308

1299-
def inject_metadata_into_df(msdf: MappingSetDataFrame) -> MappingSetDataFrame:
1300-
"""Inject metadata dictionary key-value pair into DataFrame columns in a MappingSetDataFrame.DataFrame.
1301-
1302-
:param msdf: MappingSetDataFrame with metadata separate.
1303-
1304-
:returns: MappingSetDataFrame with metadata as columns
1305-
"""
1306-
# TODO add this into the "standardize" function introduced in
1307-
# https://github.com/mapping-commons/sssom-py/pull/438
1308-
# TODO Check if 'k' is a valid 'slot' for 'mapping' [sssom.yaml]
1309-
slots = SSSOMSchemaView().mapping_slots
1310-
for k, v in msdf.metadata.items():
1311-
if k not in msdf.df.columns and k in slots:
1312-
if k == MAPPING_SET_ID:
1313-
k = MAPPING_SET_SOURCE
1314-
if isinstance(v, list):
1315-
v = "|".join(x for x in v)
1316-
msdf.df[k] = str(v)
1317-
return msdf
1318-
1319-
13201309
ExtensionLiteral = Literal["tsv", "csv"]
13211310

13221311

tests/test_merge.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def setUp(self) -> None:
2121
def test_merge_multiple_inputs(self) -> None:
2222
"""Test merging of multiple msdfs."""
2323
merged_msdf = merge_msdf(*self.msdfs)
24-
self.assertEqual(275, len(merged_msdf.df))
24+
self.assertEqual(200, len(merged_msdf.df))
2525

2626
def test_merge_single_input(self) -> None:
2727
"""Test merging when a single msdf is provided."""

tests/test_reconcile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def test_merge(self) -> None:
4141
msdf3 = parse_sssom_table(data_dir / "basic.tsv")
4242
merged_msdf1 = merge_msdf(self.msdf1, msdf3)
4343

44-
self.assertEqual(152, len(merged_msdf1.df))
44+
self.assertEqual(149, len(merged_msdf1.df))
4545

4646
merged_msdf2 = merge_msdf(self.msdf2, msdf3)
4747
self.assertEqual(174, len(merged_msdf2.df))

tests/test_utils.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
get_dict_from_mapping,
3939
get_file_extension,
4040
get_prefixes_used_in_table,
41-
inject_metadata_into_df,
4241
invert_mappings,
4342
is_multivalued_slot,
4443
)
@@ -213,14 +212,6 @@ def test_invert_asymmetric_nodes(self) -> None:
213212
inverted_object_labels = inverted_df["object_label"].values
214213
self.assertNotIn(False, original_subject_labels == inverted_object_labels)
215214

216-
def test_inject_metadata_into_df(self) -> None:
217-
"""Test injecting metadata into DataFrame is as expected."""
218-
expected_creators = "orcid:0000-0001-5839-2535|orcid:0000-0001-5839-2532"
219-
msdf = parse_sssom_table(f"{data_dir}/test_inject_metadata_msdf.tsv")
220-
msdf_with_meta = inject_metadata_into_df(msdf)
221-
creator_ids = msdf_with_meta.df["creator_id"].drop_duplicates().values.item()
222-
self.assertEqual(creator_ids, expected_creators)
223-
224215

225216
class TestUtils(unittest.TestCase):
226217
"""Unit test for utility functions."""

0 commit comments

Comments
 (0)