50 | 50 | OBJECT_ID, |
51 | 51 | OBJECT_LABEL, |
52 | 52 | OBJECT_SOURCE, |
| 53 | + OBO_HAS_DB_XREF, |
| 54 | + OWL_DIFFERENT_FROM, |
| 55 | + OWL_EQUIVALENT_CLASS, |
53 | 56 | PREDICATE_ID, |
| 57 | + PREDICATE_LIST, |
54 | 58 | PREDICATE_MODIFIER, |
55 | 59 | PREDICATE_MODIFIER_NOT, |
56 | 60 | PREFIX_MAP_MODES, |
| 61 | + RDFS_SUBCLASS_OF, |
57 | 62 | SCHEMA_DICT, |
58 | 63 | SCHEMA_YAML, |
59 | 64 | SEMAPV, |
| 65 | + SKOS_BROAD_MATCH, |
| 66 | + SKOS_CLOSE_MATCH, |
| 67 | + SKOS_EXACT_MATCH, |
| 68 | + SKOS_NARROW_MATCH, |
| 69 | + SKOS_RELATED_MATCH, |
| 70 | + SSSOM_SUPERCLASS_OF, |
60 | 71 | SUBJECT_CATEGORY, |
61 | 72 | SUBJECT_ID, |
62 | 73 | SUBJECT_LABEL, |
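The new imports above pull the predicate CURIEs and the ranked PREDICATE_LIST out of sssom.constants, so the code further down can compare against named constants instead of repeating string literals. As rough orientation only, here is a sketch of what these constants stand for, derived from the hierarchy documented in get_row_based_on_hierarchy below; the authoritative values and ordering live in sssom.constants and may differ:

```python
# Illustrative sketch only -- the real definitions are in sssom.constants.
OWL_EQUIVALENT_CLASS = "owl:equivalentClass"
RDFS_SUBCLASS_OF = "rdfs:subClassOf"
SSSOM_SUPERCLASS_OF = "sssom:superClassOf"  # assumed CURIE for the superclass direction
OWL_DIFFERENT_FROM = "owl:differentFrom"
SKOS_EXACT_MATCH = "skos:exactMatch"
SKOS_CLOSE_MATCH = "skos:closeMatch"
SKOS_BROAD_MATCH = "skos:broadMatch"
SKOS_NARROW_MATCH = "skos:narrowMatch"
SKOS_RELATED_MATCH = "skos:relatedMatch"
OBO_HAS_DB_XREF = "oboInOwl:hasDbXref"

# Assumed ranking walked by get_row_based_on_hierarchy, strongest first,
# mirroring the hierarchy listed in its docstring below.
PREDICATE_LIST = [
    "owl:equivalentClass",
    "owl:equivalentProperty",
    "rdfs:subClassOf",
    "rdfs:subPropertyOf",
    "owl:sameAs",
    "skos:exactMatch",
    "skos:closeMatch",
    "skos:broadMatch",
    "skos:narrowMatch",
    "oboInOwl:hasDbXref",
    "skos:relatedMatch",
    "rdfs:seeAlso",
]
```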
@@ -289,13 +300,83 @@ def filter_redundant_rows( |
289 | 300 | # will be removed from pandas in a future version. |
290 | 301 | # Use pandas.concat instead. |
291 | 302 | # return_df = df.append(nan_df).drop_duplicates() |
292 | | - return_df = pd.concat([df, nan_df]).drop_duplicates() |
| 303 | + confidence_reconciled_df = pd.concat([df, nan_df]).drop_duplicates() |
| 304 | + |
| 305 | + # Reconciling dataframe rows based on the predicates with equal confidence. |
| 306 | + if PREDICATE_MODIFIER in confidence_reconciled_df.columns: |
| 307 | + tmp_df = confidence_reconciled_df[ |
| 308 | + [SUBJECT_ID, OBJECT_ID, PREDICATE_ID, CONFIDENCE, PREDICATE_MODIFIER] |
| 309 | + ] |
| 310 | + tmp_df = tmp_df[tmp_df[PREDICATE_MODIFIER] != PREDICATE_MODIFIER_NOT].drop( |
| 311 | + PREDICATE_MODIFIER, axis=1 |
| 312 | + ) |
| 313 | + else: |
| 314 | + tmp_df = confidence_reconciled_df[ |
| 315 | + [SUBJECT_ID, OBJECT_ID, PREDICATE_ID, CONFIDENCE] |
| 316 | + ] |
| 317 | + tmp_df_grp = tmp_df.groupby( |
| 318 | + [SUBJECT_ID, OBJECT_ID, CONFIDENCE], as_index=False |
| 319 | + ).count() |
| 320 | + tmp_df_grp = tmp_df_grp[tmp_df_grp[PREDICATE_ID] > 1].drop(PREDICATE_ID, axis=1) |
| 321 | + non_predicate_reconciled_df = ( |
| 322 | + confidence_reconciled_df.merge( |
| 323 | + tmp_df_grp, on=list(tmp_df_grp.columns), how="left", indicator=True |
| 324 | + ) |
| 325 | + .query('_merge == "left_only"') |
| 326 | + .drop(columns="_merge") |
| 327 | + ) |
| 328 | + |
| 329 | + multiple_predicate_df = ( |
| 330 | + confidence_reconciled_df.merge( |
| 331 | + tmp_df_grp, on=list(tmp_df_grp.columns), how="right", indicator=True |
| 332 | + ) |
| 333 | + .query('_merge == "both"') |
| 334 | + .drop(columns="_merge") |
| 335 | + ) |
| 336 | + |
| 337 | + return_df = non_predicate_reconciled_df |
| 338 | + for _, row in tmp_df_grp.iterrows(): |
| 339 | + logic_df = multiple_predicate_df[list(tmp_df_grp.columns)] == row |
| 340 | + concerned_row_index = ( |
| 341 | + logic_df[logic_df[list(tmp_df_grp.columns)]].dropna().index |
| 342 | + ) |
| 343 | + concerned_df = multiple_predicate_df.iloc[concerned_row_index] |
| 344 | + # Go down the hierarchical list of PREDICATE_LIST and grab the first match |
| 345 | + return_df = pd.concat( |
| 346 | + [get_row_based_on_hierarchy(concerned_df), return_df], axis=0 |
| 347 | + ).drop_duplicates() |
293 | 348 |
294 | 349 | if return_df[CONFIDENCE].isnull().all(): |
295 | 350 | return_df = return_df.drop(columns=[CONFIDENCE], axis=1) |
296 | 351 | return return_df |
297 | 352 |
298 | 353 |
| 354 | +def get_row_based_on_hierarchy(df: pd.DataFrame): |
| 355 | + """Get row based on hierarchy of predicates. |
| 356 | +
| 357 | + The hierarchy is as follows: |
| 358 | + # owl:equivalentClass |
| 359 | + # owl:equivalentProperty |
| 360 | + # rdfs:subClassOf |
| 361 | + # rdfs:subPropertyOf |
| 362 | + # owl:sameAs |
| 363 | + # skos:exactMatch |
| 364 | + # skos:closeMatch |
| 365 | + # skos:broadMatch |
| 366 | + # skos:narrowMatch |
| 367 | + # oboInOwl:hasDbXref |
| 368 | + # skos:relatedMatch |
| 369 | + # rdfs:seeAlso |
| 370 | +
| 371 | + :param df: Dataframe containing multiple predicates for the same subject and object. |
| 372 | + :return: Dataframe with the row(s) whose predicate ranks highest in the hierarchy. |
| 373 | + """ |
| 374 | + for pred in PREDICATE_LIST: |
| 375 | + hierarchical_df = df[df[PREDICATE_ID] == pred] |
| 376 | + if not hierarchical_df.empty: |
| 377 | + return hierarchical_df |
| 378 | +
| 379 | + |
299 | 380 | def assign_default_confidence( |
300 | 381 | df: pd.DataFrame, |
301 | 382 | ) -> Tuple[pd.DataFrame, pd.DataFrame]: |
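The new block in filter_redundant_rows handles ties: after confidence-based reconciliation, rows that share subject_id, object_id, and confidence but carry different predicates are found with a group-by count, split from the single-predicate rows via a merge with indicator=True, and each such group is then reduced by get_row_based_on_hierarchy, which walks PREDICATE_LIST from strongest to weakest and keeps the first predicate that occurs. Below is a minimal pandas-only sketch of that tie-breaking idea, not the library code itself; the toy hierarchy and literal column labels are assumptions for illustration:

```python
import pandas as pd

# Toy hierarchy, strongest predicate first (assumed for this sketch).
PREDICATE_LIST = ["owl:equivalentClass", "skos:exactMatch", "skos:closeMatch"]

df = pd.DataFrame(
    {
        "subject_id": ["A:1", "A:1", "B:2"],
        "object_id": ["X:9", "X:9", "Y:8"],
        "predicate_id": ["skos:closeMatch", "skos:exactMatch", "skos:exactMatch"],
        "confidence": [0.8, 0.8, 0.9],
    }
)

# Rank each predicate by its position in the hierarchy (lower index = stronger).
rank = {pred: i for i, pred in enumerate(PREDICATE_LIST)}
df["rank"] = df["predicate_id"].map(rank)

# Within each (subject, object, confidence) group keep only the best-ranked row;
# this is the same "first match in PREDICATE_LIST wins" rule the new code applies.
best = df.loc[df.groupby(["subject_id", "object_id", "confidence"])["rank"].idxmin()]
print(best.drop(columns="rank"))
# A:1 -> X:9 keeps only skos:exactMatch; the lone B:2 -> Y:8 row passes through.
```

The sketch collapses the explicit loop over groups into a single idxmin, but the selection rule is the same one the hierarchy walk implements.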
@@ -430,29 +511,27 @@ def dataframe_to_ptable(df: pd.DataFrame, *, inverse_factor: float = None): |
430 | 511 | residual_confidence = (1 - (confidence + inverse_confidence)) / 2.0 |
431 | 512 |
432 | 513 | predicate = row[PREDICATE_ID] |
433 | | - if predicate == "owl:equivalentClass": |
| 514 | + if predicate == OWL_EQUIVALENT_CLASS: |
434 | 515 | predicate_type = PREDICATE_EQUIVALENT |
435 | | - elif predicate == "skos:exactMatch": |
| 516 | + elif predicate == SKOS_EXACT_MATCH: |
436 | 517 | predicate_type = PREDICATE_EQUIVALENT |
437 | | - elif predicate == "skos:closeMatch": |
| 518 | + elif predicate == SKOS_CLOSE_MATCH: |
438 | 519 | # TODO: consider distributing |
439 | 520 | predicate_type = PREDICATE_EQUIVALENT |
440 | | - elif predicate == "owl:subClassOf": |
| 521 | + elif predicate == RDFS_SUBCLASS_OF: |
441 | 522 | predicate_type = PREDICATE_SUBCLASS |
442 | | - elif predicate == "skos:broadMatch": |
| 523 | + elif predicate == SKOS_BROAD_MATCH: |
443 | 524 | predicate_type = PREDICATE_SUBCLASS |
444 | | - elif predicate == "inverseOf(owl:subClassOf)": |
| 525 | + elif predicate == SSSOM_SUPERCLASS_OF: |
445 | 526 | predicate_type = PREDICATE_SUPERCLASS |
446 | | - elif predicate == "skos:narrowMatch": |
| 527 | + elif predicate == SKOS_NARROW_MATCH: |
447 | 528 | predicate_type = PREDICATE_SUPERCLASS |
448 | | - elif predicate == "owl:differentFrom": |
449 | | - predicate_type = PREDICATE_SIBLING |
450 | | - elif predicate == "dbpedia-owl:different": |
| 529 | + elif predicate == OWL_DIFFERENT_FROM: |
451 | 530 | predicate_type = PREDICATE_SIBLING |
452 | 531 | # * Added by H2 ############################ |
453 | | - elif predicate == "oboInOwl:hasDbXref": |
| 532 | + elif predicate == OBO_HAS_DB_XREF: |
454 | 533 | predicate_type = PREDICATE_HAS_DBXREF |
455 | | - elif predicate == "skos:relatedMatch": |
| 534 | + elif predicate == SKOS_RELATED_MATCH: |
456 | 535 | predicate_type = PREDICATE_RELATED_MATCH |
457 | 536 | # * ######################################## |
458 | 537 | else: |
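In dataframe_to_ptable the hard-coded predicate strings are replaced by the constants imported above; along the way the non-standard "owl:subClassOf" string becomes RDFS_SUBCLASS_OF, the ad-hoc "inverseOf(owl:subClassOf)" becomes SSSOM_SUPERCLASS_OF, and the separate "dbpedia-owl:different" branch is folded into the single OWL_DIFFERENT_FROM case. Read as a table, the branch chain maps each predicate to a ptable predicate type; the dict below is only an illustrative summary of that mapping (the PREDICATE_* names are the module's own constants, whose values are not shown here; the CURIE for SSSOM_SUPERCLASS_OF is assumed):

```python
# Illustrative summary of the branch logic above, not how the function is written;
# string values stand in for the module's PREDICATE_* type constants.
PTABLE_TYPE_BY_PREDICATE = {
    "owl:equivalentClass": "PREDICATE_EQUIVALENT",
    "skos:exactMatch": "PREDICATE_EQUIVALENT",
    "skos:closeMatch": "PREDICATE_EQUIVALENT",  # TODO in source: consider distributing
    "rdfs:subClassOf": "PREDICATE_SUBCLASS",
    "skos:broadMatch": "PREDICATE_SUBCLASS",
    "sssom:superClassOf": "PREDICATE_SUPERCLASS",  # assumed CURIE for SSSOM_SUPERCLASS_OF
    "skos:narrowMatch": "PREDICATE_SUPERCLASS",
    "owl:differentFrom": "PREDICATE_SIBLING",
    "oboInOwl:hasDbXref": "PREDICATE_HAS_DBXREF",
    "skos:relatedMatch": "PREDICATE_RELATED_MATCH",
}
```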
@@ -538,7 +617,7 @@ def sha256sum(path: str) -> str: |
538 | 617 |
539 | 618 | def merge_msdf( |
540 | 619 | *msdfs: MappingSetDataFrame, |
541 | | - reconcile: bool = True, |
| 620 | + reconcile: bool = False, |
542 | 621 | ) -> MappingSetDataFrame: |
543 | 622 | """Merge multiple MappingSetDataFrames into one. |
544 | 623 |
@@ -573,7 +652,10 @@ def merge_msdf( |
573 | 652 | merged_msdf.df = df_merged |
574 | 653 | if reconcile: |
575 | 654 | merged_msdf.df = filter_redundant_rows(merged_msdf.df) |
576 | | - if PREDICATE_MODIFIER in merged_msdf.df.columns: |
| 655 | + if ( |
| 656 | + PREDICATE_MODIFIER in merged_msdf.df.columns |
| 657 | + and PREDICATE_MODIFIER_NOT in merged_msdf.df[PREDICATE_MODIFIER] |
| 658 | + ): |
577 | 659 | merged_msdf.df = deal_with_negation(merged_msdf.df) # deals with negation |
578 | 660 |
579 | 661 | # TODO: Add default values for license and mapping_set_id. |
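Finally, merge_msdf now defaults to reconcile=False, so merging no longer filters redundant rows or resolves negations unless the caller opts in, and deal_with_negation only runs when the merged frame actually carries a predicate_modifier column with the "Not" value. A hedged usage sketch, assuming the usual sssom-py entry points and placeholder file names:

```python
from sssom.parsers import parse_sssom_table
from sssom.util import merge_msdf

# Placeholder file names for illustration.
msdf_a = parse_sssom_table("mappings_a.sssom.tsv")
msdf_b = parse_sssom_table("mappings_b.sssom.tsv")

# reconcile is keyword-only; with this change it must be set explicitly,
# otherwise redundant rows and negated ("Not") mappings are kept as-is.
merged = merge_msdf(msdf_a, msdf_b, reconcile=True)
print(merged.df.head())
```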