Skip to content

Commit 76165d5

Browse files
authored
Add method to enforce compliance with a given version of the SSSOM specification. (#616)
This commit adds two helper methods to the `MappingSetDataFrame` class. The `get_compatible_version()` method is intended to automatically determine the minimum version of the SSSOM specification the set is compatible with -- that is, the earliest version that defines all the slots and all the enum values present in the set. The `enforce_version()` method is intended to ensure that a mapping set is compliant with a given version of the SSSOM specification, by removing any slot or slot value that has only been defined in a later version. The method can also optionally be used to remove any extra non-standard slot that has not been properly declared as an extension slot (strict=True).
1 parent 467c62e commit 76165d5

File tree

4 files changed

+308
-1
lines changed

4 files changed

+308
-1
lines changed

src/sssom/constants.py

Lines changed: 90 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,28 @@
55
import importlib.resources
66
import pathlib
77
import uuid
8+
from dataclasses import dataclass
89
from enum import Enum
910
from functools import cached_property, lru_cache
10-
from typing import Any, ClassVar, Dict, List, Literal, Mapping, Set, TextIO, Union, cast
11+
from typing import (
12+
Any,
13+
ClassVar,
14+
Dict,
15+
List,
16+
Literal,
17+
Mapping,
18+
Optional,
19+
Set,
20+
TextIO,
21+
Tuple,
22+
Union,
23+
cast,
24+
)
1125

1226
import yaml
1327
from linkml_runtime.utils.schema_as_dict import schema_as_dict
1428
from linkml_runtime.utils.schemaview import SchemaView
29+
from sssom_schema.datamodel.sssom_schema import SssomVersionEnum
1530

1631
HERE = pathlib.Path(__file__).parent.resolve()
1732

@@ -77,6 +92,7 @@
7792
MAPPING_SET_VERSION = "mapping_set_version"
7893
MAPPING_SET_GROUP = "mapping_set_group"
7994
MAPPING_SET_DESCRIPTION = "mapping_set_description"
95+
MAPPING_SET_CONFIDENCE = "mapping_set_confidence"
8096
CREATOR_ID = "creator_id"
8197
CREATOR_LABEL = "creator_label"
8298
AUTHOR_ID = "author_id"
@@ -94,6 +110,7 @@
94110
MAPPING_CARDINALITY = "mapping_cardinality"
95111
CARDINALITY_SCOPE = "cardinality_scope"
96112
MAPPING_TOOL = "mapping_tool"
113+
MAPPING_TOOL_ID = "mapping_tool_id"
97114
MAPPING_TOOL_VERSION = "mapping_tool_version"
98115
MAPPING_DATE = "mapping_date"
99116
PBLICATION_DATE = "publication_date"
@@ -108,6 +125,8 @@
108125
SEE_ALSO = "see_also"
109126
OTHER = "other"
110127
COMMENT = "comment"
128+
EXTENSION_DEFINITIONS = "extension_definitions"
129+
EXTENSION_SLOT_NAME = "slot_name"
111130

112131
CURIE_MAP = "curie_map"
113132
SUBJECT_SOURCE_ID = "subject_source_id"
@@ -217,6 +236,28 @@ class SchemaValidationType(str, Enum):
217236
]
218237

219238

239+
@dataclass
class NewEnumValue:
    """Describe an enum value that was introduced after SSSOM 1.0.

    Ideally this information would be encoded in the LinkML schema and
    exposed through the SSSOMSchemaView class below, but it does not
    seem possible to annotate enum values in LinkML the way it can be
    done for slots. The information is therefore taken from the spec,
    at <https://mapping-commons.github.io/sssom/spec-model/#model-changes-across-versions>.
    """

    # Names of the slots that accept the new value
    slots: list[str]
    # The new enum value itself
    value: str
    # (major, minor) version of the specification that introduced the value
    added_in: tuple[int, int]
253+
254+
255+
# Registry of every enum value added to the specification after 1.0,
# together with the slots it applies to and the version that added it.
NEW_ENUM_VALUES = [
    NewEnumValue(
        slots=[SUBJECT_TYPE, OBJECT_TYPE],
        value="composed entity expression",
        added_in=(1, 1),
    ),
    NewEnumValue(
        slots=[MAPPING_CARDINALITY],
        value="0:0",
        added_in=(1, 1),
    ),
]
259+
260+
220261
class SSSOMSchemaView(object):
221262
"""SchemaView class from linkml which is instantiated when necessary.
222263
@@ -287,6 +328,54 @@ def propagatable_slots(self) -> List[str]:
287328
slots.append(slot_name)
288329
return slots
289330

331+
def get_new_enum_values(self, after: Tuple[int, int] = (1, 0)) -> List[NewEnumValue]:
    """List the enum values that were introduced after a given version.

    :param after: The reference version of the SSSOM specification,
                  as a (major, minor) tuple. With the default of
                  (1, 0), every enum value introduced in any version
                  later than 1.0 is returned.
    :return: The enum values whose introduction version is strictly
             greater than `after`, in registry order.
    """
    newer: List[NewEnumValue] = []
    for candidate in NEW_ENUM_VALUES:
        # Tuples compare element-wise, so (1, 1) > (1, 0)
        if candidate.added_in > after:
            newer.append(candidate)
    return newer
341+
342+
def get_minimum_version(
    self, slot_name: str, class_name: str = "mapping"
) -> Optional[Tuple[int, int]]:
    """Get the minimum version of SSSOM required for a given slot.

    The version is read from the `added_in` annotation of the slot as
    induced in the given class.

    :param slot_name: The queried slot.
    :param class_name: The class the slot belongs to. This is needed
                       because a slot may have been added to a class
                       in a later version than the version in which
                       it was first introduced in the schema.
    :return: A tuple containing the major and minor numbers of the
             earliest version of SSSOM that defines the given slot
             in the given class. May be None if the requested slot
             name is not a valid slot name.
    """
    # NOTE(review): both handlers guard the whole try body, so a
    # ValueError raised by parse_sssom_version (unparseable `added_in`
    # value) also ends up returning None -- confirm this is intended.
    try:
        slot = self.view.induced_slot(slot_name, class_name)
        return parse_sssom_version(slot.annotations.added_in.value)
    except AttributeError:  # No added_in annotation, defaults to 1.0
        return (1, 0)
    except ValueError:  # No such slot
        return None
364+
365+
366+
def parse_sssom_version(version: str) -> Tuple[int, int]:
    """Convert a version string into an SSSOM version number.

    The string is first passed through `SssomVersionEnum`, and the
    text of the resulting code is then split on dots.

    :param version: The string to parse into a version number.
    :return: A (major, minor) tuple.
    :raises ValueError: If the version text does not consist of
                        exactly two integer components.
    """
    text = SssomVersionEnum(version).code.text
    numbers = tuple(int(part) for part in text.split("."))
    if len(numbers) == 2:
        major, minor = numbers
        return (major, minor)
    # Should never happen, should be caught by the SssomVersionEnum
    # constructor before we arrive here
    raise ValueError("Invalid version")
378+
290379

291380
@lru_cache(1)
292381
def _get_sssom_schema_object() -> SSSOMSchemaView:

src/sssom/util.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@
4242
COLUMN_INVERT_DICTIONARY,
4343
COMMENT,
4444
CONFIDENCE,
45+
EXTENSION_DEFINITIONS,
46+
EXTENSION_SLOT_NAME,
4547
MAPPING_CARDINALITY,
4648
MAPPING_JUSTIFICATION,
4749
MAPPING_SET_ID,
@@ -78,6 +80,7 @@
7880
SSSOMSchemaView,
7981
_get_sssom_schema_object,
8082
get_default_metadata,
83+
parse_sssom_version,
8184
)
8285
from .context import (
8386
SSSOM_BUILT_IN_PREFIXES,
@@ -509,6 +512,102 @@ def _to_string(row: dict[str, Any], side: str) -> str:
509512
# No scope, so remove any pre-existing "cardinality_scope" column
510513
self.df.drop(columns=CARDINALITY_SCOPE, inplace=True, errors="ignore")
511514

515+
def get_compatible_version(self) -> str:
    """Get the minimum version of SSSOM this set is compatible with.

    The result is the highest of the minimum versions required by
    each slot present in the set metadata or in the mappings data
    frame, and by each enum value that was introduced after 1.0.
    Slots that are not known to the schema are ignored.

    :return: The version, as a string of the form `X.Y`. A set in
             which no standard slot is found (for example an empty
             set) is reported as compatible with `1.0`.
    """
    schema = SSSOMSchemaView()
    versions: Set[Tuple[int, int]] = set()

    # First get the minimum versions required by the slots present
    # in the set; this is entirely provided by the SSSOM model.
    for slot in self.metadata:
        version = schema.get_minimum_version(slot, "mapping set")
        if version is not None:
            versions.add(version)
    for slot in self.df.columns:
        version = schema.get_minimum_version(slot, "mapping")
        if version is not None:
            versions.add(version)

    # Then take care of enum values
    for new_enum_value in schema.get_new_enum_values():
        for slot in new_enum_value.slots:
            if self.metadata.get(slot) == new_enum_value.value or (
                slot in self.df.columns and new_enum_value.value in self.df[slot].values
            ):
                versions.add(new_enum_value.added_in)

    # Get the highest of the accumulated versions. Nothing may have
    # been accumulated (no metadata and no columns, or only unknown
    # slots); fall back to 1.0 instead of letting max() raise.
    return ".".join(str(i) for i in max(versions, default=(1, 0)))
541+
542+
def enforce_version(
    self, version: str, strict: bool = False, inplace: bool = False
) -> "MappingSetDataFrame":
    """Ensure the set is compliant with a given version of the SSSOM specification.

    This method will forcefully remove any slot or enum value that
    is not defined in the specified version of the specification.
    Metadata slots holding a too-recent enum value are removed
    entirely; in the mappings data frame, too-recent enum values are
    replaced by an empty string and the column is kept.

    :param version: The targeted version of the specification, as a
                    string of the form `X.Y`.
    :param strict: If `True`, unknown slots will be removed as well,
                   unless they are properly declared as extensions.
    :param inplace: if `True`, the method will modify and return the
                    set it has been called upon. The default is to
                    leave that set untouched and to return a
                    modified copy.
    :return: A set that is compliant with the requested version of
             the SSSOM specification.
    """
    if inplace:
        msdf = self
    else:
        msdf = MappingSetDataFrame(df=self.df.copy(), metadata=self.metadata.copy())

    schema = SSSOMSchemaView()
    target_version = parse_sssom_version(version)
    # `or []` also covers an `extension_definitions` key that is
    # present but set to None.
    defined_extensions = [
        ext.get(EXTENSION_SLOT_NAME)
        for ext in msdf.metadata.get(EXTENSION_DEFINITIONS) or []
    ]

    # Helper method to decide whether to keep or discard a slot
    def _keep(name: str, version: Optional[Tuple[int, int]]) -> bool:
        if version is not None:
            # This is a known slot, keep if compatible with target version
            return version <= target_version
        elif strict:
            # Unknown slot in strict mode, keep only if declared as an extension
            return name in defined_extensions
        else:
            # Unknown slot in non-strict mode, always keep
            return True

    # First the mapping set slots. A set is used because the same
    # slot can be selected twice (once by the version/strict check
    # and once because of its value); popping it twice would raise
    # a KeyError.
    metadata_to_remove = {
        name
        for name in msdf.metadata
        if not _keep(name, schema.get_minimum_version(name, "mapping set"))
    }
    for new_enum_value in schema.get_new_enum_values(after=target_version):
        for slot in new_enum_value.slots:
            if msdf.metadata.get(slot) == new_enum_value.value:
                metadata_to_remove.add(slot)
    for slot in metadata_to_remove:
        msdf.metadata.pop(slot, None)

    # Then the individual mapping record slots
    columns_to_remove = [
        name
        for name in msdf.df.columns
        if not _keep(name, schema.get_minimum_version(name, "mapping"))
    ]
    msdf.df.drop(columns=columns_to_remove, inplace=True)
    for new_enum_value in schema.get_new_enum_values(after=target_version):
        for slot in new_enum_value.slots:
            if slot in msdf.df.columns:
                # Blank out the offending values only; the column stays
                msdf.df.loc[msdf.df[slot] == new_enum_value.value, slot] = ""

    return msdf
610+
512611

513612
def _standardize_curie_or_iri(curie_or_iri: str, *, converter: Converter) -> str:
514613
"""Standardize a CURIE or IRI, returning the original if not possible.
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#sssom_version: "1.1"
2+
#curie_map:
3+
# d: http://example.org/d/
4+
# orcid: https://orcid.org/
5+
# x: http://example.org/x/
6+
# z: http://example.org/z/
7+
#mapping_set_id: https://w3id.org/sssom/mapping/tests/data/sssom11-extensions.sssom.tsv
8+
#mapping_set_confidence: 0.9
9+
#creator_id:
10+
# - orcid:1234
11+
# - orcid:5678
12+
#license: https://creativecommons.org/publicdomain/zero/1.0/
13+
#extension_definitions:
14+
# - slot_name: ext_fooable
15+
# property: d:fooableProperty
16+
# type_hint: xsd:boolean
17+
# - slot_name: ext_fooability_scale
18+
# property: d:fooableScaleProperty
19+
# type_hint: xsd:integer
20+
#ext_fooability_scale: 79
21+
#ext_undefined: bar
22+
subject_id predicate_id object_id mapping_justification subject_type mapping_tool_id mapping_date ext_fooable ext_undefined
23+
x:appendage owl:equivalentClass z:appendage semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
24+
x:bone_element owl:equivalentClass z:bone_element semapv:LexicalMatching composed entity expression d:matcher 2020-05-30 false bar
25+
x:bone_element owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
26+
x:bone_element owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
27+
x:bone_element owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
28+
x:bone_element owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
29+
x:bone_tissue owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
30+
x:bone_tissue owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
31+
x:bone_tissue owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
32+
x:bone_tissue owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
33+
x:bone_tissue owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar

tests/test_utils.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,17 @@
1515
CARDINALITY_SCOPE,
1616
CREATOR_ID,
1717
MAPPING_CARDINALITY,
18+
MAPPING_SET_CONFIDENCE,
19+
MAPPING_TOOL_ID,
1820
OBJECT_ID,
1921
OBJECT_LABEL,
22+
OBJECT_TYPE,
2023
PREDICATE_ID,
24+
PREDICATE_TYPE,
2125
SEMAPV,
2226
SUBJECT_ID,
2327
SUBJECT_LABEL,
28+
SUBJECT_TYPE,
2429
MetadataType,
2530
)
2631
from sssom.context import SSSOM_BUILT_IN_PREFIXES, ensure_converter
@@ -635,3 +640,84 @@ def test_infer_scoped_cardinality(self) -> None:
635640
expected = ["1:n", "1:n", "1:n", "1:n", "1:n", "1:n"]
636641
self.assertEqual(expected, list(msdf.df[MAPPING_CARDINALITY].values))
637642
self.assertNotIn(CARDINALITY_SCOPE, msdf.df.columns)
643+
644+
def test_inferring_compatible_version(self) -> None:
    """Check that the version a set is compatible with is inferred correctly."""
    baseline = parse_sssom_table(f"{data_dir}/basic.tsv")

    # The pristine set contains nothing that requires 1.1
    self.assertEqual("1.0", baseline.get_compatible_version())

    def _fresh_copy() -> MappingSetDataFrame:
        # Each scenario below starts from an independent copy of the baseline
        return MappingSetDataFrame(df=baseline.df.copy(), metadata=baseline.metadata.copy())

    # Scenario: a mapping set slot only available from 1.1
    variant = _fresh_copy()
    variant.metadata[CARDINALITY_SCOPE] = "predicate_id"
    self.assertEqual("1.1", variant.get_compatible_version())

    # Scenario: a mapping slot only available from 1.1
    variant = _fresh_copy()
    variant.df[PREDICATE_TYPE] = "owl object property"
    self.assertEqual("1.1", variant.get_compatible_version())

    # Scenario: a 1.1-specific entity_type_enum value in the metadata
    variant = _fresh_copy()
    variant.metadata[SUBJECT_TYPE] = "composed entity expression"
    self.assertEqual("1.1", variant.get_compatible_version())

    # Scenario: the same value, but on a single mapping record
    variant = _fresh_copy()
    variant.df[OBJECT_TYPE] = "owl class"
    variant.df.loc[2, OBJECT_TYPE] = "composed entity expression"
    self.assertEqual("1.1", variant.get_compatible_version())

    # Scenario: the 1.1-specific "0:0" cardinality on a single record
    variant = _fresh_copy()
    variant.df[MAPPING_CARDINALITY] = "1:1"
    variant.df.loc[9, MAPPING_CARDINALITY] = "0:0"
    self.assertEqual("1.1", variant.get_compatible_version())
680+
681+
def test_enforce_version(self) -> None:
    """Check that a set can be forced to comply with a specific SSSOM version."""
    full_set = parse_sssom_table(f"{data_dir}/sssom11-extensions.sssom.tsv")

    # The parser discards the non-standard slots found in the test
    # file (even those properly declared as extensions!), so they
    # are put back by hand in order to exercise the "strict" mode.
    full_set.metadata["ext_fooability_scale"] = 79
    full_set.metadata["ext_undefined"] = "bar"
    full_set.df["ext_fooable"] = True
    full_set.df["ext_undefined"] = "bar"

    downgraded = full_set.enforce_version("1.0")
    # Without inplace=True the original set must be left untouched,
    # so all its 1.1 content is still there
    self.assertIn(MAPPING_SET_CONFIDENCE, full_set.metadata)
    self.assertIn(MAPPING_TOOL_ID, full_set.df.columns)
    self.assertIn("composed entity expression", full_set.df[SUBJECT_TYPE].values)
    # whereas the returned copy must have lost it
    self.assertNotIn(MAPPING_SET_CONFIDENCE, downgraded.metadata)
    self.assertNotIn(MAPPING_TOOL_ID, downgraded.df.columns)
    self.assertNotIn("composed entity expression", downgraded.df[SUBJECT_TYPE].values)
    # The copy must also be recognised as 1.0-compliant
    self.assertEqual("1.0", downgraded.get_compatible_version())
    # In non-strict mode every non-standard slot survives
    self.assertIn("ext_fooability_scale", downgraded.metadata)
    self.assertIn("ext_undefined", downgraded.metadata)
    self.assertIn("ext_fooable", downgraded.df.columns)
    self.assertIn("ext_undefined", downgraded.df.columns)

    downgraded = full_set.enforce_version("1.0", strict=True)
    self.assertEqual("1.0", downgraded.get_compatible_version())
    # In strict mode, declared extensions survive
    self.assertIn("ext_fooability_scale", downgraded.metadata)
    self.assertIn("ext_fooable", downgraded.df.columns)
    # while undeclared non-standard slots are dropped
    self.assertNotIn("ext_undefined", downgraded.metadata)
    self.assertNotIn("ext_undefined", downgraded.df.columns)

    full_set.enforce_version("1.0", inplace=True)
    # With inplace=True the original set itself becomes 1.0-compliant
    self.assertEqual("1.0", full_set.get_compatible_version())

0 commit comments

Comments
 (0)