Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions src/sssom/sexpr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Generate canonical s-expressions and mapping hashes."""

import hashlib

import curies
import zbase32

from sssom import Mapping
from sssom.constants import _get_sssom_schema_object

# Public API of this module. ``to_sexpr`` is listed next to ``get_mapping_hash``
# because it is imported directly from this module by callers (e.g. the tests).
__all__ = [
    "get_mapping_hash",
    "to_sexpr",
]


def get_mapping_hash(mapping: Mapping, converter: curies.Converter) -> str:
    """Return the zbase32-encoded SHA-256 digest of the mapping's canonical s-expression.

    :param mapping: The mapping to hash.
    :param converter: Converter handed to :func:`to_sexpr` for serializing the mapping.
    :returns: The zbase32 encoding of the SHA-256 digest of the UTF-8 encoded s-expression.
    """
    canonical = to_sexpr(mapping, converter)
    digest = hashlib.sha256(canonical.encode("utf-8")).digest()
    return zbase32.encode(digest)


# Mapping slots that are never written into the s-expression (and therefore
# never contribute to the hash).
# NOTE(review): presumably ``record_id`` is excluded because it is itself
# derived from the hash; the reason for ``mapping_cardinality`` is not visible
# here -- confirm.
SKIP_SLOTS = {
    "mapping_cardinality",
    "record_id",
}


def _should_expand(slot: str) -> bool:
    """Return whether ``slot`` is one of the schema's entity reference slots.

    :param slot: Name of a mapping slot.
    :returns: ``True`` if the slot is listed in the SSSOM schema object's
        ``entity_reference_slots``, ``False`` otherwise.
    """
    entity_reference_slots = _get_sssom_schema_object().entity_reference_slots
    return slot in entity_reference_slots


def _encode_atom(text: str) -> str:
    """Encode a string as a length-prefixed atom, ``<length>:<text>``.

    The length is the number of bytes of the UTF-8 encoding, not the number of
    characters: the s-expression is hashed as UTF-8 bytes and canonical
    s-expressions prefix each atom with its octet count. For ASCII-only text
    both counts are identical.
    """
    return f"{len(text.encode('utf-8'))}:{text}"


def to_sexpr(x: Mapping, converter: curies.Converter) -> str:
    """Serialize a mapping as a canonical s-expression.

    The result has the shape ``(7:mapping(<slot>...))``. Slots are visited in
    the order given by the schema object's ``mapping_slots``; slots listed in
    ``SKIP_SLOTS`` and slots whose value is missing or falsy are left out.

    - A string value is written as ``(<slot-atom><value-atom>)``.
    - A list value is written as ``(<slot-atom>(<item-atom>...))``.
    - For entity reference slots, each value is first passed through
      ``converter.expand_or_standardize(..., strict=True)``.

    :param x: The mapping to serialize.
    :param converter: Converter used to expand/standardize entity references.
    :returns: The s-expression as a string without any whitespace between parts.
    :raises NotImplementedError: If a slot holds a non-zero float value.
    """
    # todo get canonical order

    schema_object = _get_sssom_schema_object()
    parts = ["(7:mapping("]
    for slot in schema_object.mapping_slots:
        if slot in SKIP_SLOTS:
            continue
        value = getattr(x, slot, None)
        if not value:
            continue
        if isinstance(value, str):
            if _should_expand(slot):
                value = converter.expand_or_standardize(value, strict=True)
            parts.append(f"({_encode_atom(slot)}{_encode_atom(value)})")
        elif isinstance(value, float):
            raise NotImplementedError(
                f"serializing float values is not implemented (slot: {slot})"
            )
        elif isinstance(value, list):
            # Whether to expand depends only on the slot, so decide once per list
            expand = _should_expand(slot)
            parts.append(f"({_encode_atom(slot)}(")
            for v in value:
                if expand:
                    v = converter.expand_or_standardize(v, strict=True)
                parts.append(_encode_atom(v))
            parts.append("))")
        # NOTE: values of any other type fall through here and are omitted
        # from the s-expression.
    parts.append("))")
    return "".join(parts)
7 changes: 7 additions & 0 deletions tests/data/sexpr_test.sssom.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#curie_map:
# FBbt: "http://purl.obolibrary.org/obo/FBbt_"
# UBERON: "http://purl.obolibrary.org/obo/UBERON_"
# sssom.record: "https://example.org/sssom.record/"
# orcid: "https://orcid.org/"
record_id subject_id predicate_id object_id mapping_justification creator_id reviewer_id author_label mapping_tool_version similarity_score comment license author_id mapping_tool object_type predicate_modifier reviewer_label issue_tracker_item subject_source object_match_field mapping_provider subject_label object_category subject_source_version subject_preprocessing subject_category object_label mapping_source predicate_label curation_rule_text similarity_measure see_also publication_date mapping_date other object_source mapping_cardinality subject_type confidence subject_match_field curation_rule object_source_version object_preprocessing match_string creator_label
sssom.record:hq6bs14aptzepwgk6pw7j8ysnft6riqrw7har84rtk8r9xmbcwty FBbt:00001234 skos:exactMatch UBERON:0005678 semapv:ManualMappingCuration orcid:0000-0000-1234-5678|orcid:0000-0000-5678-1234 ventral abdominal es5 exact match someone
103 changes: 103 additions & 0 deletions tests/test_sexpr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""Test s-expressions."""

import re
import unittest
from pathlib import Path

import pandas as pd
from curies import Converter

import sssom.io
from sssom import Mapping
from sssom.constants import SSSOMSchemaView
from sssom.sexpr import get_mapping_hash, to_sexpr

# Resolved directory that contains this test module
HERE = Path(__file__).parent.resolve()
# Example SSSOM TSV file used by the s-expression tests
PATH = HERE / "data" / "sexpr_test.sssom.tsv"


class TestSExpressions(unittest.TestCase):
    """Test creation of canonical S-expressions."""

    def test_explicit_example(self) -> None:
        """Test a hard-coded example, explicit in the code."""
        converter = Converter.from_prefix_map(
            {
                "FBbt": "http://purl.obolibrary.org/obo/FBbt_",
                "UBERON": "http://purl.obolibrary.org/obo/UBERON_",
                "orcid": "https://orcid.org/",
                "semapv": "https://w3id.org/semapv/vocab/",
                "skos": "http://www.w3.org/2004/02/skos/core#",
            }
        )
        # Laid out over several lines for readability; all whitespace is
        # stripped from this string before it is compared below.
        sexpr = """
            (7:mapping(
                (10:subject_id44:http://purl.obolibrary.org/obo/FBbt_00001234)
                (12:predicate_id46:http://www.w3.org/2004/02/skos/core#exactMatch)
                (9:object_id45:http://purl.obolibrary.org/obo/UBERON_0005678)
                (21:mapping_justification51:https://w3id.org/semapv/vocab/ManualMappingCuration)
                (10:creator_id(
                    37:https://orcid.org/0000-0000-1234-5678
                    37:https://orcid.org/0000-0000-5678-1234
                ))
            ))
        """
        mapping = Mapping(
            subject_id="http://purl.obolibrary.org/obo/FBbt_00001234",
            predicate_id="http://www.w3.org/2004/02/skos/core#exactMatch",
            object_id="http://purl.obolibrary.org/obo/UBERON_0005678",
            mapping_justification="https://w3id.org/semapv/vocab/ManualMappingCuration",
            creator_id=[
                "https://orcid.org/0000-0000-1234-5678",
                "https://orcid.org/0000-0000-5678-1234",
            ],
        )
        self.assertEqual(re.sub(r"\s", "", sexpr), to_sexpr(mapping, converter))
        self.assertEqual(
            "hq6bs14aptzepwgk6pw7j8ysnft6riqrw7har84rtk8r9xmbcwty",
            get_mapping_hash(mapping, converter),
        )

    def test_test_completion(self) -> None:
        """Test that the example file is complete over the whole SSSOM schema."""
        view = SSSOMSchemaView()

        # Read every column as text so that numeric slots still yield strings
        # for the pipe-delimiter checks below.
        df = pd.read_csv(PATH, sep="\t", comment="#", dtype=str)
        missing = set(view.mapping_slots).difference(df.columns)
        if missing:
            msg = "\n".join(sorted(missing))
            self.fail(msg=f"comprehensive testing file is missing slots:\n{msg}")

        for slot in view.mapping_slots:
            with self.subTest(slot=slot):
                # Empty cells are parsed as NaN; drop them so that only real
                # values are inspected (``"|" in nan`` would raise TypeError).
                values = df[slot].dropna().unique()
                self.assertTrue(
                    len(values) > 0, msg=f"there is no row that has a value for: {slot}"
                )

                if slot in view.multivalued_slots:
                    self.assertTrue(
                        any("|" in value for value in values),
                        msg=f"missing a multi-valued example for slot: {slot}",
                    )
                    self.assertTrue(
                        any("|" not in value for value in values),
                        msg=f"missing a single valued example for slot: {slot}",
                    )
                else:
                    self.assertFalse(
                        any("|" in value for value in values),
                        msg=f"should not have a pipe delimiter in single valued slot: {slot}",
                    )

    def test_all(self) -> None:
        """Test that each row's record ID equals the hash computed for its mapping."""
        msdf = sssom.parse_tsv(PATH)

        # After new SSSOM schema release, this will be part of the mapping data model
        record_ids = pd.read_csv(PATH, sep="\t", comment="#")["record_id"]
        mappings = list(msdf.to_mappings())
        # zip() stops at the shorter input, so a length mismatch would leave
        # rows unchecked without any failure; make it explicit instead.
        self.assertEqual(
            len(record_ids),
            len(mappings),
            msg="number of record IDs does not match number of parsed mappings",
        )
        for record_id, mapping in zip(record_ids, mappings):
            self.assertEqual(
                record_id.removeprefix("sssom.record:"),
                get_mapping_hash(mapping, msdf.converter),
                msg=to_sexpr(mapping, msdf.converter),
            )
Loading