Skip to content

Commit 9c6d527

Browse files
authored
feat: Expose function to read serialized schema from parquet metadata (#82)
1 parent 6c9305e commit 9c6d527

File tree

2 files changed

+24
-6
lines changed

2 files changed

+24
-6
lines changed

dataframely/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
filter_relationship_one_to_at_least_one,
5252
filter_relationship_one_to_one,
5353
)
54-
from .schema import Schema, deserialize_schema
54+
from .schema import Schema, deserialize_schema, read_parquet_metadata_schema
5555

5656
__all__ = [
5757
"random",
@@ -69,6 +69,7 @@
6969
"filter_relationship_one_to_one",
7070
"Schema",
7171
"deserialize_schema",
72+
"read_parquet_metadata_schema",
7273
"Any",
7374
"Bool",
7475
"Column",

dataframely/schema.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -860,21 +860,20 @@ def _requires_validation_for_reading_parquet(
860860
# First, we check whether the source provides the dataframely schema. If it
861861
# does, we check whether it matches this schema. If the two match, we assume that the
862862
# data adheres to the schema and we do not need to run validation.
863-
metadata = (
864-
pl.read_parquet_metadata(source).get(SCHEMA_METADATA_KEY)
863+
serialized_schema = (
864+
read_parquet_metadata_schema(source)
865865
if not isinstance(source, list)
866866
else None
867867
)
868-
if metadata is not None:
869-
serialized_schema = deserialize_schema(metadata)
868+
if serialized_schema is not None:
870869
if cls.matches(serialized_schema):
871870
return False
872871

873872
# Otherwise, we definitely need to run validation. However, we emit different
874873
# information to the user depending on the value of `validate`.
875874
msg = (
876875
"current schema does not match stored schema"
877-
if metadata is not None
876+
if serialized_schema is not None
878877
else "no schema to check validity can be read from the source"
879878
)
880879
if validation == "forbid":
@@ -956,6 +955,24 @@ def _rules_match(lhs: dict[str, Rule], rhs: dict[str, Rule]) -> bool:
956955
)
957956

958957

958+
def read_parquet_metadata_schema(
959+
source: str | Path | IO[bytes] | bytes,
960+
) -> type[Schema] | None:
961+
"""Read a dataframely schema from the metadata of a parquet file.
962+
963+
Args:
964+
source: Path to a parquet file, a file-like object, or raw bytes containing the parquet data whose metadata is read.
965+
966+
Returns:
967+
The schema that was serialized to the metadata or ``None`` if no schema metadata
968+
is found.
969+
"""
970+
metadata = pl.read_parquet_metadata(source)
971+
if (schema_metadata := metadata.get(SCHEMA_METADATA_KEY)) is not None:
972+
return deserialize_schema(schema_metadata)
973+
return None
974+
975+
959976
def deserialize_schema(data: str) -> type[Schema]:
960977
"""Deserialize a schema from a JSON string.
961978

0 commit comments

Comments
 (0)