Skip to content

Commit 3e232a4

Browse files
authored
feat: Allow to efficiently read and write collections (#77)
1 parent a4d3ebf commit 3e232a4

File tree

4 files changed

+496
-76
lines changed

4 files changed

+496
-76
lines changed

dataframely/collection.py

Lines changed: 207 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,13 @@
2121
SchemaJSONEncoder,
2222
serialization_versions,
2323
)
24-
from ._typing import LazyFrame
25-
from .exc import MemberValidationError, RuleValidationError, ValidationError
24+
from ._typing import LazyFrame, Validation
25+
from .exc import (
26+
MemberValidationError,
27+
RuleValidationError,
28+
ValidationError,
29+
ValidationRequiredError,
30+
)
2631
from .failure import FailureInfo
2732
from .random import Generator
2833
from .schema import _schema_from_dict
@@ -620,84 +625,254 @@ def serialize(cls) -> str:
620625

621626
# ---------------------------------- PERSISTENCE --------------------------------- #
622627

623-
def write_parquet(self, directory: Path) -> None:
624-
"""Write the members of this collection to Parquet files in a directory.
628+
def write_parquet(self, directory: str | Path, **kwargs: Any) -> None:
629+
"""Write the members of this collection to parquet files in a directory.
625630
626-
This method writes one Parquet file per member into the provided directory.
631+
This method writes one parquet file per member into the provided directory.
627632
Each parquet file is named ``<member>.parquet``. No file is written for optional
628633
members which are not provided in the current collection.
629634
635+
In addition, one JSON file named ``schema.json`` is written, serializing the
636+
collection's definition for fast reads.
637+
630638
Args:
631639
directory: The directory where the Parquet files should be written to. If
632640
the directory does not exist, it is created automatically, including all
633641
of its parents.
642+
kwargs: Additional keyword arguments passed directly to
643+
:meth:`polars.write_parquet` of all members. ``metadata`` may only be
644+
provided if it is a dictionary.
645+
646+
Attention:
647+
This method suffers from the same limitations as :meth:`Schema.serialize`.
634648
"""
635-
directory.mkdir(parents=True, exist_ok=True)
649+
self._to_parquet(directory, sink=False, **kwargs)
650+
651+
def sink_parquet(self, directory: str | Path, **kwargs: Any) -> None:
652+
"""Stream the members of this collection into parquet files in a directory.
653+
654+
This method writes one parquet file per member into the provided directory.
655+
Each parquet file is named ``<member>.parquet``. No file is written for optional
656+
members which are not provided in the current collection.
657+
658+
In addition, one JSON file named ``schema.json`` is written, serializing the
659+
collection's definition for fast reads.
660+
661+
Args:
662+
directory: The directory where the Parquet files should be written to. If
663+
the directory does not exist, it is created automatically, including all
664+
of its parents.
665+
kwargs: Additional keyword arguments passed directly to
666+
:meth:`polars.sink_parquet` of all members. ``metadata`` may only be
667+
provided if it is a dictionary.
668+
669+
Attention:
670+
This method suffers from the same limitations as :meth:`Schema.serialize`.
671+
"""
672+
self._to_parquet(directory, sink=True, **kwargs)
673+
674+
def _to_parquet(self, directory: str | Path, *, sink: bool, **kwargs: Any) -> None:
675+
path = Path(directory) if isinstance(directory, str) else directory
676+
path.mkdir(parents=True, exist_ok=True)
677+
with open(path / "schema.json", "w") as f:
678+
f.write(self.serialize())
679+
680+
member_schemas = self.member_schemas()
636681
for key, lf in self.to_dict().items():
637-
lf.collect().write_parquet(directory / f"{key}.parquet")
682+
destination = (
683+
path / key if "partition_by" in kwargs else path / f"{key}.parquet"
684+
)
685+
if sink:
686+
member_schemas[key].sink_parquet(
687+
lf, # type: ignore
688+
destination,
689+
**kwargs,
690+
)
691+
else:
692+
member_schemas[key].write_parquet(
693+
lf.collect(), # type: ignore
694+
destination,
695+
**kwargs,
696+
)
638697

639698
@classmethod
640-
def read_parquet(cls, directory: Path) -> Self:
641-
"""Eagerly read and validate all collection members from Parquet file in a
642-
directory.
699+
def read_parquet(
700+
cls,
701+
directory: str | Path,
702+
*,
703+
validation: Validation = "warn",
704+
**kwargs: Any,
705+
) -> Self:
706+
"""Read all collection members from parquet files in a directory.
643707
644708
This method searches for files named ``<member>.parquet`` in the provided
645709
directory for all required and optional members of the collection.
646710
647711
Args:
648712
directory: The directory where the Parquet files should be read from.
713+
Parquet files may have been written with Hive partitioning.
714+
validation: The strategy for running validation when reading the data:
715+
716+
- ``"allow"``: The method tries to read the ``schema.json`` file in the
717+
directory. If the stored collection schema matches this collection
718+
schema, the collection is read without validation. If the stored
719+
schema mismatches this schema or no ``schema.json`` can be found in
720+
the directory, this method automatically runs :meth:`validate` with
721+
``cast=True``.
722+
- ``"warn"`: The method behaves similarly to ``"allow"``. However,
723+
it prints a warning if validation is necessary.
724+
- ``"forbid"``: The method never runs validation automatically and only
725+
returns if the ``schema.json`` stores a collection schema that matches
726+
this collection.
727+
- ``"skip"``: The method never runs validation and simply reads the
728+
data, entrusting the user that the schema is valid. _Use this option
729+
carefully_.
730+
731+
kwargs: Additional keyword arguments passed directly to
732+
:meth:`polars.read_parquet`.
649733
650734
Returns:
651735
The initialized collection.
652736
653737
Raises:
738+
ValidationRequiredError: If no collection schema can be read from the
739+
directory and ``validation`` is set to ``"forbid"``.
654740
ValueError: If the provided directory does not contain parquet files for
655741
all required members.
656742
ValidationError: If the collection cannot be validated.
657743
658-
Note:
659-
If you are certain that your Parquet files contain valid data, you can also
660-
use :meth:`scan_parquet` to prevent the runtime overhead of validation.
744+
Attention:
745+
Be aware that this method suffers from the same limitations as
746+
:meth:`serialize`.
661747
"""
662-
data = {
663-
key: pl.scan_parquet(directory / f"{key}.parquet")
664-
for key in cls.members()
665-
if (directory / f"{key}.parquet").exists()
666-
}
667-
return cls.validate(data)
748+
path = Path(directory)
749+
data = cls._from_parquet(path, scan=True, **kwargs)
750+
if not cls._requires_validation_for_reading_parquets(path, validation):
751+
cls._validate_input_keys(data)
752+
return cls._init(data)
753+
return cls.validate(data, cast=True)
668754

669755
@classmethod
670-
def scan_parquet(cls, directory: Path) -> Self:
671-
"""Lazily read all collection members from Parquet files in a directory.
756+
def scan_parquet(
757+
cls,
758+
directory: str | Path,
759+
*,
760+
validation: Validation = "warn",
761+
**kwargs: Any,
762+
) -> Self:
763+
"""Lazily read all collection members from parquet files in a directory.
672764
673765
This method searches for files named ``<member>.parquet`` in the provided
674766
directory for all required and optional members of the collection.
675767
676768
Args:
677769
directory: The directory where the Parquet files should be read from.
770+
Parquet files may have been written with Hive partitioning.
771+
validation: The strategy for running validation when reading the data:
772+
773+
- ``"allow"``: The method tries to read the ``schema.json`` file in the
774+
directory. If the stored collection schema matches this collection
775+
schema, the collection is read without validation. If the stored
776+
schema mismatches this schema or no ``schema.json`` can be found in
777+
the directory, this method automatically runs :meth:`validate` with
778+
``cast=True``.
779+
- ``"warn"`: The method behaves similarly to ``"allow"``. However,
780+
it prints a warning if validation is necessary.
781+
- ``"forbid"``: The method never runs validation automatically and only
782+
returns if the ``schema.json`` stores a collection schema that matches
783+
this collection.
784+
- ``"skip"``: The method never runs validation and simply reads the
785+
data, entrusting the user that the schema is valid. _Use this option
786+
carefully_.
787+
788+
kwargs: Additional keyword arguments passed directly to
789+
:meth:`polars.scan_parquet` for all members.
678790
679791
Returns:
680792
The initialized collection.
681793
682794
Raises:
795+
ValidationRequiredError: If no collection schema can be read from the
796+
directory and ``validation`` is set to ``"forbid"``.
683797
ValueError: If the provided directory does not contain parquet files for
684798
all required members.
685799
686800
Note:
687-
If you want to eagerly read all Parquet files, consider calling
688-
:meth:`collect_all` on the returned collection.
801+
Due to current limitations in dataframely, this method actually reads the
802+
parquet file into memory if ``validation`` is ``"warn"`` or ``"allow"``
803+
and validation is required.
689804
690805
Attention:
691-
This method does **not** validate the contents of the Parquet file. Consider
692-
using :meth:`read_parquet` if you want to validate the collection.
806+
Be aware that this method suffers from the same limitations as
807+
:meth:`serialize`.
693808
"""
694-
data = {
695-
key: pl.scan_parquet(directory / f"{key}.parquet")
696-
for key in cls.members()
697-
if (directory / f"{key}.parquet").exists()
698-
}
699-
cls._validate_input_keys(data)
700-
return cls._init(data)
809+
path = Path(directory)
810+
data = cls._from_parquet(path, scan=True, **kwargs)
811+
if not cls._requires_validation_for_reading_parquets(path, validation):
812+
cls._validate_input_keys(data)
813+
return cls._init(data)
814+
return cls.validate(data, cast=True)
815+
816+
@classmethod
817+
def _from_parquet(
818+
cls, path: Path, scan: bool, **kwargs: Any
819+
) -> dict[str, pl.LazyFrame]:
820+
data = {}
821+
for key in cls.members():
822+
if (source_path := cls._member_source_path(path, key)) is not None:
823+
data[key] = (
824+
pl.scan_parquet(source_path, **kwargs)
825+
if scan
826+
else pl.read_parquet(source_path, **kwargs).lazy()
827+
)
828+
return data
829+
830+
@classmethod
831+
def _member_source_path(cls, base_path: Path, name: str) -> Path | None:
832+
if (path := base_path / name).exists() and base_path.is_dir():
833+
# We assume that the member is stored as a hive-partitioned dataset
834+
return path
835+
if (path := base_path / f"{name}.parquet").exists():
836+
# We assume that the member is stored as a single parquet file
837+
return path
838+
return None
839+
840+
@classmethod
841+
def _requires_validation_for_reading_parquets(
842+
cls,
843+
directory: Path,
844+
validation: Validation,
845+
) -> bool:
846+
if validation == "skip":
847+
return False
848+
849+
# First, we check whether the path provides the serialization of the collection.
850+
# If it does, we check whether it matches this collection. If it does, we assume
851+
# that the data adheres to the collection and we do not need to run validation.
852+
if (json_serialization := directory / "schema.json").exists():
853+
metadata = json_serialization.read_text()
854+
serialized_collection = deserialize_collection(metadata)
855+
if cls.matches(serialized_collection):
856+
return False
857+
else:
858+
serialized_collection = None
859+
860+
# Otherwise, we definitely need to run validation. However, we emit different
861+
# information to the user depending on the value of `validate`.
862+
msg = (
863+
"current collection schema does not match stored collection schema"
864+
if serialized_collection is not None
865+
else "no collection schema to check validity can be read from the source"
866+
)
867+
if validation == "forbid":
868+
raise ValidationRequiredError(
869+
f"Cannot read collection from '{directory!r}' without validation: {msg}."
870+
)
871+
if validation == "warn":
872+
warnings.warn(
873+
f"Reading parquet file from '{directory!r}' requires validation: {msg}."
874+
)
875+
return True
701876

702877
# ----------------------------------- UTILITIES ---------------------------------- #
703878

dataframely/schema.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -783,7 +783,7 @@ def read_parquet(
783783
784784
Raises:
785785
ValidationRequiredError: If no schema information can be read from the
786-
source and ``validate`` is set to ``False``.
786+
source and ``validation`` is set to ``"forbid"``.
787787
788788
Attention:
789789
Be aware that this method suffers from the same limitations as
@@ -824,7 +824,7 @@ def scan_parquet(
824824
- ``"skip"``: The method never runs validation and simply reads the
825825
parquet file, entrusting the user that the schema is valid. _Use this
826826
option carefully and consider replacing it with
827-
:meth:`polars.read_parquet` to convey the purpose better_.
827+
:meth:`polars.scan_parquet` to convey the purpose better_.
828828
829829
kwargs: Additional keyword arguments passed directly to
830830
:meth:`polars.scan_parquet`.
@@ -834,11 +834,11 @@ def scan_parquet(
834834
835835
Raises:
836836
ValidationRequiredError: If no schema information can be read from the
837-
source and ``validate`` is set to ``False``.
837+
source and ``validation`` is set to ``"forbid"``.
838838
839839
Note:
840840
Due to current limitations in dataframely, this method actually reads the
841-
parquet file into memory if ``validate`` is ``"auto"`` or ``True`` and
841+
parquet file into memory if ``validation`` is ``"warn"`` or ``"allow"`` and
842842
validation is required.
843843
844844
Attention:

tests/collection/test_base.py

Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,8 @@
11
# Copyright (c) QuantCo 2025-2025
22
# SPDX-License-Identifier: BSD-3-Clause
33

4-
from collections.abc import Callable
5-
from pathlib import Path
6-
74
import polars as pl
85
import pytest
9-
from polars.testing import assert_frame_equal
106

117
import dataframely as dy
128

@@ -113,39 +109,3 @@ def test_collect_all_optional() -> None:
113109
assert isinstance(out, MyCollection)
114110
assert len(out.first.collect()) == 3
115111
assert out.second is None
116-
117-
118-
@pytest.mark.parametrize(
119-
"read_fn", [MyCollection.scan_parquet, MyCollection.read_parquet]
120-
)
121-
def test_read_write_parquet(
122-
tmp_path: Path, read_fn: Callable[[Path], MyCollection]
123-
) -> None:
124-
collection = MyCollection.cast(
125-
{
126-
"first": pl.LazyFrame({"a": [1, 2, 3]}),
127-
"second": pl.LazyFrame({"a": [1, 2], "b": [10, 15]}),
128-
}
129-
)
130-
collection.write_parquet(tmp_path)
131-
132-
read = read_fn(tmp_path)
133-
assert_frame_equal(collection.first, read.first)
134-
assert collection.second is not None
135-
assert read.second is not None
136-
assert_frame_equal(collection.second, read.second)
137-
138-
139-
@pytest.mark.parametrize(
140-
"read_fn", [MyCollection.scan_parquet, MyCollection.read_parquet]
141-
)
142-
def test_read_write_parquet_optional(
143-
tmp_path: Path, read_fn: Callable[[Path], MyCollection]
144-
) -> None:
145-
collection = MyCollection.cast({"first": pl.LazyFrame({"a": [1, 2, 3]})})
146-
collection.write_parquet(tmp_path)
147-
148-
read = read_fn(tmp_path)
149-
assert_frame_equal(collection.first, read.first)
150-
assert collection.second is None
151-
assert read.second is None

0 commit comments

Comments
 (0)