Skip to content

Numcodecs in v3 #3037

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 10 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 31 additions & 9 deletions src/zarr/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from importlib.metadata import entry_points as get_entry_points
from typing import TYPE_CHECKING, Any, Generic, TypeVar

import numcodecs

from zarr.core.config import BadConfigError, config

if TYPE_CHECKING:
Expand Down Expand Up @@ -166,6 +168,20 @@
return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type]


def numcodec_to_zarr3_codec(codec: numcodecs.abc.Codec) -> Codec:
    """
    Convert a numcodecs codec to a zarr v3 compatible numcodecs.zarr3 codec instance.

    Parameters
    ----------
    codec : numcodecs.abc.Codec
        The codec to convert. The dict returned by its ``get_config()`` must
        contain an ``"id"`` entry; the remaining entries are passed on as the
        v3 codec configuration.

    Returns
    -------
    Codec
        The object returned by ``from_dict`` on the class looked up under the
        name ``numcodecs.<id>``.

    Raises
    ------
    ValueError
        If the configuration has no ``"id"`` entry (or it is ``None``), or if
        the class lookup returns ``None``.
    """
    # Work on a shallow copy: removing "id" must not alter a dict that the
    # codec (or its caller) may still be holding on to.
    codec_config = dict(codec.get_config())
    codec_name = codec_config.pop("id", None)
    if codec_name is None:
        raise ValueError(f"Codec configuration does not contain 'id': {codec_config}")
    # The same key is used for the lookup, the error message and the v3 "name".
    registry_key = f"numcodecs.{codec_name}"
    codec_cls = get_codec_class(registry_key)
    if codec_cls is None:
        raise ValueError(f"Codec class for '{registry_key}' not found.")
    return codec_cls.from_dict({"name": registry_key, "configuration": codec_config})

Check warning on line 182 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L175-L182

Added lines #L175 - L182 were not covered by tests


def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec:
"""
Normalize the input to a ``BytesBytesCodec`` instance.
Expand All @@ -174,15 +190,17 @@
"""
from zarr.abc.codec import BytesBytesCodec

if isinstance(data, dict):
if isinstance(data, numcodecs.abc.Codec):
result = numcodec_to_zarr3_codec(data)

Check warning on line 194 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L194

Added line #L194 was not covered by tests
elif isinstance(data, dict):
result = _resolve_codec(data)
if not isinstance(result, BytesBytesCodec):
msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead."
raise TypeError(msg)
else:
if not isinstance(data, BytesBytesCodec):
raise TypeError(f"Expected a BytesBytesCodec. Got {type(data)} instead.")
result = data
if not isinstance(result, BytesBytesCodec):
raise TypeError(f"Expected a BytesBytesCodec. Got {type(result)} instead.")

Check warning on line 203 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L203

Added line #L203 was not covered by tests
return result


Expand All @@ -194,15 +212,17 @@
"""
from zarr.abc.codec import ArrayBytesCodec

if isinstance(data, dict):
if isinstance(data, numcodecs.abc.Codec):
result = numcodec_to_zarr3_codec(data)

Check warning on line 216 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L216

Added line #L216 was not covered by tests
elif isinstance(data, dict):
result = _resolve_codec(data)
if not isinstance(result, ArrayBytesCodec):
msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead."
raise TypeError(msg)
else:
if not isinstance(data, ArrayBytesCodec):
raise TypeError(f"Expected a ArrayBytesCodec. Got {type(data)} instead.")
result = data
if not isinstance(result, ArrayBytesCodec):
raise TypeError(f"Expected a ArrayBytesCodec. Got {type(result)} instead.")

Check warning on line 225 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L225

Added line #L225 was not covered by tests
return result


Expand All @@ -214,15 +234,17 @@
"""
from zarr.abc.codec import ArrayArrayCodec

if isinstance(data, dict):
if isinstance(data, numcodecs.abc.Codec):
result = numcodec_to_zarr3_codec(data)
elif isinstance(data, dict):

Check warning on line 239 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L237-L239

Added lines #L237 - L239 were not covered by tests
result = _resolve_codec(data)
if not isinstance(result, ArrayArrayCodec):
msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead."
raise TypeError(msg)
else:
if not isinstance(data, ArrayArrayCodec):
raise TypeError(f"Expected a ArrayArrayCodec. Got {type(data)} instead.")
result = data
if not isinstance(result, ArrayArrayCodec):
raise TypeError(f"Expected a ArrayArrayCodec. Got {type(result)} instead.")

Check warning on line 247 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L246-L247

Added lines #L246 - L247 were not covered by tests
return result


Expand Down
75 changes: 75 additions & 0 deletions tests/test_codecs/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

import numcodecs
import numcodecs.zarr3
import numpy as np
import pytest

import zarr
import zarr.api
import zarr.api.asynchronous
from zarr import Array, AsyncArray, config
from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec
from zarr.codecs import (
BytesCodec,
GzipCodec,
Expand All @@ -23,7 +26,9 @@
from zarr.storage import StorePath

if TYPE_CHECKING:
from zarr.abc.codec import Codec
from zarr.abc.store import Store
from zarr.core.array import CompressorsLike, FiltersLike, SerializerLike
from zarr.core.buffer.core import NDArrayLikeOrScalar
from zarr.core.common import ChunkCoords, MemoryOrder

Expand Down Expand Up @@ -413,3 +418,73 @@ async def test_resize(store: Store) -> None:
assert await store.get(f"{path}/0.1", prototype=default_buffer_prototype()) is not None
assert await store.get(f"{path}/1.0", prototype=default_buffer_prototype()) is None
assert await store.get(f"{path}/1.1", prototype=default_buffer_prototype()) is None


@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
@pytest.mark.parametrize(
    ("codec_v2", "expected_v3_cls"),
    [
        (numcodecs.BZ2(), numcodecs.zarr3.BZ2),
        (numcodecs.CRC32(), numcodecs.zarr3.CRC32),
        (numcodecs.CRC32C(), numcodecs.zarr3.CRC32C),
        (numcodecs.LZ4(), numcodecs.zarr3.LZ4),
        (numcodecs.LZMA(), numcodecs.zarr3.LZMA),
        # ZFPY is left out: `numcodecs.ZFPY()` fails with
        # AttributeError: module 'numcodecs' has no attribute 'ZFPY'
        (numcodecs.Adler32(), numcodecs.zarr3.Adler32),
        (
            numcodecs.AsType(encode_dtype=np.float64, decode_dtype=np.float32),
            numcodecs.zarr3.AsType,
        ),
        (numcodecs.BitRound(keepbits=10), numcodecs.zarr3.BitRound),
        (numcodecs.Blosc(), numcodecs.zarr3.Blosc),
        (numcodecs.Delta(dtype=np.float64), numcodecs.zarr3.Delta),
        (
            numcodecs.FixedScaleOffset(offset=1000, scale=10, dtype="f8", astype="u1"),
            numcodecs.zarr3.FixedScaleOffset,
        ),
        (numcodecs.Fletcher32(), numcodecs.zarr3.Fletcher32),
        (numcodecs.GZip(), numcodecs.zarr3.GZip),
        (numcodecs.JenkinsLookup3(), numcodecs.zarr3.JenkinsLookup3),
        # PCodec is left out: `numcodecs.PCodec()` fails with
        # AttributeError: module 'numcodecs' has no attribute 'PCodec'
        (numcodecs.PackBits(), numcodecs.zarr3.PackBits),
        (numcodecs.Quantize(digits=1, dtype="f8"), numcodecs.zarr3.Quantize),
        (numcodecs.Shuffle(), numcodecs.zarr3.Shuffle),
        (numcodecs.Zlib(), numcodecs.zarr3.Zlib),
        (numcodecs.Zstd(), numcodecs.zarr3.Zstd),
    ],
)
def test_numcodecs_in_v3(
    store: Store, codec_v2: numcodecs.abc.Codec, expected_v3_cls: type[Codec]
) -> None:
    """
    Check that a numcodecs codec instance converts to the expected
    ``numcodecs.zarr3`` class and is accepted by ``zarr.create_array``.

    The converted codec must report the name ``numcodecs.<codec_id>`` and the
    same configuration as the original codec minus its ``"id"`` entry. The
    original (unconverted) codec is then passed to ``zarr.create_array`` in the
    slot matching the converted codec's kind.
    """
    import zarr.registry

    result_v3 = zarr.registry.numcodec_to_zarr3_codec(codec_v2)

    assert result_v3.__class__ == expected_v3_cls
    result_v3_dict = result_v3.to_dict()
    assert result_v3_dict["name"] == f"numcodecs.{codec_v2.codec_id}"
    codec_v2_config = codec_v2.get_config()
    codec_v2_config.pop("id")
    assert result_v3_dict["configuration"] == codec_v2_config

    # Put the original codec into whichever create_array argument matches the
    # kind of the converted codec; the other two keep their "auto" default.
    filters: FiltersLike = "auto"
    serializer: SerializerLike = "auto"
    compressors: CompressorsLike = "auto"
    if isinstance(result_v3, ArrayArrayCodec):
        filters = [codec_v2]
    elif isinstance(result_v3, ArrayBytesCodec):
        serializer = codec_v2
    elif isinstance(result_v3, BytesBytesCodec):
        compressors = [codec_v2]
    else:
        raise TypeError(f"unsupported type: {result_v3.__class__}")

    zarr.create_array(
        store,
        shape=(64,),
        chunks=(64,),
        # np.bool_ rather than np.bool: the plain `np.bool` alias does not
        # exist in NumPy 1.24-1.26, while `np.bool_` is available in 1.x and 2.x.
        dtype=np.bool_,
        fill_value=False,
        filters=filters,
        compressors=compressors,
        serializer=serializer,
    )
Loading