Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions docs/releases.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,21 @@
# Release notes

## v2.1.3 (unreleased)

### New Features

- Allow nested groups inside `ManifestStore` and `ManifestGroup` objects, and update `HDFParser` so that it can create nested `zarr.Group` objects.
([#790](https://github.com/zarr-developers/VirtualiZarr/pull/790)).
By [Ilan Gold](https://github.com/ilan-gold)

### Breaking changes

### Bug fixes

### Documentation

### Internal changes

## v2.1.2 (3rd September 2025)

Patch release with minor bug fixes for the `DMRPPParser` and Icechunk writing behavior.
Expand Down
8 changes: 1 addition & 7 deletions virtualizarr/manifests/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,7 @@ def __init__(
self._metadata = GroupMetadata(attributes=attributes)

_arrays: Mapping[str, ManifestArray] = {} if arrays is None else arrays

if groups:
# TODO add support for nested groups
raise NotImplementedError
else:
_groups: Mapping[str, ManifestGroup] = {} if groups is None else groups

_groups: Mapping[str, ManifestGroup] = {} if groups is None else groups
for name, arr in _arrays.items():
if not isinstance(arr, ManifestArray):
raise TypeError(
Expand Down
40 changes: 23 additions & 17 deletions virtualizarr/manifests/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from zarr.core.buffer import Buffer, BufferPrototype, default_buffer_prototype
from zarr.core.common import BytesLike

from virtualizarr.manifests.array import ManifestArray
from virtualizarr.manifests.group import ManifestGroup
from virtualizarr.manifests.utils import construct_chunk_pattern
from virtualizarr.registry import ObjectStoreRegistry
Expand Down Expand Up @@ -97,6 +98,16 @@ def parse_manifest_index(
return tuple(int(ind) for ind in chunk_component.split(chunk_key_encoding))


def get_deepest_group_or_array(
    node: ManifestGroup, key: list[str]
) -> ManifestGroup | ManifestArray:
    """
    Resolve a sequence of path components, starting at ``node``.

    Components are consumed left to right. As soon as a component names an
    array of the current group, that array is returned and any remaining
    components are ignored (they are left for the caller to interpret, e.g.
    as a chunk key). Otherwise the component must name a sub-group, which
    becomes the current group for the next component.

    Parameters
    ----------
    node
        Group at which the lookup starts.
    key
        Path components to follow. An empty list resolves to ``node`` itself.

    Returns
    -------
    ManifestGroup | ManifestArray
        The first array encountered along ``key``, or the group reached once
        every component has been consumed.

    Raises
    ------
    KeyError
        If a component names neither an array nor a sub-group of the group
        reached so far.
    """
    for depth, name in enumerate(key):
        # Arrays are checked first, so an array wins over a same-named group.
        if name in node.arrays:
            return node.arrays[name]
        try:
            node = node.groups[name]
        except KeyError:
            # Re-raise with the location of the failed lookup; the bare
            # mapping error only carries the component name.
            parent = "/".join(key[:depth]) or "<root>"
            raise KeyError(
                f"No array or group named {name!r} found under {parent!r}"
            ) from None
    return node


class ManifestStore(Store):
"""
A read-only Zarr store that uses obstore to read data from inside arbitrary files on AWS, GCP, Azure, or a local filesystem.
Expand Down Expand Up @@ -158,25 +169,20 @@ async def get(
byte_range: ByteRequest | None = None,
) -> Buffer | None:
# docstring inherited

if key == "zarr.json":
# Return group metadata
return self._group.metadata.to_buffer_dict(
prototype=default_buffer_prototype()
)["zarr.json"]
elif key.endswith("zarr.json"):
# Return array metadata
# TODO: Handle nested groups
var, _ = key.split("/")
return self._group.arrays[var].metadata.to_buffer_dict(
prototype=default_buffer_prototype()
)["zarr.json"]
var = key.split("/")[0]
marr = self._group.arrays[var]
manifest = marr.manifest
node = get_deepest_group_or_array(self._group, key.split("/")[:-1])
if key.endswith("zarr.json"):
# Return metadata
return node.metadata.to_buffer_dict(prototype=default_buffer_prototype())[
"zarr.json"
]
if isinstance(node, ManifestGroup):
raise ValueError(
"Key requested is a group but the key does not end in `zarr.json`"
)
manifest = node.manifest

chunk_indexes = parse_manifest_index(
key, marr.metadata.chunk_key_encoding.separator
key, node.metadata.chunk_key_encoding.separator
)

path = manifest._paths[chunk_indexes]
Expand Down
17 changes: 14 additions & 3 deletions virtualizarr/parsers/hdf/hdf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import math
from pathlib import Path
from typing import (
TYPE_CHECKING,
Iterable,
Expand Down Expand Up @@ -98,7 +99,6 @@ def _construct_manifest_group(
"""
Construct a virtual Group from a HDF dataset.
"""

import h5py

with h5py.File(reader, mode="r") as f:
Expand All @@ -116,11 +116,22 @@ def _construct_manifest_group(
arrays = {
key: _construct_manifest_array(filepath, dataset, group_name)
for key in g.keys()
if key not in drop_variables and isinstance(dataset := g[key], h5py.Dataset)
if key not in drop_variables
if isinstance(dataset := g[key], h5py.Dataset)
}
groups = {
key: _construct_manifest_group(
filepath,
reader,
group=str(Path(group) / key) if group is not None else key,
)
for key in g.keys()
if key not in drop_variables
if isinstance(g[key], h5py.Group)
}
attributes = _extract_attrs(g)

return ManifestGroup(arrays=arrays, attributes=attributes)
return ManifestGroup(arrays=arrays, groups=groups, attributes=attributes)


class HDFParser:
Expand Down
24 changes: 21 additions & 3 deletions virtualizarr/tests/test_manifests/test_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def _generate_manifest_store(
"bar": manifest_array,
"scalar": scalar_manifest_array,
},
groups={"subgroup": ManifestGroup(arrays={"foo": manifest_array})},
attributes={"Zarr": "Hooray!"},
)
registry = ObjectStoreRegistry({prefix: store})
Expand Down Expand Up @@ -205,7 +206,10 @@ def empty_memory_store():
fill_value=0,
)
manifest_array = ManifestArray(metadata=array_metadata, chunkmanifest=manifest)
manifest_group = ManifestGroup(arrays={"foo": manifest_array})
sub_group = ManifestGroup(arrays={"foo": manifest_array})
manifest_group = ManifestGroup(
arrays={"foo": manifest_array}, groups={"subgroup": sub_group}
)
registry = ObjectStoreRegistry({"memory://": store})
return ManifestStore(registry=registry, group=manifest_group)

Expand All @@ -229,6 +233,16 @@ async def test_get_empty_chunk(self, manifest_store, request):
observed = await store.get("foo/c.0.0", prototype=default_buffer_prototype())
assert observed is None

@pytest.mark.asyncio
@pytest.mark.parametrize("manifest_store", ["empty_memory_store"])
async def test_get_group_key_fails(self, manifest_store, request):
    """Requesting a key that names a group, without ``zarr.json``, raises ``ValueError``."""
    fixture_store = request.getfixturevalue(manifest_store)
    expect_group_error = pytest.raises(
        ValueError, match=r"Key requested is a group"
    )
    with expect_group_error:
        await fixture_store.get("subgroup", prototype=default_buffer_prototype())

@pytest.mark.asyncio
@pytest.mark.parametrize(
"manifest_store",
Expand Down Expand Up @@ -269,10 +283,14 @@ async def test_get_data(self, manifest_store, request):
"manifest_store",
["local_store", pytest.param("s3_store", marks=requires_minio)],
)
async def test_get_metadata(self, manifest_store, request):
@pytest.mark.parametrize(
"subgroup",
["", "subgroup/"],
)
async def test_get_metadata(self, manifest_store, request, subgroup):
store = request.getfixturevalue(manifest_store)
observed = await store.get(
"foo/zarr.json", prototype=default_buffer_prototype()
f"{subgroup}foo/zarr.json", prototype=default_buffer_prototype()
)
metadata = json.loads(observed.to_bytes())
assert metadata["chunk_grid"]["configuration"]["chunk_shape"] == [2, 2]
Expand Down
3 changes: 2 additions & 1 deletion virtualizarr/tests/test_parsers/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,8 @@ def nested_group_hdf5_url(tmp_path: Path) -> str:
g = f.create_group("group")
data = np.random.random((10, 10))
g.create_dataset("data", data=data)
g.create_group("nested_group")
g_nested = g.create_group("nested_group")
g_nested.create_dataset("data", data=data)

return f"file://{filepath}"

Expand Down
20 changes: 19 additions & 1 deletion virtualizarr/tests/test_parsers/test_hdf/test_hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import numpy as np
import pytest
import xarray as xr
import zarr
from obstore.store import from_url

from virtualizarr import open_virtual_dataset
Expand Down Expand Up @@ -129,12 +130,29 @@ def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_url):
manifest_store = manifest_store_from_hdf_url(chunked_dimensions_netcdf4_url)
assert len(manifest_store._group.arrays) == 3

def test_nested_groups_are_ignored(self, nested_group_hdf5_url):
def test_nested_groups_are_ignored_when_group_is_specificed(
    self, nested_group_hdf5_url
):
    """Opening the store at ``group="group"`` exposes exactly one array on its root."""
    store = manifest_store_from_hdf_url(nested_group_hdf5_url, group="group")
    array_count = len(store._group.arrays)
    assert array_count == 1

def test_nested_groups_are_detected(self, nested_group_hdf5_url):
    """Without a ``group`` argument, ``group/nested_group`` is reachable and holds one array."""
    store = manifest_store_from_hdf_url(nested_group_hdf5_url)
    nested = store._group["group"]["nested_group"]
    assert len(nested.arrays) == 1

def test_nested_data(self, nested_group_hdf5_url):
    """Array data read through zarr matches the HDF5 file at both nesting levels."""
    store = manifest_store_from_hdf_url(nested_group_hdf5_url)
    zarr_root = zarr.open_group(store, mode="r", zarr_format=3)

    # The fixture yields a ``file://`` URL; h5py needs the plain filesystem path.
    local_path = nested_group_hdf5_url.split("file://")[-1]
    with h5py.File(local_path, mode="r") as h5_file:
        h5_group = h5_file["group"]
        zarr_group = zarr_root["group"]
        np.testing.assert_array_equal(h5_group["data"], zarr_group["data"][...])
        np.testing.assert_array_equal(
            h5_group["nested_group"]["data"][...],
            zarr_group["nested_group"]["data"][...],
        )

def test_drop_variables(self, multiple_datasets_hdf5_url, local_registry):
parser = HDFParser(drop_variables=["data2"])
manifest_store = parser(url=multiple_datasets_hdf5_url, registry=local_registry)
Expand Down
1 change: 1 addition & 0 deletions virtualizarr/xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ def construct_virtual_dataset(

"""

# TODO: Remove private API `._group`
if group:
raise NotImplementedError("ManifestStore does not yet support nested groups")
else:
Expand Down