From f625f5dce261972af0ba5140b61abf4dc61076a0 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 16:04:30 +0200 Subject: [PATCH 01/25] implement `compute` and `load` --- xarray/core/datatree.py | 83 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index e9e30da5f05..3676500ae3e 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -49,6 +49,8 @@ parse_dims_as_set, ) from xarray.core.variable import Variable +from xarray.namedarray.parallelcompat import get_chunked_array_type +from xarray.namedarray.pycompat import is_chunked_array try: from xarray.core.variable import calculate_dimensions @@ -1896,3 +1898,84 @@ def apply_indexers(dataset, node_indexers): indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "sel") return self._selective_indexing(apply_indexers, indexers) + + def load(self, **kwargs) -> Self: + """Manually trigger loading and/or computation of this datatree's data + from disk or a remote source into memory and return this datatree. + Unlike compute, the original datatree is modified and returned. + + Normally, it should not be necessary to call this method in user code, + because all xarray functions should either work on deferred data or + load data automatically. However, this method can be necessary when + working with many file objects on disk. + + Parameters + ---------- + **kwargs : dict + Additional keyword arguments passed on to ``dask.compute``. 
+ + See Also + -------- + dask.compute + """ + # access .data to coerce everything to numpy or dask arrays + lazy_data = { + path: { + k: v._data + for k, v in node.variables.items() + if is_chunked_array(v._data) + } + for path, node in self.subtree_with_keys + } + flat_lazy_data = { + (path, var_name): array + for path, node in lazy_data.items() + for var_name, array in node.items() + } + if lazy_data: + chunkmanager = get_chunked_array_type(*flat_lazy_data.values()) + + # evaluate all the chunked arrays simultaneously + evaluated_data: tuple[np.ndarray[Any, Any], ...] = chunkmanager.compute( + *flat_lazy_data.values(), **kwargs + ) + + for (path, var_name), data in zip( + flat_lazy_data, evaluated_data, strict=False + ): + self[path].variables[var_name].data = data + + # load everything else sequentially + for node in self.subtree: + for k, v in node.variables.items(): + if k not in lazy_data: + v.load() + + return self + + def compute(self, **kwargs) -> Self: + """Manually trigger loading and/or computation of this datatree's data + from disk or a remote source into memory and return a new dataset. + Unlike load, the original dataset is left unaltered. + + Normally, it should not be necessary to call this method in user code, + because all xarray functions should either work on deferred data or + load data automatically. However, this method can be necessary when + working with many file objects on disk. + + Parameters + ---------- + **kwargs : dict + Additional keyword arguments passed on to ``dask.compute``. + + Returns + ------- + object : DataTree + New object with lazy data variables and coordinates as in-memory arrays. 
+ + See Also + -------- + dask.compute + """ + new = self.copy(deep=False) + return new.load(**kwargs) From 507fb7d4a458e8f3294b72e614ff1713fcfbbb6c Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 16:04:46 +0200 Subject: [PATCH 02/25] also shallow-copy variables --- xarray/core/datatree.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 3676500ae3e..0c487d917ce 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -864,9 +864,9 @@ def _copy_node( ) -> Self: """Copy just one node of a tree.""" new_node = super()._copy_node(inherit=inherit, deep=deep, memo=memo) - data = self._to_dataset_view(rebuild_dims=False, inherit=inherit) - if deep: - data = data._copy(deep=True, memo=memo) + data = self._to_dataset_view(rebuild_dims=False, inherit=inherit)._copy( + deep=deep, memo=memo + ) new_node._set_node_data(data) return new_node From ce1683a7029fb34c682554746224595c0ff049c7 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 16:35:29 +0200 Subject: [PATCH 03/25] implement `chunksizes` --- xarray/core/datatree.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 0c487d917ce..48b08aacebf 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -18,7 +18,7 @@ from xarray.core._aggregations import DataTreeAggregations from xarray.core._typed_ops import DataTreeOpsMixin from xarray.core.alignment import align -from xarray.core.common import TreeAttrAccessMixin +from xarray.core.common import TreeAttrAccessMixin, get_chunksizes from xarray.core.coordinates import Coordinates, DataTreeCoordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset, DataVariables @@ -1979,3 +1979,21 @@ def compute(self, **kwargs) -> Self: """ new = self.copy(deep=False) return new.load(**kwargs) + + @property + def chunksizes(self) 
-> Mapping[Hashable, tuple[int, ...]]: + """ + Mapping from group paths to a mapping of dimension names to block lengths for this dataset's data, or None if + the underlying data is not a dask array. + + Cannot be modified directly, but can be modified by calling .chunk(). + + See Also + -------- + DataTree.chunk + Dataset.chunksizes + """ + return { + f"/{path}" if path != "." else "/": get_chunksizes(node.variables.values()) + for path, node in self.subtree_with_keys + } From f2a4683d9656630f3ff8cd54d80963b7e395efc3 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 16:44:39 +0200 Subject: [PATCH 04/25] add tests for `load` --- xarray/tests/test_datatree.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index 3be3fbd620d..f06553d5117 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -13,7 +13,12 @@ from xarray.core.datatree import DataTree from xarray.core.treenode import NotFoundInTreeError from xarray.testing import assert_equal, assert_identical -from xarray.tests import assert_array_equal, create_test_data, source_ndarray +from xarray.tests import ( + assert_array_equal, + create_test_data, + requires_dask, + source_ndarray, +) ON_WINDOWS = sys.platform == "win32" @@ -2195,3 +2200,28 @@ def test_close_dataset(self, tree_and_closers): # with tree: # pass + + +@requires_dask +class TestDask: + def test_load(self): + ds1 = xr.Dataset({"a": ("x", np.arange(10))}) + ds2 = xr.Dataset({"b": ("y", np.arange(5))}) + ds3 = xr.Dataset({"c": ("z", np.arange(4))}) + ds4 = xr.Dataset({"d": ("x", np.arange(-5, 5))}) + + expected = xr.DataTree.from_dict( + {"/": ds1, "/group1": ds2, "/group2": ds3, "/group1/subgroup1": ds4} + ) + tree = xr.DataTree.from_dict( + { + "/": ds1.chunk({"x": 5}), + "/group1": ds2.chunk({"y": 3}), + "/group2": ds3.chunk({"z": 2}), + "/group1/subgroup1": ds4.chunk({"x": 5}), + } + ) + 
actual = tree.load() + + assert_identical(actual, expected) + # assert_chunks_equal(actual, expected, enforce_dask=False) From f0ff30ff5eeb91161b44ddc25f3ab0fe383f02cb Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 16:59:36 +0200 Subject: [PATCH 05/25] add tests for `chunksizes` --- xarray/tests/test_datatree.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index f06553d5117..96722704002 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -2204,6 +2204,25 @@ def test_close_dataset(self, tree_and_closers): @requires_dask class TestDask: + def test_chunksizes(self): + ds1 = xr.Dataset({"a": ("x", np.arange(10))}) + ds2 = xr.Dataset({"b": ("y", np.arange(5))}) + ds3 = xr.Dataset({"c": ("z", np.arange(4))}) + ds4 = xr.Dataset({"d": ("x", np.arange(-5, 5))}) + + groups = { + "/": ds1.chunk({"x": 5}), + "/group1": ds2.chunk({"y": 3}), + "/group2": ds3.chunk({"z": 2}), + "/group1/subgroup1": ds4.chunk({"x": 5}), + } + + tree = xr.DataTree.from_dict(groups) + + expected_chunksizes = {path: node.chunksizes for path, node in groups.items()} + + assert tree.chunksizes == expected_chunksizes + def test_load(self): ds1 = xr.Dataset({"a": ("x", np.arange(10))}) ds2 = xr.Dataset({"b": ("y", np.arange(5))}) From d12203c2148145abb63723043eafd7ab2b8ba1fb Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 16:59:53 +0200 Subject: [PATCH 06/25] improve the `load` tests using `DataTree.chunksizes` --- xarray/tests/test_datatree.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index 96722704002..b7c6c7443f2 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -2240,7 +2240,10 @@ def test_load(self): "/group1/subgroup1": ds4.chunk({"x": 5}), } ) + expected_chunksizes = { + f"/{path}" if path != "." 
else "/": {} for path, _ in tree.subtree_with_keys + } actual = tree.load() assert_identical(actual, expected) - # assert_chunks_equal(actual, expected, enforce_dask=False) + assert tree.chunksizes == expected_chunksizes From dda02ed7ad3b2c22cb0f07535df1f580208a5a30 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 17:00:19 +0200 Subject: [PATCH 07/25] add a test for `compute` --- xarray/tests/test_datatree.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index b7c6c7443f2..c9be7a3d11b 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -2247,3 +2247,31 @@ def test_load(self): assert_identical(actual, expected) assert tree.chunksizes == expected_chunksizes + + def test_compute(self): + ds1 = xr.Dataset({"a": ("x", np.arange(10))}) + ds2 = xr.Dataset({"b": ("y", np.arange(5))}) + ds3 = xr.Dataset({"c": ("z", np.arange(4))}) + ds4 = xr.Dataset({"d": ("x", np.arange(-5, 5))}) + + expected = xr.DataTree.from_dict( + {"/": ds1, "/group1": ds2, "/group2": ds3, "/group1/subgroup1": ds4} + ) + tree = xr.DataTree.from_dict( + { + "/": ds1.chunk({"x": 5}), + "/group1": ds2.chunk({"y": 3}), + "/group2": ds3.chunk({"z": 2}), + "/group1/subgroup1": ds4.chunk({"x": 5}), + } + ) + original_chunksizes = tree.chunksizes + expected_chunksizes = { + f"/{path}" if path != "." 
else "/": {} for path, _ in tree.subtree_with_keys + } + actual = tree.compute() + + assert_identical(actual, expected) + + assert actual.chunksizes == expected_chunksizes, "mismatching chunksizes" + assert tree.chunksizes == original_chunksizes, "original tree was modified" From 329c6899778d9a19293859c6e928fc360c070c70 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 17:00:39 +0200 Subject: [PATCH 08/25] un-xfail a xpassing test --- xarray/tests/test_datatree.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index c9be7a3d11b..43a45389b17 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -863,7 +863,6 @@ def test_to_dict(self): actual = DataTree.from_dict(tree.children["a"].to_dict(relative=True)) assert_identical(expected, actual) - @pytest.mark.xfail def test_roundtrip_unnamed_root(self, simple_datatree) -> None: # See GH81 From c9fb461fa678e224f6c150b71165255e5af4a78f Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 17:15:04 +0200 Subject: [PATCH 09/25] implement and test `DataTree.chunk` --- xarray/core/datatree.py | 82 +++++++++++++++++++++++++++++++++++ xarray/tests/test_datatree.py | 23 ++++++++++ 2 files changed, 105 insertions(+) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 48b08aacebf..e955a2de669 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -70,8 +70,11 @@ ErrorOptions, ErrorOptionsWithWarn, NetcdfWriteModes, + T_ChunkDimFreq, + T_ChunksFreq, ZarrWriteModes, ) + from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint # """ # DEVELOPERS' NOTE @@ -1997,3 +2000,82 @@ def chunksizes(self) -> Mapping[Hashable, tuple[int, ...]]: f"/{path}" if path != "." 
else "/": get_chunksizes(node.variables.values()) for path, node in self.subtree_with_keys } + + def chunk( + self, + chunks: T_ChunksFreq = {}, # noqa: B006 # {} even though it's technically unsafe, is being used intentionally here (#4667) + name_prefix: str = "xarray-", + token: str | None = None, + lock: bool = False, + inline_array: bool = False, + chunked_array_type: str | ChunkManagerEntrypoint | None = None, + from_array_kwargs=None, + **chunks_kwargs: T_ChunkDimFreq, + ) -> Self: + """Coerce all arrays in all groups in this tree into dask arrays with the given + chunks. + + Non-dask arrays in this tree will be converted to dask arrays. Dask + arrays will be rechunked to the given chunk sizes. + + If chunks are not provided for one or more dimensions, chunk + sizes along that dimension will not be updated; non-dask arrays will be + converted into dask arrays with a single block. + + Along datetime-like dimensions, a :py:class:`groupers.TimeResampler` object is also accepted. + + Parameters + ---------- + chunks : int, tuple of int, "auto" or mapping of hashable to int or a TimeResampler, optional + Chunk sizes along each dimension, e.g., ``5``, ``"auto"``, or + ``{"x": 5, "y": 5}`` or ``{"x": 5, "time": TimeResampler(freq="YE")}``. + name_prefix : str, default: "xarray-" + Prefix for the name of any new dask arrays. + token : str, optional + Token uniquely identifying this dataset. + lock : bool, default: False + Passed on to :py:func:`dask.array.from_array`, if the array is not + already as dask array. + inline_array: bool, default: False + Passed on to :py:func:`dask.array.from_array`, if the array is not + already as dask array. + chunked_array_type: str, optional + Which chunked array type to coerce this datasets' arrays to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntryPoint` system. + Experimental API that should not be relied upon. 
+ from_array_kwargs: dict, optional + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example, with dask as the default chunked array type, this method would pass additional kwargs + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + **chunks_kwargs : {dim: chunks, ...}, optional + The keyword arguments form of ``chunks``. + One of chunks or chunks_kwargs must be provided + + Returns + ------- + chunked : xarray.DataTree + + See Also + -------- + Dataset.chunk + Dataset.chunksizes + xarray.unify_chunks + dask.array.from_array + """ + return DataTree.from_dict( + { + path: node.dataset.chunk( + chunks, + name_prefix=name_prefix, + token=token, + lock=lock, + inline_array=inline_array, + chunked_array_type=chunked_array_type, + from_array_kwargs=from_array_kwargs, + **chunks_kwargs, + ) + for path, node in self.subtree_with_keys + }, + name=self.name, + ) diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index 43a45389b17..e17f7ecee25 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -2274,3 +2274,26 @@ def test_compute(self): assert actual.chunksizes == expected_chunksizes, "mismatching chunksizes" assert tree.chunksizes == original_chunksizes, "original tree was modified" + + def test_chunk(self): + ds1 = xr.Dataset({"a": ("x", np.arange(10))}) + ds2 = xr.Dataset({"b": ("y", np.arange(5))}) + ds3 = xr.Dataset({"c": ("z", np.arange(4))}) + ds4 = xr.Dataset({"d": ("x", np.arange(-5, 5))}) + + expected = xr.DataTree.from_dict( + { + "/": ds1.chunk({"x": 5}), + "/group1": ds2.chunk({"y": 3}), + "/group2": ds3.chunk({"z": 2}), + "/group1/subgroup1": ds4.chunk({"x": 5}), + } + ) + + tree = xr.DataTree.from_dict( + {"/": ds1, "/group1": ds2, "/group2": ds3, "/group1/subgroup1": ds4} + ) + actual = tree.chunk({"x": 5, "y": 
3, "z": 2}) + + assert_identical(actual, expected) + assert actual.chunksizes == expected.chunksizes From 0305fc56f766b97cab545899b47869bb54d974e5 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 19:20:38 +0200 Subject: [PATCH 10/25] link to `Dataset.load` Co-authored-by: Tom Nicholas --- xarray/core/datatree.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index e955a2de669..eb55ae7bab6 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -1919,6 +1919,7 @@ def load(self, **kwargs) -> Self: See Also -------- + Dataset.load dask.compute """ # access .data to coerce everything to numpy or dask arrays From e2a3a1460c6db59665cd13e9a52d63fc68b5bf82 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 19:25:41 +0200 Subject: [PATCH 11/25] use `tree.subtree` to get absolute paths --- xarray/core/datatree.py | 3 +-- xarray/tests/test_datatree.py | 8 ++------ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index eb55ae7bab6..efa4a54d5b1 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -1998,8 +1998,7 @@ def chunksizes(self) -> Mapping[Hashable, tuple[int, ...]]: Dataset.chunksizes """ return { - f"/{path}" if path != "." else "/": get_chunksizes(node.variables.values()) - for path, node in self.subtree_with_keys + node.path: get_chunksizes(node.variables.values()) for node in self.subtree } def chunk( diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index e17f7ecee25..923c351e62c 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -2239,9 +2239,7 @@ def test_load(self): "/group1/subgroup1": ds4.chunk({"x": 5}), } ) - expected_chunksizes = { - f"/{path}" if path != "." 
else "/": {} for path, _ in tree.subtree_with_keys - } + expected_chunksizes = {node.path: {} for node in tree.subtree} actual = tree.load() assert_identical(actual, expected) @@ -2265,9 +2263,7 @@ def test_compute(self): } ) original_chunksizes = tree.chunksizes - expected_chunksizes = { - f"/{path}" if path != "." else "/": {} for path, _ in tree.subtree_with_keys - } + expected_chunksizes = {node.path: {} for node in tree.subtree} actual = tree.compute() assert_identical(actual, expected) From 7f57ffab94dac17a32cc33dd8a61d7dd6f61d0b2 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 19:34:38 +0200 Subject: [PATCH 12/25] filter out missing dims before delegating to `Dataset.chunk` --- xarray/core/datatree.py | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index efa4a54d5b1..268d9527ba1 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -2063,19 +2063,28 @@ def chunk( xarray.unify_chunks dask.array.from_array """ - return DataTree.from_dict( - { - path: node.dataset.chunk( - chunks, - name_prefix=name_prefix, - token=token, - lock=lock, - inline_array=inline_array, - chunked_array_type=chunked_array_type, - from_array_kwargs=from_array_kwargs, - **chunks_kwargs, - ) - for path, node in self.subtree_with_keys - }, - name=self.name, - ) + # don't support deprecated ways of passing chunks + if not isinstance(chunks, Mapping): + raise TypeError( + f"invalid type for chunks: {type(chunks)}. Only mappings are supported." 
+ ) + combined_chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") + + rechunked_groups = { + path: node.dataset.chunk( + { + dim: size + for dim, size in combined_chunks.items() + if dim in node.dataset.dims + }, + name_prefix=name_prefix, + token=token, + lock=lock, + inline_array=inline_array, + chunked_array_type=chunked_array_type, + from_array_kwargs=from_array_kwargs, + ) + for path, node in self.subtree_with_keys + } + + return DataTree.from_dict(rechunked_groups, name=self.name) From 900701bd38e249e227740e071e6b185fa2c9a666 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 19:41:13 +0200 Subject: [PATCH 13/25] fix the type hints for `DataTree.chunksizes` --- xarray/core/datatree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 268d9527ba1..d4615013cbe 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -1985,7 +1985,7 @@ def compute(self, **kwargs) -> Self: return new.load(**kwargs) @property - def chunksizes(self) -> Mapping[Hashable, tuple[int, ...]]: + def chunksizes(self) -> Mapping[str, Mapping[Hashable, tuple[int, ...]]]: """ Mapping from group paths to a mapping of dimension names to block lengths for this dataset's data, or None if the underlying data is not a dask array. 
From d45dbd0c6d07a47d601392daeb49773220692e56 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 19:42:23 +0200 Subject: [PATCH 14/25] try using `self.from_dict` instead --- xarray/core/datatree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index d4615013cbe..e8a319ce9d9 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -2087,4 +2087,4 @@ def chunk( for path, node in self.subtree_with_keys } - return DataTree.from_dict(rechunked_groups, name=self.name) + return self.from_dict(rechunked_groups, name=self.name) From 5f88937036dbfc6caa6e19e404d56b1bf4ecca65 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 19:45:28 +0200 Subject: [PATCH 15/25] type-hint intermediate test variables --- xarray/tests/test_datatree.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index 923c351e62c..e9dc8f5e274 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -1,6 +1,7 @@ import re import sys import typing +from collections.abc import Mapping from copy import copy, deepcopy from textwrap import dedent @@ -2239,6 +2240,7 @@ def test_load(self): "/group1/subgroup1": ds4.chunk({"x": 5}), } ) + expected_chunksizes: Mapping[str, Mapping] expected_chunksizes = {node.path: {} for node in tree.subtree} actual = tree.load() @@ -2263,6 +2265,7 @@ def test_compute(self): } ) original_chunksizes = tree.chunksizes + expected_chunksizes: Mapping[str, Mapping] expected_chunksizes = {node.path: {} for node in tree.subtree} actual = tree.compute() From 39d95f67469220733cc99b9eebdacdff46da1331 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 19:58:36 +0200 Subject: [PATCH 16/25] use `_node_dims` instead --- xarray/core/datatree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 
e8a319ce9d9..8ea362cfb30 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -2075,7 +2075,7 @@ def chunk( { dim: size for dim, size in combined_chunks.items() - if dim in node.dataset.dims + if dim in node._node_dims }, name_prefix=name_prefix, token=token, From 73b44666527be31728cb48e93cc9bf829f4d98b2 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 19:59:01 +0200 Subject: [PATCH 17/25] raise on unknown chunk dim --- xarray/core/datatree.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 8ea362cfb30..46f08e2bac4 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -2070,6 +2070,14 @@ def chunk( ) combined_chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk") + all_dims = self._get_all_dims() + + bad_dims = combined_chunks.keys() - all_dims + if bad_dims: + raise ValueError( + f"chunks keys {tuple(bad_dims)} not found in data dimensions {tuple(all_dims)}" + ) + rechunked_groups = { path: node.dataset.chunk( { From 8b35676bdac13ad501a2bd7eaa94e56a72a34d57 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 20:01:18 +0200 Subject: [PATCH 18/25] check that errors in `chunk` are raised properly --- xarray/tests/test_datatree.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index e9dc8f5e274..b6f15611bc7 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -2296,3 +2296,12 @@ def test_chunk(self): assert_identical(actual, expected) assert actual.chunksizes == expected.chunksizes + + with pytest.raises(TypeError, match="invalid type"): + tree.chunk(None) + + with pytest.raises(TypeError, match="invalid type"): + tree.chunk((1, 2)) + + with pytest.raises(ValueError, match="not found in data dimensions"): + tree.chunk({"u": 2}) From d70f4f0ab660456801252294edb0da9c1e62be23 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 
Oct 2024 20:05:24 +0200 Subject: [PATCH 19/25] adapt the docstrings of the new methods --- xarray/core/datatree.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 46f08e2bac4..9b1a9dae085 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -1959,8 +1959,8 @@ def load(self, **kwargs) -> Self: def compute(self, **kwargs) -> Self: """Manually trigger loading and/or computation of this datatree's data - from disk or a remote source into memory and return a new dataset. - Unlike load, the original dataset is left unaltered. + from disk or a remote source into memory and return a new datatree. + Unlike load, the original datatree is left unaltered. Normally, it should not be necessary to call this method in user code, because all xarray functions should either work on deferred data or @@ -1987,7 +1987,7 @@ def compute(self, **kwargs) -> Self: @property def chunksizes(self) -> Mapping[str, Mapping[Hashable, tuple[int, ...]]]: """ - Mapping from group paths to a mapping of dimension names to block lengths for this dataset's data, or None if + Mapping from group paths to a mapping of dimension names to block lengths for this datatree's data, or None if the underlying data is not a dask array. Cannot be modified directly, but can be modified by calling .chunk(). @@ -2032,7 +2032,7 @@ def chunk( name_prefix : str, default: "xarray-" Prefix for the name of any new dask arrays. token : str, optional - Token uniquely identifying this dataset. + Token uniquely identifying this datatree. lock : bool, default: False Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. @@ -2040,7 +2040,7 @@ def chunk( Passed on to :py:func:`dask.array.from_array`, if the array is not already as dask array. chunked_array_type: str, optional - Which chunked array type to coerce this datasets' arrays to. 
+ Which chunked array type to coerce this datatree's arrays to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntryPoint` system. Experimental API that should not be relied upon. from_array_kwargs: dict, optional From 3e9745f46a36f028f04ad98bd77de01da0c1a59e Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 20:21:40 +0200 Subject: [PATCH 20/25] allow computing / loading unchunked trees --- xarray/core/datatree.py | 2 +- xarray/tests/test_datatree.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 9b1a9dae085..4e7ce712d28 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -1936,7 +1936,7 @@ def load(self, **kwargs) -> Self: for path, node in lazy_data.items() for var_name, array in node.items() } - if lazy_data: + if flat_lazy_data: chunkmanager = get_chunked_array_type(*flat_lazy_data.values()) # evaluate all the chunked arrays simultaneously diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index b6f15611bc7..1fa93d9853d 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -2229,9 +2229,9 @@ def test_load(self): ds3 = xr.Dataset({"c": ("z", np.arange(4))}) ds4 = xr.Dataset({"d": ("x", np.arange(-5, 5))}) - expected = xr.DataTree.from_dict( - {"/": ds1, "/group1": ds2, "/group2": ds3, "/group1/subgroup1": ds4} - ) + groups = {"/": ds1, "/group1": ds2, "/group2": ds3, "/group1/subgroup1": ds4} + + expected = xr.DataTree.from_dict(groups) tree = xr.DataTree.from_dict( { "/": ds1.chunk({"x": 5}), @@ -2246,6 +2246,12 @@ def test_load(self): assert_identical(actual, expected) assert tree.chunksizes == expected_chunksizes + assert actual.chunksizes == expected_chunksizes + + tree = xr.DataTree.from_dict(groups) + actual = tree.load() + assert_identical(actual, expected) + assert actual.chunksizes == expected_chunksizes def test_compute(self): ds1 = 
xr.Dataset({"a": ("x", np.arange(10))}) From b6c5f9acc706b4dfb02096e1c0724122dd9ede65 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 20:31:38 +0200 Subject: [PATCH 21/25] reword the `chunksizes` properties --- xarray/core/dataset.py | 12 ++++++++---- xarray/core/datatree.py | 5 +++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ab3e2901dcf..66ceea17b91 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2658,8 +2658,10 @@ def info(self, buf: IO | None = None) -> None: @property def chunks(self) -> Mapping[Hashable, tuple[int, ...]]: """ - Mapping from dimension names to block lengths for this dataset's data, or None if - the underlying data is not a dask array. + Mapping from dimension names to block lengths for this dataset's data. + + If this dataset does not contain chunked arrays, the mapping will be empty. + Cannot be modified directly, but can be modified by calling .chunk(). Same as Dataset.chunksizes, but maintained for backwards compatibility. @@ -2675,8 +2677,10 @@ def chunks(self) -> Mapping[Hashable, tuple[int, ...]]: @property def chunksizes(self) -> Mapping[Hashable, tuple[int, ...]]: """ - Mapping from dimension names to block lengths for this dataset's data, or None if - the underlying data is not a dask array. + Mapping from dimension names to block lengths for this dataset's data. + + If this dataset does not contain chunked arrays, the mapping will be empty. + Cannot be modified directly, but can be modified by calling .chunk(). Same as Dataset.chunks. 
diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 4e7ce712d28..4819c424ae8 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -1987,8 +1987,9 @@ def compute(self, **kwargs) -> Self: @property def chunksizes(self) -> Mapping[str, Mapping[Hashable, tuple[int, ...]]]: """ - Mapping from group paths to a mapping of dimension names to block lengths for this datatree's data, or None if - the underlying data is not a dask array. + Mapping from group paths to a mapping of chunksizes. + + If there's no chunked data in a group, the corresponding mapping of chunksizes will be empty. Cannot be modified directly, but can be modified by calling .chunk(). From da8df3627b08c37fe59ddd525a36090fac1f5558 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 20:32:49 +0200 Subject: [PATCH 22/25] also freeze the top-level chunk sizes --- xarray/core/datatree.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 4819c424ae8..d4ee2621557 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -1998,9 +1998,12 @@ def chunksizes(self) -> Mapping[str, Mapping[Hashable, tuple[int, ...]]]: DataTree.chunk Dataset.chunksizes """ - return { - node.path: get_chunksizes(node.variables.values()) for node in self.subtree - } + return Frozen( + { + node.path: get_chunksizes(node.variables.values()) + for node in self.subtree + } + ) def chunk( self, From b11a1ef3d1758804213b1d5efedd7b8a872ae9e9 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 20:41:23 +0200 Subject: [PATCH 23/25] also reword `DataArray.chunksizes` --- xarray/core/dataarray.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 826acc7c7e9..d657c0c5e6b 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1343,8 +1343,10 @@ def chunks(self) -> tuple[tuple[int, ...], ...] 
| None: @property def chunksizes(self) -> Mapping[Any, tuple[int, ...]]: """ - Mapping from dimension names to block lengths for this dataarray's data, or None if - the underlying data is not a dask array. + Mapping from dimension names to block lengths for this dataset's data. + + If this dataset does not contain chunked arrays, the mapping will be empty. + Cannot be modified directly, but can be modified by calling .chunk(). Differs from DataArray.chunks because it returns a mapping of dimensions to chunk shapes From 53c0897469e4e1aa533dcc6c67757f243048eb76 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 20:48:14 +0200 Subject: [PATCH 24/25] fix a copy-paste error --- xarray/core/dataarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d657c0c5e6b..db7824d8c90 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1343,9 +1343,9 @@ def chunks(self) -> tuple[tuple[int, ...], ...] | None: @property def chunksizes(self) -> Mapping[Any, tuple[int, ...]]: """ - Mapping from dimension names to block lengths for this dataset's data. + Mapping from dimension names to block lengths for this dataarray's data. - If this dataset does not contain chunked arrays, the mapping will be empty. + If this dataarray does not contain chunked arrays, the mapping will be empty. Cannot be modified directly, but can be modified by calling .chunk(). 
From f7e31b4ef977db1933d9cf9e3d6f506a43085740 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 24 Oct 2024 20:49:30 +0200 Subject: [PATCH 25/25] same for `NamedArray.chunksizes` --- xarray/namedarray/core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xarray/namedarray/core.py b/xarray/namedarray/core.py index 6f5ed671de8..b753d26d622 100644 --- a/xarray/namedarray/core.py +++ b/xarray/namedarray/core.py @@ -725,8 +725,10 @@ def chunksizes( self, ) -> Mapping[_Dim, _Shape]: """ - Mapping from dimension names to block lengths for this namedArray's data, or None if - the underlying data is not a dask array. + Mapping from dimension names to block lengths for this NamedArray's data. + + If this NamedArray does not contain chunked arrays, the mapping will be empty. + Cannot be modified directly, but can be modified by calling .chunk(). Differs from NamedArray.chunks because it returns a mapping of dimensions to chunk shapes