Skip to content

Clean-up indexing adapter classes #10355

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion xarray/core/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@

from xarray.core.datatree_render import RenderDataTree
from xarray.core.duck_array_ops import array_all, array_any, array_equiv, astype, ravel
from xarray.core.indexing import MemoryCachedArray
from xarray.core.indexing import (
CoordinateTransformIndexingAdapter,
MemoryCachedArray,
PandasIndexingAdapter,
)
from xarray.core.options import OPTIONS, _get_boolean_with_default
from xarray.core.treenode import group_subtrees
from xarray.core.utils import is_duck_array
Expand Down Expand Up @@ -651,6 +655,12 @@ def short_array_repr(array):
def short_data_repr(array):
"""Format "data" for DataArray and Variable."""
internal_data = getattr(array, "variable", array)._data

if isinstance(
internal_data, PandasIndexingAdapter | CoordinateTransformIndexingAdapter
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't we get away with adding `_repr_inline_` to `CoordinateTransformIndexingAdapter`? I think it's preferable to avoid this kind of special casing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function does not format the inline repr but the data of a DataArray or variable.

We could add a short_data_repr method to those adapter classes and check if internal_data has such a method here, although it is not much different from this special casing.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess a better request is to see whether we can just reuse format_array_flat which already does indexing and should just work for these classes.

Copy link
Member Author

@benbovy benbovy May 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm again format_array_flat is for formatting the inline repr, whereas the special case introduced here is for formatting the data repr.

Without this special case, short_data_repr() will convert the indexing adapters into numpy arrays via PandasIndexingAdapter.get_duck_array() and CoordinateTransformIndexingAdapter.get_duck_array() over their full shape / size. For both the inline and the data reprs, we want to select only the first and last relevant items before doing this conversion.

first_n_items() and last_n_items() in xarray.core.formatting do similar things to PandasIndexingAdapter._get_array_subset() and CoordinateTransformIndexingAdapter._get_array_subset(). We could perhaps reuse the two former instead, although for the data repr (at least for CoordinateTransform, and possibly later for pd.IntervalIndex) we don't want a flattened result. So this would involve more refactoring. Also this wouldn't remove the special case here.

Alternatively we could tweak Variable._in_memory such that it returns False when ._data is a PandasIndexingAdapter (only when it wraps a pd.RangeIndex) or a CoordinateTransformIndexingAdapter, which will turn their data repr from, e.g.,

>>> xidx = PandasIndex(pd.RangeIndex(2_000_000_000), "x")
>>> ds = xr.Dataset(coords=xr.Coordinates.from_xindex(xidx))
>>> print(ds.x.variable)
<xarray.IndexVariable 'x' (x: 2000000000)> Size: 16GB
array([[         0,          1,          2, ..., 1999999997, 1999999998,
        1999999999]])

into

>>> print(ds.x.variable)
<xarray.IndexVariable 'x' (x: 2000000000)> Size: 16GB
[2000000000 values with dtype=dtype('int64')]

On one hand I like seeing the actual first and last values of a (lazy) range index or coordinate transform. On the other hand I find it a bit confusing that it is shown like a plain numpy array.

):
array = internal_data._get_array_subset()

if isinstance(array, np.ndarray):
return short_array_repr(array)
elif is_duck_array(internal_data):
Expand Down
187 changes: 61 additions & 126 deletions xarray/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from contextlib import suppress
from dataclasses import dataclass, field
from datetime import timedelta
from html import escape
from typing import TYPE_CHECKING, Any, cast, overload

import numpy as np
Expand Down Expand Up @@ -1778,18 +1777,25 @@ def __init__(
def dtype(self) -> np.dtype | pd.api.extensions.ExtensionDtype: # type: ignore[override]
return self._dtype

def _get_numpy_dtype(self, dtype: np.typing.DTypeLike | None = None) -> np.dtype:
if dtype is None:
if is_valid_numpy_dtype(self.dtype):
return cast(np.dtype, self.dtype)
else:
return get_valid_numpy_dtype(self.array)
else:
return np.dtype(dtype)

def __array__(
self,
dtype: np.typing.DTypeLike | None = None,
/,
*,
copy: bool | None = None,
) -> np.ndarray:
if dtype is None and is_valid_numpy_dtype(self.dtype):
dtype = cast(np.dtype, self.dtype)
else:
dtype = get_valid_numpy_dtype(self.array)
dtype = self._get_numpy_dtype(dtype)
array = self.array

if isinstance(array, pd.PeriodIndex):
with suppress(AttributeError):
# this might not be public API
Expand Down Expand Up @@ -1829,97 +1835,71 @@ def _convert_scalar(self, item) -> np.ndarray:
# numpy fails to convert pd.Timestamp to np.datetime64[ns]
item = np.asarray(item.to_datetime64())
elif self.dtype != object:
dtype = self.dtype
if pd.api.types.is_extension_array_dtype(dtype):
dtype = get_valid_numpy_dtype(self.array)
item = np.asarray(item, dtype=cast(np.dtype, dtype))
dtype = self._get_numpy_dtype()
item = np.asarray(item, dtype=dtype)

# as for numpy.ndarray indexing, we always want the result to be
# a NumPy array.
return to_0d_array(item)

def _prepare_key(self, key: Any | tuple[Any, ...]) -> tuple[Any, ...]:
if isinstance(key, tuple) and len(key) == 1:
def _index_get(
self, indexer: ExplicitIndexer, func_name: str
) -> PandasIndexingAdapter | np.ndarray:
key = indexer.tuple

if len(key) == 1:
# unpack key so it can index a pandas.Index object (pandas.Index
# objects don't like tuples)
(key,) = key

return key
# if multidimensional key, convert the index to numpy array and index the latter
if getattr(key, "ndim", 0) > 1:
indexable = NumpyIndexingAdapter(np.asarray(self))
return getattr(indexable, func_name)(indexer)

# otherwise index the pandas index then re-wrap or convert the result
result = self.array[key]

def _handle_result(
self, result: Any
) -> (
PandasIndexingAdapter
| NumpyIndexingAdapter
| np.ndarray
| np.datetime64
| np.timedelta64
):
if isinstance(result, pd.Index):
return type(self)(result, dtype=self.dtype)
else:
return self._convert_scalar(result)

def _oindex_get(
self, indexer: OuterIndexer
) -> (
PandasIndexingAdapter
| NumpyIndexingAdapter
| np.ndarray
| np.datetime64
| np.timedelta64
):
key = self._prepare_key(indexer.tuple)

if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional
indexable = NumpyIndexingAdapter(np.asarray(self))
return indexable.oindex[indexer]

result = self.array[key]

return self._handle_result(result)
def _oindex_get(self, indexer: OuterIndexer) -> PandasIndexingAdapter | np.ndarray:
return self._index_get(indexer, "_oindex_get")

def _vindex_get(
self, indexer: VectorizedIndexer
) -> (
PandasIndexingAdapter
| NumpyIndexingAdapter
| np.ndarray
| np.datetime64
| np.timedelta64
):
) -> PandasIndexingAdapter | np.ndarray:
_assert_not_chunked_indexer(indexer.tuple)
key = self._prepare_key(indexer.tuple)

if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional
indexable = NumpyIndexingAdapter(np.asarray(self))
return indexable.vindex[indexer]

result = self.array[key]

return self._handle_result(result)
return self._index_get(indexer, "_vindex_get")

def __getitem__(
self, indexer: ExplicitIndexer
) -> (
PandasIndexingAdapter
| NumpyIndexingAdapter
| np.ndarray
| np.datetime64
| np.timedelta64
):
key = self._prepare_key(indexer.tuple)
) -> PandasIndexingAdapter | np.ndarray:
return self._index_get(indexer, "__getitem__")

if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional
indexable = NumpyIndexingAdapter(np.asarray(self))
return indexable[indexer]
def transpose(self, order) -> pd.Index:
return self.array # self.array should be always one-dimensional

result = self.array[key]
def _get_array_subset(self) -> np.ndarray:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is only to share code between PandasIndexingAdapter and PandasMultiIndexAdapter?

Copy link
Member Author

@benbovy benbovy May 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is now used directly in PandasIndexingAdapter to avoid converting a large pd.RangeIndex into a numpy array just for showing the first and last few values.

In #10296 it is also used (overridden in PandasIntervalIndexingAdapter) to avoid converting a large pd.IntervalIndex into a 2-d array. EDIT: actually it isn't (yet).

# avoid converting a large pd.Index (especially pd.MultiIndex and pd.RangeIndex)
# into a numpy array for the array repr
threshold = max(100, OPTIONS["display_values_threshold"] + 2)
if self.size > threshold:
pos = threshold // 2
subset_start = (self[OuterIndexer((slice(pos),))],)
subset_end = (self[OuterIndexer((slice(-pos, None),))],)
return np.concatenate(
[np.asarray(subset_start), np.asarray(subset_end)], axis=-1
)
else:
return np.asarray(self)

return self._handle_result(result)
def _repr_inline_(self, max_width: int) -> str:
from xarray.core.formatting import format_array_flat

def transpose(self, order) -> pd.Index:
return self.array # self.array should be always one-dimensional
return format_array_flat(self._get_array_subset(), max_width)

def __repr__(self) -> str:
return f"{type(self).__name__}(array={self.array!r}, dtype={self.dtype!r})"
Expand All @@ -1939,7 +1919,9 @@ def copy(self, deep: bool = True) -> Self:
def nbytes(self) -> int:
if pd.api.types.is_extension_array_dtype(self.dtype):
return self.array.nbytes
return cast(np.dtype, self.dtype).itemsize * len(self.array)

dtype = self._get_numpy_dtype()
return dtype.itemsize * len(self.array)


class PandasMultiIndexingAdapter(PandasIndexingAdapter):
Expand Down Expand Up @@ -1972,56 +1954,29 @@ def __array__(
*,
copy: bool | None = None,
) -> np.ndarray:
if dtype is None:
dtype = cast(np.dtype, self.dtype)
dtype = self._get_numpy_dtype(dtype)

if self.level is not None:
return np.asarray(
self.array.get_level_values(self.level).values, dtype=dtype
)
else:
return super().__array__(dtype, copy=copy)

def _convert_scalar(self, item):
def _convert_scalar(self, item: Any):
if isinstance(item, tuple) and self.level is not None:
idx = tuple(self.array.names).index(self.level)
item = item[idx]
return super()._convert_scalar(item)

def _oindex_get(
self, indexer: OuterIndexer
) -> (
PandasIndexingAdapter
| NumpyIndexingAdapter
| np.ndarray
| np.datetime64
| np.timedelta64
):
result = super()._oindex_get(indexer)
if isinstance(result, type(self)):
result.level = self.level
return result

def _vindex_get(
self, indexer: VectorizedIndexer
) -> (
PandasIndexingAdapter
| NumpyIndexingAdapter
| np.ndarray
| np.datetime64
| np.timedelta64
):
result = super()._vindex_get(indexer)
def _index_get(
self, indexer: ExplicitIndexer, func_name: str
) -> PandasIndexingAdapter | np.ndarray:
result = super()._index_get(indexer, func_name)
if isinstance(result, type(self)):
result.level = self.level
return result

def __getitem__(self, indexer: ExplicitIndexer):
result = super().__getitem__(indexer)
if isinstance(result, type(self)):
result.level = self.level

return result

def __repr__(self) -> str:
if self.level is None:
return super().__repr__()
Expand All @@ -2031,31 +1986,11 @@ def __repr__(self) -> str:
)
return f"{type(self).__name__}{props}"

def _get_array_subset(self) -> np.ndarray:
# used to speed-up the repr for big multi-indexes
threshold = max(100, OPTIONS["display_values_threshold"] + 2)
if self.size > threshold:
pos = threshold // 2
indices = np.concatenate([np.arange(0, pos), np.arange(-pos, 0)])
subset = self[OuterIndexer((indices,))]
else:
subset = self

return np.asarray(subset)

def _repr_inline_(self, max_width: int) -> str:
from xarray.core.formatting import format_array_flat

if self.level is None:
return "MultiIndex"
else:
return format_array_flat(self._get_array_subset(), max_width)

def _repr_html_(self) -> str:
from xarray.core.formatting import short_array_repr

array_repr = short_array_repr(self._get_array_subset())
return f"<pre>{escape(array_repr)}</pre>"
return super()._repr_inline_(max_width=max_width)

def copy(self, deep: bool = True) -> Self:
# see PandasIndexingAdapter.copy
Expand Down
Loading