Clean-up indexing adapter classes #10355
base: main
Changes from all commits: bc94c6d, 17ff7e9, 2b25155, 9981078, 29098ac, 5f09354
@@ -9,7 +9,6 @@
from contextlib import suppress
from dataclasses import dataclass, field
from datetime import timedelta
from html import escape
from typing import TYPE_CHECKING, Any, cast, overload

import numpy as np

@@ -1778,18 +1777,25 @@ def __init__(
    def dtype(self) -> np.dtype | pd.api.extensions.ExtensionDtype:  # type: ignore[override]
        return self._dtype

    def _get_numpy_dtype(self, dtype: np.typing.DTypeLike | None = None) -> np.dtype:
        if dtype is None:
            if is_valid_numpy_dtype(self.dtype):
                return cast(np.dtype, self.dtype)
            else:
                return get_valid_numpy_dtype(self.array)
        else:
            return np.dtype(dtype)

    def __array__(
        self,
        dtype: np.typing.DTypeLike | None = None,
        /,
        *,
        copy: bool | None = None,
    ) -> np.ndarray:
        if dtype is None and is_valid_numpy_dtype(self.dtype):
            dtype = cast(np.dtype, self.dtype)
        else:
            dtype = get_valid_numpy_dtype(self.array)
        dtype = self._get_numpy_dtype(dtype)
        array = self.array

        if isinstance(array, pd.PeriodIndex):
            with suppress(AttributeError):
                # this might not be public API

@@ -1829,97 +1835,71 @@ def _convert_scalar(self, item) -> np.ndarray:
            # numpy fails to convert pd.Timestamp to np.datetime64[ns]
            item = np.asarray(item.to_datetime64())
        elif self.dtype != object:
            dtype = self.dtype
            if pd.api.types.is_extension_array_dtype(dtype):
                dtype = get_valid_numpy_dtype(self.array)
            item = np.asarray(item, dtype=cast(np.dtype, dtype))
            dtype = self._get_numpy_dtype()
            item = np.asarray(item, dtype=dtype)

        # as for numpy.ndarray indexing, we always want the result to be
        # a NumPy array.
        return to_0d_array(item)

    def _prepare_key(self, key: Any | tuple[Any, ...]) -> tuple[Any, ...]:
        if isinstance(key, tuple) and len(key) == 1:
    def _index_get(
        self, indexer: ExplicitIndexer, func_name: str
    ) -> PandasIndexingAdapter | np.ndarray:
        key = indexer.tuple

        if len(key) == 1:
            # unpack key so it can index a pandas.Index object (pandas.Index
            # objects don't like tuples)
            (key,) = key

        return key
        # if multidimensional key, convert the index to numpy array and index the latter
        if getattr(key, "ndim", 0) > 1:
            indexable = NumpyIndexingAdapter(np.asarray(self))
            return getattr(indexable, func_name)(indexer)

        # otherwise index the pandas index then re-wrap or convert the result
        result = self.array[key]

    def _handle_result(
        self, result: Any
    ) -> (
        PandasIndexingAdapter
        | NumpyIndexingAdapter
        | np.ndarray
        | np.datetime64
        | np.timedelta64
    ):
        if isinstance(result, pd.Index):
            return type(self)(result, dtype=self.dtype)
        else:
            return self._convert_scalar(result)

    def _oindex_get(
        self, indexer: OuterIndexer
    ) -> (
        PandasIndexingAdapter
        | NumpyIndexingAdapter
        | np.ndarray
        | np.datetime64
        | np.timedelta64
    ):
        key = self._prepare_key(indexer.tuple)

        if getattr(key, "ndim", 0) > 1:  # Return np-array if multidimensional
            indexable = NumpyIndexingAdapter(np.asarray(self))
            return indexable.oindex[indexer]

        result = self.array[key]

        return self._handle_result(result)
    def _oindex_get(self, indexer: OuterIndexer) -> PandasIndexingAdapter | np.ndarray:
        return self._index_get(indexer, "_oindex_get")

    def _vindex_get(
        self, indexer: VectorizedIndexer
    ) -> (
        PandasIndexingAdapter
        | NumpyIndexingAdapter
        | np.ndarray
        | np.datetime64
        | np.timedelta64
    ):
    ) -> PandasIndexingAdapter | np.ndarray:
        _assert_not_chunked_indexer(indexer.tuple)
        key = self._prepare_key(indexer.tuple)

        if getattr(key, "ndim", 0) > 1:  # Return np-array if multidimensional
            indexable = NumpyIndexingAdapter(np.asarray(self))
            return indexable.vindex[indexer]

        result = self.array[key]

        return self._handle_result(result)
        return self._index_get(indexer, "_vindex_get")

    def __getitem__(
        self, indexer: ExplicitIndexer
    ) -> (
        PandasIndexingAdapter
        | NumpyIndexingAdapter
        | np.ndarray
        | np.datetime64
        | np.timedelta64
    ):
        key = self._prepare_key(indexer.tuple)
    ) -> PandasIndexingAdapter | np.ndarray:
        return self._index_get(indexer, "__getitem__")

        if getattr(key, "ndim", 0) > 1:  # Return np-array if multidimensional
            indexable = NumpyIndexingAdapter(np.asarray(self))
            return indexable[indexer]
    def transpose(self, order) -> pd.Index:
        return self.array  # self.array should be always one-dimensional

        result = self.array[key]
    def _get_array_subset(self) -> np.ndarray:
Comment: So this is only to share code between PandasIndexingAdapter and PandasMultiIndexingAdapter?

Reply: This is now used directly in PandasIndexingAdapter to avoid converting a large pd.RangeIndex into a numpy array just for showing the first and last few values. In #10296 it is also used (overridden in PandasIntervalIndexingAdapter) to avoid converting a large pd.IntervalIndex into a 2-d array. EDIT: actually it isn't (yet).
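For readers skimming the thread, a standalone illustration of why the subset-first approach pays off (plain pandas/numpy, not code from this diff; the 50-items-per-end cutoff is arbitrary): slicing a pd.RangeIndex yields another lightweight RangeIndex, so only the previewed values ever get materialized.

```python
import numpy as np
import pandas as pd

idx = pd.RangeIndex(1_000_000_000)

# Head/tail slices of a RangeIndex are themselves RangeIndex objects, so this
# materializes only 100 values instead of the full index.
preview = np.concatenate([np.asarray(idx[:50]), np.asarray(idx[-50:])])

# By contrast, np.asarray(idx) would allocate all 10**9 int64 values (~8 GB).
print(preview[:3], preview[-3:])  # [0 1 2] [999999997 999999998 999999999]
```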

        # avoid converting a large pd.Index (especially pd.MultiIndex and pd.RangeIndex)
        # into a numpy array for the array repr
        threshold = max(100, OPTIONS["display_values_threshold"] + 2)
        if self.size > threshold:
            pos = threshold // 2
            subset_start = (self[OuterIndexer((slice(pos),))],)
            subset_end = (self[OuterIndexer((slice(-pos, None),))],)
            return np.concatenate(
                [np.asarray(subset_start), np.asarray(subset_end)], axis=-1
            )
        else:
            return np.asarray(self)

        return self._handle_result(result)
    def _repr_inline_(self, max_width: int) -> str:
        from xarray.core.formatting import format_array_flat

    def transpose(self, order) -> pd.Index:
        return self.array  # self.array should be always one-dimensional
        return format_array_flat(self._get_array_subset(), max_width)

    def __repr__(self) -> str:
        return f"{type(self).__name__}(array={self.array!r}, dtype={self.dtype!r})"

@@ -1939,7 +1919,9 @@ def copy(self, deep: bool = True) -> Self:
    def nbytes(self) -> int:
        if pd.api.types.is_extension_array_dtype(self.dtype):
            return self.array.nbytes
        return cast(np.dtype, self.dtype).itemsize * len(self.array)

        dtype = self._get_numpy_dtype()
        return dtype.itemsize * len(self.array)


class PandasMultiIndexingAdapter(PandasIndexingAdapter):

@@ -1972,56 +1954,29 @@ def __array__(
        *,
        copy: bool | None = None,
    ) -> np.ndarray:
        if dtype is None:
            dtype = cast(np.dtype, self.dtype)
        dtype = self._get_numpy_dtype(dtype)

        if self.level is not None:
            return np.asarray(
                self.array.get_level_values(self.level).values, dtype=dtype
            )
        else:
            return super().__array__(dtype, copy=copy)

    def _convert_scalar(self, item):
    def _convert_scalar(self, item: Any):
        if isinstance(item, tuple) and self.level is not None:
            idx = tuple(self.array.names).index(self.level)
            item = item[idx]
        return super()._convert_scalar(item)

    def _oindex_get(
        self, indexer: OuterIndexer
    ) -> (
        PandasIndexingAdapter
        | NumpyIndexingAdapter
        | np.ndarray
        | np.datetime64
        | np.timedelta64
    ):
        result = super()._oindex_get(indexer)
        if isinstance(result, type(self)):
            result.level = self.level
        return result

    def _vindex_get(
        self, indexer: VectorizedIndexer
    ) -> (
        PandasIndexingAdapter
        | NumpyIndexingAdapter
        | np.ndarray
        | np.datetime64
        | np.timedelta64
    ):
        result = super()._vindex_get(indexer)
    def _index_get(
        self, indexer: ExplicitIndexer, func_name: str
    ) -> PandasIndexingAdapter | np.ndarray:
        result = super()._index_get(indexer, func_name)
        if isinstance(result, type(self)):
            result.level = self.level
        return result

    def __getitem__(self, indexer: ExplicitIndexer):
        result = super().__getitem__(indexer)
        if isinstance(result, type(self)):
            result.level = self.level

        return result

    def __repr__(self) -> str:
        if self.level is None:
            return super().__repr__()

@@ -2031,31 +1986,11 @@ def __repr__(self) -> str:
            )
            return f"{type(self).__name__}{props}"

    def _get_array_subset(self) -> np.ndarray:
        # used to speed-up the repr for big multi-indexes
        threshold = max(100, OPTIONS["display_values_threshold"] + 2)
        if self.size > threshold:
            pos = threshold // 2
            indices = np.concatenate([np.arange(0, pos), np.arange(-pos, 0)])
            subset = self[OuterIndexer((indices,))]
        else:
            subset = self

        return np.asarray(subset)

    def _repr_inline_(self, max_width: int) -> str:
        from xarray.core.formatting import format_array_flat

        if self.level is None:
            return "MultiIndex"
        else:
            return format_array_flat(self._get_array_subset(), max_width)

    def _repr_html_(self) -> str:
        from xarray.core.formatting import short_array_repr

        array_repr = short_array_repr(self._get_array_subset())
        return f"<pre>{escape(array_repr)}</pre>"
        return super()._repr_inline_(max_width=max_width)

    def copy(self, deep: bool = True) -> Self:
        # see PandasIndexingAdapter.copy

Comment: Can't we get away with adding _repr_inline_ to CoordinateTransformIndexingAdapter? I think it's preferable to avoid this kind of special casing.

Reply: This function does not format the inline repr but the data of a DataArray or variable. We could add a short_data_repr method to those adapter classes and check whether internal_data has such a method here, although that is not much different from this special casing.

Reply: I guess a better request is to see whether we can just reuse format_array_flat, which already does indexing and should just work for these classes.
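To make the short_data_repr idea floated above concrete, here is a rough sketch of that duck-typed dispatch. It is illustrative only: the name internal_data follows the comment, and the real short_data_repr() in xarray.core.formatting is more involved than this.

```python
import numpy as np

def short_data_repr_sketch(internal_data) -> str:
    # If the wrapped object can hand back a small head/tail subset, format that
    # instead of converting the full index via get_duck_array()/np.asarray().
    if hasattr(internal_data, "_get_array_subset"):
        return repr(internal_data._get_array_subset())
    return repr(np.asarray(internal_data))
```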
Reply: Hmm, again, format_array_flat is for formatting the inline repr, whereas the special case introduced here is for formatting the data repr. Without this special case, short_data_repr() will convert the indexing adapters into numpy arrays via PandasIndexingAdapter.get_duck_array() and CoordinateTransformIndexingAdapter.get_duck_array() over their full shape / size. For both the inline and the data reprs, we want to select only the first and last relevant items before doing this conversion.

first_n_items() and last_n_items() in xarray.core.formatting do similar things to PandasIndexingAdapter._get_array_subset() and CoordinateTransformIndexingAdapter._get_array_subset(). We could perhaps reuse the former two instead, although for the data repr (at least for CoordinateTransform, and possibly later for pd.IntervalIndex) we don't want a flattened result. So this would involve more refactoring, and it wouldn't remove the special case here either.

Alternatively, we could tweak Variable._in_memory so that it returns False when ._data is a PandasIndexingAdapter (only when it wraps a pd.RangeIndex) or a CoordinateTransformIndexingAdapter, which would turn their data repr from showing the actual first and last values into the compact placeholder used for data that is not in memory. On one hand I like seeing the actual first and last values of a (lazy) range index or coordinate transform. On the other hand I find it a bit confusing that it is shown like a plain numpy array.
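For completeness, a sketch of the Variable._in_memory alternative from the last reply, written as a standalone predicate rather than a patch to the real property. The import path and the exact set of cases are assumptions based on the discussion above, not the actual _in_memory logic.

```python
import pandas as pd
from xarray.core.indexing import (
    CoordinateTransformIndexingAdapter,
    PandasIndexingAdapter,
)

def counts_as_in_memory(data) -> bool:
    # Treat coordinate transforms and RangeIndex-backed adapters as "lazy" so the
    # data repr falls back to the compact placeholder instead of real values.
    if isinstance(data, CoordinateTransformIndexingAdapter):
        return False
    if isinstance(data, PandasIndexingAdapter) and isinstance(data.array, pd.RangeIndex):
        return False
    return True
```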