diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index feaea311db8..d64523a4a9f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,13 +24,13 @@ repos: - id: rst-inline-touching-normal - id: text-unicode-replacement-char - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.3 + rev: v0.14.6 hooks: - id: ruff-check args: ["--fix", "--show-fixes"] - id: ruff-format - repo: https://github.com/keewis/blackdoc - rev: v0.4.5 + rev: v0.4.6 hooks: - id: blackdoc exclude: "generate_aggregations.py" @@ -76,6 +76,6 @@ repos: - id: validate-pyproject additional_dependencies: ["validate-pyproject-schema-store[all]"] - repo: https://github.com/adhtruong/mirrors-typos - rev: v1.39.0 + rev: v1.39.2 hooks: - id: typos diff --git a/doc/api/dataarray.rst b/doc/api/dataarray.rst index 9d4e81c8677..8e4c2e77e11 100644 --- a/doc/api/dataarray.rst +++ b/doc/api/dataarray.rst @@ -162,6 +162,7 @@ Aggregation DataArray.min DataArray.mean DataArray.median + DataArray.nunique DataArray.prod DataArray.sum DataArray.std diff --git a/doc/api/dataset.rst b/doc/api/dataset.rst index 733c9768d2f..0c8e1e49679 100644 --- a/doc/api/dataset.rst +++ b/doc/api/dataset.rst @@ -169,6 +169,7 @@ Aggregation Dataset.min Dataset.mean Dataset.median + Dataset.nunique Dataset.prod Dataset.sum Dataset.std diff --git a/doc/api/datatree.rst b/doc/api/datatree.rst index 8501440b7d7..487e47c5927 100644 --- a/doc/api/datatree.rst +++ b/doc/api/datatree.rst @@ -266,6 +266,7 @@ Aggregate data in all nodes in the subtree simultaneously. DataTree.min DataTree.mean DataTree.median + DataTree.nunique DataTree.prod DataTree.sum DataTree.std diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 677b2194a55..fe3b4f58cc0 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,10 @@ New Features - :py:func:`combine_nested` now support :py:class:`DataTree` objects (:pull:`10849`). By `Stephan Hoyer `_. +- Add :py:func:`nunique` reduction function (:issue:`9548`), which behaves like + :py:func:`pandas.DataFrame.nunique` applied along specific dimensions. + By `Ewan Short `_. + Breaking Changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/core/_aggregations.py b/xarray/core/_aggregations.py index adc064840de..a04bcfbb88e 100644 --- a/xarray/core/_aggregations.py +++ b/xarray/core/_aggregations.py @@ -513,7 +513,7 @@ def mean( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -618,7 +618,7 @@ def prod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -733,7 +733,7 @@ def sum( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -845,7 +845,7 @@ def std( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -957,7 +957,7 @@ def var( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. 
datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -1065,7 +1065,7 @@ def median( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -1116,6 +1116,124 @@ def median( **kwargs, ) + def nunique( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + equalna: bool | None = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> Self: + """ + Reduce this DataTree's data by applying ``nunique`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If "..." or None, will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). + equalna : bool or None, default: True + If ``skipna == False``, ``equalna`` determines whether null values + are counted as distinct values or not. Set ``equalna = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equalna = False`` + for consistency with the `Python array API `_. + keep_attrs : bool or None, optional + If True, ``attrs`` will be copied from the original + object to the new one. If False, the new object will be + returned without attributes. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``nunique`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : DataTree + New DataTree with ``nunique`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + pandas.DataFrame.nunique + Dataset.nunique + DataArray.nunique + :ref:`agg` + User guide on reduction or aggregation operations. + + Notes + ----- + Note that identifying unique values on very large + arrays is slow and memory intensive when there are many unique values. + For such arrays, consider lowering the precision, e.g. rounding floats + then converting them to integers, before searching for unique values. + For dask arrays, performance is improved when chunksizes are largest on + the dimension(s) being reduced. + + Examples + -------- + >>> dt = xr.DataTree( + ... xr.Dataset( + ... data_vars=dict(foo=("time", np.array([1, 2, 3, 0, 2, np.nan]))), + ... coords=dict( + ... time=( + ... "time", + ... pd.date_range("2001-01-01", freq="ME", periods=6), + ... ), + ... labels=("time", np.array(["a", "b", "c", "c", "b", "a"])), + ... ), + ... ), + ... ) + >>> dt + + Group: / + Dimensions: (time: 6) + Coordinates: + * time (time) datetime64[ns] 48B 2001-01-31 2001-02-28 ... 2001-06-30 + labels (time) >> dt.nunique() + + Group: / + Dimensions: () + Data variables: + foo int64 8B 5 + + Use ``skipna`` to control whether NaNs are ignored. + + >>> dt.nunique(skipna=False) + + Group: / + Dimensions: () + Data variables: + foo int64 8B 5 + + Use ``equalna`` to control whether NaNs are counted as distinct values. 
+ + >>> dt.nunique(skipna=False, equalna=False) + + Group: / + Dimensions: () + Data variables: + foo int64 8B 5 + """ + return self.reduce( + duck_array_ops.nunique, + dim=dim, + skipna=skipna, + equalna=equalna, + numeric_only=False, + keep_attrs=keep_attrs, + **kwargs, + ) + def cumsum( self, dim: Dims = None, @@ -1164,7 +1282,7 @@ def cumsum( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -1269,7 +1387,7 @@ def cumprod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -1776,6 +1894,10 @@ def mean( :ref:`agg` User guide on reduction or aggregation operations. + Notes + ----- + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + Examples -------- >>> da = xr.DataArray( @@ -1872,7 +1994,7 @@ def prod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -1979,7 +2101,7 @@ def sum( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -2083,7 +2205,7 @@ def std( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -2187,7 +2309,7 @@ def var( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -2287,7 +2409,7 @@ def median( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -2332,6 +2454,116 @@ def median( **kwargs, ) + def nunique( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + equalna: bool | None = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> Self: + """ + Reduce this Dataset's data by applying ``nunique`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If "..." or None, will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). 
By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). + equalna : bool or None, default: True + If ``skipna == False``, ``equalna`` determines whether null values + are counted as distinct values or not. Set ``equalna = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equalna = False`` + for consistency with the `Python array API `_. + keep_attrs : bool or None, optional + If True, ``attrs`` will be copied from the original + object to the new one. If False, the new object will be + returned without attributes. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``nunique`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : Dataset + New Dataset with ``nunique`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + pandas.DataFrame.nunique + DataArray.nunique + :ref:`agg` + User guide on reduction or aggregation operations. + + Notes + ----- + Note that identifying unique values on very large + arrays is slow and memory intensive when there are many unique values. + For such arrays, consider lowering the precision, e.g. rounding floats + then converting them to integers, before searching for unique values. + For dask arrays, performance is improved when chunksizes are largest on + the dimension(s) being reduced. + + Examples + -------- + >>> da = xr.DataArray( + ... np.array([1, 2, 3, 0, 2, np.nan]), + ... dims="time", + ... coords=dict( + ... time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)), + ... labels=("time", np.array(["a", "b", "c", "c", "b", "a"])), + ... ), + ... ) + >>> ds = xr.Dataset(dict(da=da)) + >>> ds + Size: 120B + Dimensions: (time: 6) + Coordinates: + * time (time) datetime64[ns] 48B 2001-01-31 2001-02-28 ... 2001-06-30 + labels (time) >> ds.nunique() + Size: 8B + Dimensions: () + Data variables: + da int64 8B 5 + + Use ``skipna`` to control whether NaNs are ignored. + + >>> ds.nunique(skipna=False) + Size: 8B + Dimensions: () + Data variables: + da int64 8B 5 + + Use ``equalna`` to control whether NaNs are counted as distinct values. + + >>> ds.nunique(skipna=False, equalna=False) + Size: 8B + Dimensions: () + Data variables: + da int64 8B 5 + """ + return self.reduce( + duck_array_ops.nunique, + dim=dim, + skipna=skipna, + equalna=equalna, + numeric_only=False, + keep_attrs=keep_attrs, + **kwargs, + ) + def cumsum( self, dim: Dims = None, @@ -2379,7 +2611,7 @@ def cumsum( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -2477,7 +2709,7 @@ def cumprod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. 
``cumsum`` and ``cumprod`` may be deprecated @@ -2944,6 +3176,10 @@ def mean( :ref:`agg` User guide on reduction or aggregation operations. + Notes + ----- + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + Examples -------- >>> da = xr.DataArray( @@ -3032,7 +3268,7 @@ def prod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -3129,7 +3365,7 @@ def sum( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -3223,7 +3459,7 @@ def std( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -3317,7 +3553,7 @@ def var( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -3407,7 +3643,7 @@ def median( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -3444,6 +3680,106 @@ def median( **kwargs, ) + def nunique( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + equalna: bool | None = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> Self: + """ + Reduce this DataArray's data by applying ``nunique`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If "..." or None, will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). + equalna : bool or None, default: True + If ``skipna == False``, ``equalna`` determines whether null values + are counted as distinct values or not. Set ``equalna = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equalna = False`` + for consistency with the `Python array API `_. + keep_attrs : bool or None, optional + If True, ``attrs`` will be copied from the original + object to the new one. If False, the new object will be + returned without attributes. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``nunique`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : DataArray + New DataArray with ``nunique`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + pandas.DataFrame.nunique + Dataset.nunique + :ref:`agg` + User guide on reduction or aggregation operations. 
+ + Notes + ----- + Note that identifying unique values on very large + arrays is slow and memory intensive when there are many unique values. + For such arrays, consider lowering the precision, e.g. rounding floats + then converting them to integers, before searching for unique values. + For dask arrays, performance is improved when chunksizes are largest on + the dimension(s) being reduced. + + Examples + -------- + >>> da = xr.DataArray( + ... np.array([1, 2, 3, 0, 2, np.nan]), + ... dims="time", + ... coords=dict( + ... time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)), + ... labels=("time", np.array(["a", "b", "c", "c", "b", "a"])), + ... ), + ... ) + >>> da + Size: 48B + array([ 1., 2., 3., 0., 2., nan]) + Coordinates: + * time (time) datetime64[ns] 48B 2001-01-31 2001-02-28 ... 2001-06-30 + labels (time) >> da.nunique() + Size: 8B + array(5) + + Use ``skipna`` to control whether NaNs are ignored. + + >>> da.nunique(skipna=False) + Size: 8B + array(5) + + Use ``equalna`` to control whether NaNs are counted as distinct values. + + >>> da.nunique(skipna=False, equalna=False) + Size: 8B + array(5) + """ + return self.reduce( + duck_array_ops.nunique, + dim=dim, + skipna=skipna, + equalna=equalna, + keep_attrs=keep_attrs, + **kwargs, + ) + def cumsum( self, dim: Dims = None, @@ -3491,7 +3827,7 @@ def cumsum( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -3585,7 +3921,7 @@ def cumprod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -4223,6 +4559,8 @@ def mean( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + Examples -------- >>> da = xr.DataArray( @@ -4344,7 +4682,7 @@ def prod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -4479,7 +4817,7 @@ def sum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -4611,7 +4949,7 @@ def std( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. 
Examples -------- @@ -4743,7 +5081,7 @@ def var( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -4871,7 +5209,7 @@ def median( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -4920,6 +5258,128 @@ def median( **kwargs, ) + def nunique( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + equalna: bool | None = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> Dataset: + """ + Reduce this Dataset's data by applying ``nunique`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If None, will reduce over the GroupBy dimensions. + If "...", will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). + equalna : bool or None, default: True + If ``skipna == False``, ``equalna`` determines whether null values + are counted as distinct values or not. Set ``equalna = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equalna = False`` + for consistency with the `Python array API `_. + keep_attrs : bool or None, optional + If True, ``attrs`` will be copied from the original + object to the new one. If False, the new object will be + returned without attributes. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``nunique`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : Dataset + New Dataset with ``nunique`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + pandas.DataFrame.nunique + Dataset.nunique + :ref:`groupby` + User guide on groupby operations. + + Notes + ----- + Use the ``flox`` package to significantly speed up groupby computations, + especially with dask arrays. Xarray will use flox by default if installed. + Pass flox-specific keyword arguments in ``**kwargs``. + See the `flox documentation `_ for more. + + Note that identifying unique values on very large + arrays is slow and memory intensive when there are many unique values. + For such arrays, consider lowering the precision, e.g. rounding floats + then converting them to integers, before searching for unique values. + For dask arrays, performance is improved when chunksizes are largest on + the dimension(s) being reduced. + + Examples + -------- + >>> da = xr.DataArray( + ... np.array([1, 2, 3, 0, 2, np.nan]), + ... dims="time", + ... coords=dict( + ... time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)), + ... labels=("time", np.array(["a", "b", "c", "c", "b", "a"])), + ... ), + ... 
) + >>> ds = xr.Dataset(dict(da=da)) + >>> ds + Size: 120B + Dimensions: (time: 6) + Coordinates: + * time (time) datetime64[ns] 48B 2001-01-31 2001-02-28 ... 2001-06-30 + labels (time) >> ds.groupby("labels").nunique() + Size: 48B + Dimensions: (labels: 3) + Coordinates: + * labels (labels) object 24B 'a' 'b' 'c' + Data variables: + da (labels) int64 24B 2 1 2 + + Use ``skipna`` to control whether NaNs are ignored. + + >>> ds.groupby("labels").nunique(skipna=False) + Size: 48B + Dimensions: (labels: 3) + Coordinates: + * labels (labels) object 24B 'a' 'b' 'c' + Data variables: + da (labels) int64 24B 2 1 2 + + Use ``equalna`` to control whether NaNs are counted as distinct values. + + >>> ds.groupby("labels").nunique(skipna=False, equalna=False) + Size: 48B + Dimensions: (labels: 3) + Coordinates: + * labels (labels) object 24B 'a' 'b' 'c' + Data variables: + da (labels) int64 24B 2 1 2 + """ + return self.reduce( + duck_array_ops.nunique, + dim=dim, + skipna=skipna, + equalna=equalna, + numeric_only=False, + keep_attrs=keep_attrs, + **kwargs, + ) + def cumsum( self, dim: Dims = None, @@ -4973,7 +5433,7 @@ def cumsum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -5077,7 +5537,7 @@ def cumprod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -5719,6 +6179,8 @@ def mean( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + Examples -------- >>> da = xr.DataArray( @@ -5840,7 +6302,7 @@ def prod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -5975,7 +6437,7 @@ def sum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -6107,7 +6569,7 @@ def std( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. 
Examples -------- @@ -6206,29 +6668,157 @@ def var( skips missing values for float dtypes; other dtypes either do not have a sentinel missing value (int) or ``skipna=True`` has not been implemented (object, datetime64 or timedelta64). - ddof : int, default: 0 - “Delta Degrees of Freedom”: the divisor used in the calculation is ``N - ddof``, - where ``N`` represents the number of elements. + ddof : int, default: 0 + “Delta Degrees of Freedom”: the divisor used in the calculation is ``N - ddof``, + where ``N`` represents the number of elements. + keep_attrs : bool or None, optional + If True, ``attrs`` will be copied from the original + object to the new one. If False, the new object will be + returned without attributes. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``var`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : Dataset + New Dataset with ``var`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + numpy.var + dask.array.var + Dataset.var + :ref:`resampling` + User guide on resampling operations. + + Notes + ----- + Use the ``flox`` package to significantly speed up resampling computations, + especially with dask arrays. Xarray will use flox by default if installed. + Pass flox-specific keyword arguments in ``**kwargs``. + See the `flox documentation `_ for more. + + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + + Examples + -------- + >>> da = xr.DataArray( + ... np.array([1, 2, 3, 0, 2, np.nan]), + ... dims="time", + ... coords=dict( + ... time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)), + ... labels=("time", np.array(["a", "b", "c", "c", "b", "a"])), + ... ), + ... ) + >>> ds = xr.Dataset(dict(da=da)) + >>> ds + Size: 120B + Dimensions: (time: 6) + Coordinates: + * time (time) datetime64[ns] 48B 2001-01-31 2001-02-28 ... 2001-06-30 + labels (time) >> ds.resample(time="3ME").var() + Size: 48B + Dimensions: (time: 3) + Coordinates: + * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 + Data variables: + da (time) float64 24B 0.0 1.556 0.0 + + Use ``skipna`` to control whether NaNs are ignored. + + >>> ds.resample(time="3ME").var(skipna=False) + Size: 48B + Dimensions: (time: 3) + Coordinates: + * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 + Data variables: + da (time) float64 24B 0.0 1.556 nan + + Specify ``ddof=1`` for an unbiased estimate. + + >>> ds.resample(time="3ME").var(skipna=True, ddof=1) + Size: 48B + Dimensions: (time: 3) + Coordinates: + * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 + Data variables: + da (time) float64 24B nan 2.333 nan + """ + if ( + flox_available + and OPTIONS["use_flox"] + and contains_only_chunked_or_numpy(self._obj) + ): + return self._flox_reduce( + func="var", + dim=dim, + skipna=skipna, + ddof=ddof, + numeric_only=True, + # fill_value=fill_value, + keep_attrs=keep_attrs, + **kwargs, + ) + else: + return self.reduce( + duck_array_ops.var, + dim=dim, + skipna=skipna, + ddof=ddof, + numeric_only=True, + keep_attrs=keep_attrs, + **kwargs, + ) + + def median( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> Dataset: + """ + Reduce this Dataset's data by applying ``median`` along some dimension(s). 
+ + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``median``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If None, will reduce over the Resample dimensions. + If "...", will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). keep_attrs : bool or None, optional If True, ``attrs`` will be copied from the original object to the new one. If False, the new object will be returned without attributes. **kwargs : Any Additional keyword arguments passed on to the appropriate array - function for calculating ``var`` on this object's data. + function for calculating ``median`` on this object's data. These could include dask-specific kwargs like ``split_every``. Returns ------- reduced : Dataset - New Dataset with ``var`` applied to its data and the + New Dataset with ``median`` applied to its data and the indicated dimension(s) removed See Also -------- - numpy.var - dask.array.var - Dataset.var + numpy.median + dask.array.median + Dataset.median :ref:`resampling` User guide on resampling operations. @@ -6239,7 +6829,7 @@ def var( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -6261,75 +6851,49 @@ def var( Data variables: da (time) float64 48B 1.0 2.0 3.0 0.0 2.0 nan - >>> ds.resample(time="3ME").var() + >>> ds.resample(time="3ME").median() Size: 48B Dimensions: (time: 3) Coordinates: * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 24B 0.0 1.556 0.0 + da (time) float64 24B 1.0 2.0 2.0 Use ``skipna`` to control whether NaNs are ignored. - >>> ds.resample(time="3ME").var(skipna=False) - Size: 48B - Dimensions: (time: 3) - Coordinates: - * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 - Data variables: - da (time) float64 24B 0.0 1.556 nan - - Specify ``ddof=1`` for an unbiased estimate. - - >>> ds.resample(time="3ME").var(skipna=True, ddof=1) + >>> ds.resample(time="3ME").median(skipna=False) Size: 48B Dimensions: (time: 3) Coordinates: * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 24B nan 2.333 nan + da (time) float64 24B 1.0 2.0 nan """ - if ( - flox_available - and OPTIONS["use_flox"] - and contains_only_chunked_or_numpy(self._obj) - ): - return self._flox_reduce( - func="var", - dim=dim, - skipna=skipna, - ddof=ddof, - numeric_only=True, - # fill_value=fill_value, - keep_attrs=keep_attrs, - **kwargs, - ) - else: - return self.reduce( - duck_array_ops.var, - dim=dim, - skipna=skipna, - ddof=ddof, - numeric_only=True, - keep_attrs=keep_attrs, - **kwargs, - ) + return self.reduce( + duck_array_ops.median, + dim=dim, + skipna=skipna, + numeric_only=True, + keep_attrs=keep_attrs, + **kwargs, + ) - def median( + def nunique( self, dim: Dims = None, *, skipna: bool | None = None, + equalna: bool | None = True, keep_attrs: bool | None = None, **kwargs: Any, ) -> Dataset: """ - Reduce this Dataset's data by applying ``median`` along some dimension(s). 
+ Reduce this Dataset's data by applying ``nunique`` along some dimension(s). Parameters ---------- dim : str, Iterable of Hashable, "..." or None, default: None - Name of dimension[s] along which to apply ``median``. For e.g. ``dim="x"`` + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` or ``dim=["x", "y"]``. If None, will reduce over the Resample dimensions. If "...", will reduce over all dimensions. skipna : bool or None, optional @@ -6337,26 +6901,30 @@ def median( skips missing values for float dtypes; other dtypes either do not have a sentinel missing value (int) or ``skipna=True`` has not been implemented (object, datetime64 or timedelta64). + equalna : bool or None, default: True + If ``skipna == False``, ``equalna`` determines whether null values + are counted as distinct values or not. Set ``equalna = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equalna = False`` + for consistency with the `Python array API `_. keep_attrs : bool or None, optional If True, ``attrs`` will be copied from the original object to the new one. If False, the new object will be returned without attributes. **kwargs : Any Additional keyword arguments passed on to the appropriate array - function for calculating ``median`` on this object's data. + function for calculating ``nunique`` on this object's data. These could include dask-specific kwargs like ``split_every``. Returns ------- reduced : Dataset - New Dataset with ``median`` applied to its data and the + New Dataset with ``nunique`` applied to its data and the indicated dimension(s) removed See Also -------- - numpy.median - dask.array.median - Dataset.median + pandas.DataFrame.nunique + Dataset.nunique :ref:`resampling` User guide on resampling operations. @@ -6367,7 +6935,12 @@ def median( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Note that identifying unique values on very large + arrays is slow and memory intensive when there are many unique values. + For such arrays, consider lowering the precision, e.g. rounding floats + then converting them to integers, before searching for unique values. + For dask arrays, performance is improved when chunksizes are largest on + the dimension(s) being reduced. Examples -------- @@ -6389,29 +6962,40 @@ def median( Data variables: da (time) float64 48B 1.0 2.0 3.0 0.0 2.0 nan - >>> ds.resample(time="3ME").median() + >>> ds.resample(time="3ME").nunique() Size: 48B Dimensions: (time: 3) Coordinates: * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 24B 1.0 2.0 2.0 + da (time) int64 24B 1 3 2 Use ``skipna`` to control whether NaNs are ignored. - >>> ds.resample(time="3ME").median(skipna=False) + >>> ds.resample(time="3ME").nunique(skipna=False) Size: 48B Dimensions: (time: 3) Coordinates: * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 24B 1.0 2.0 nan + da (time) int64 24B 1 3 2 + + Use ``equalna`` to control whether NaNs are counted as distinct values. 
+ + >>> ds.resample(time="3ME").nunique(skipna=False, equalna=False) + Size: 48B + Dimensions: (time: 3) + Coordinates: + * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 + Data variables: + da (time) int64 24B 1 3 2 """ return self.reduce( - duck_array_ops.median, + duck_array_ops.nunique, dim=dim, skipna=skipna, - numeric_only=True, + equalna=equalna, + numeric_only=False, keep_attrs=keep_attrs, **kwargs, ) @@ -6469,7 +7053,7 @@ def cumsum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -6573,7 +7157,7 @@ def cumprod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -7176,6 +7760,8 @@ def mean( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + Examples -------- >>> da = xr.DataArray( @@ -7288,7 +7874,7 @@ def prod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -7412,7 +7998,7 @@ def sum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -7533,7 +8119,7 @@ def std( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -7654,7 +8240,7 @@ def var( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -7771,7 +8357,7 @@ def median( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. 
Examples -------- @@ -7812,6 +8398,118 @@ def median( **kwargs, ) + def nunique( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + equalna: bool | None = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> DataArray: + """ + Reduce this DataArray's data by applying ``nunique`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If None, will reduce over the GroupBy dimensions. + If "...", will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). + equalna : bool or None, default: True + If ``skipna == False``, ``equalna`` determines whether null values + are counted as distinct values or not. Set ``equalna = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equalna = False`` + for consistency with the `Python array API `_. + keep_attrs : bool or None, optional + If True, ``attrs`` will be copied from the original + object to the new one. If False, the new object will be + returned without attributes. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``nunique`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : DataArray + New DataArray with ``nunique`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + pandas.DataFrame.nunique + DataArray.nunique + :ref:`groupby` + User guide on groupby operations. + + Notes + ----- + Use the ``flox`` package to significantly speed up groupby computations, + especially with dask arrays. Xarray will use flox by default if installed. + Pass flox-specific keyword arguments in ``**kwargs``. + See the `flox documentation `_ for more. + + Note that identifying unique values on very large + arrays is slow and memory intensive when there are many unique values. + For such arrays, consider lowering the precision, e.g. rounding floats + then converting them to integers, before searching for unique values. + For dask arrays, performance is improved when chunksizes are largest on + the dimension(s) being reduced. + + Examples + -------- + >>> da = xr.DataArray( + ... np.array([1, 2, 3, 0, 2, np.nan]), + ... dims="time", + ... coords=dict( + ... time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)), + ... labels=("time", np.array(["a", "b", "c", "c", "b", "a"])), + ... ), + ... ) + >>> da + Size: 48B + array([ 1., 2., 3., 0., 2., nan]) + Coordinates: + * time (time) datetime64[ns] 48B 2001-01-31 2001-02-28 ... 2001-06-30 + labels (time) >> da.groupby("labels").nunique() + Size: 24B + array([2, 1, 2]) + Coordinates: + * labels (labels) object 24B 'a' 'b' 'c' + + Use ``skipna`` to control whether NaNs are ignored. + + >>> da.groupby("labels").nunique(skipna=False) + Size: 24B + array([2, 1, 2]) + Coordinates: + * labels (labels) object 24B 'a' 'b' 'c' + + Use ``equalna`` to control whether NaNs are counted as distinct values. 
+ + >>> da.groupby("labels").nunique(skipna=False, equalna=False) + Size: 24B + array([2, 1, 2]) + Coordinates: + * labels (labels) object 24B 'a' 'b' 'c' + """ + return self.reduce( + duck_array_ops.nunique, + dim=dim, + skipna=skipna, + equalna=equalna, + keep_attrs=keep_attrs, + **kwargs, + ) + def cumsum( self, dim: Dims = None, @@ -7865,7 +8563,7 @@ def cumsum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -7965,7 +8663,7 @@ def cumprod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -8564,6 +9262,8 @@ def mean( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + Examples -------- >>> da = xr.DataArray( @@ -8676,7 +9376,7 @@ def prod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -8800,7 +9500,7 @@ def sum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -8921,7 +9621,7 @@ def std( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -9042,7 +9742,7 @@ def var( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -9159,7 +9859,7 @@ def median( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. 
Examples -------- @@ -9200,6 +9900,118 @@ def median( **kwargs, ) + def nunique( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + equalna: bool | None = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> DataArray: + """ + Reduce this DataArray's data by applying ``nunique`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If None, will reduce over the Resample dimensions. + If "...", will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). + equalna : bool or None, default: True + If ``skipna == False``, ``equalna`` determines whether null values + are counted as distinct values or not. Set ``equalna = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equalna = False`` + for consistency with the `Python array API `_. + keep_attrs : bool or None, optional + If True, ``attrs`` will be copied from the original + object to the new one. If False, the new object will be + returned without attributes. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``nunique`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : DataArray + New DataArray with ``nunique`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + pandas.DataFrame.nunique + DataArray.nunique + :ref:`resampling` + User guide on resampling operations. + + Notes + ----- + Use the ``flox`` package to significantly speed up resampling computations, + especially with dask arrays. Xarray will use flox by default if installed. + Pass flox-specific keyword arguments in ``**kwargs``. + See the `flox documentation `_ for more. + + Note that identifying unique values on very large + arrays is slow and memory intensive when there are many unique values. + For such arrays, consider lowering the precision, e.g. rounding floats + then converting them to integers, before searching for unique values. + For dask arrays, performance is improved when chunksizes are largest on + the dimension(s) being reduced. + + Examples + -------- + >>> da = xr.DataArray( + ... np.array([1, 2, 3, 0, 2, np.nan]), + ... dims="time", + ... coords=dict( + ... time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)), + ... labels=("time", np.array(["a", "b", "c", "c", "b", "a"])), + ... ), + ... ) + >>> da + Size: 48B + array([ 1., 2., 3., 0., 2., nan]) + Coordinates: + * time (time) datetime64[ns] 48B 2001-01-31 2001-02-28 ... 2001-06-30 + labels (time) >> da.resample(time="3ME").nunique() + Size: 24B + array([1, 3, 2]) + Coordinates: + * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 + + Use ``skipna`` to control whether NaNs are ignored. + + >>> da.resample(time="3ME").nunique(skipna=False) + Size: 24B + array([1, 3, 2]) + Coordinates: + * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 + + Use ``equalna`` to control whether NaNs are counted as distinct values. 
+ + >>> da.resample(time="3ME").nunique(skipna=False, equalna=False) + Size: 24B + array([1, 3, 2]) + Coordinates: + * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 + """ + return self.reduce( + duck_array_ops.nunique, + dim=dim, + skipna=skipna, + equalna=equalna, + keep_attrs=keep_attrs, + **kwargs, + ) + def cumsum( self, dim: Dims = None, @@ -9253,7 +10065,7 @@ def cumsum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -9353,7 +10165,7 @@ def cumprod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index b8a4011a72e..bf568dd84b6 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -13,6 +13,7 @@ from collections.abc import Callable from functools import partial from importlib import import_module +from itertools import product from typing import Any import numpy as np @@ -276,10 +277,14 @@ def as_shared_dtype(scalars_or_arrays, xp=None): isinstance(x, type(extension_array_types[0])) for x in extension_array_types ): return [ - x - if not isna(x) - else PandasExtensionArray( - type(non_nans[0].array)._from_sequence([x], dtype=non_nans[0].dtype) + ( + x + if not isna(x) + else PandasExtensionArray( + type(non_nans[0].array)._from_sequence( + [x], dtype=non_nans[0].dtype + ) + ) ) for x in scalars_or_arrays ] @@ -386,6 +391,187 @@ def count(data, axis=None): return xp.sum(xp.logical_not(isnull(data)), axis=axis) +def _dask_nunique(data): + """Helper function to get nunique on dask arrays. Assumes reduction axis is -1.""" + import dask + + xp = get_array_namespace(data) + + # To track unique elements across chunks we will use an object array containing + # variable length xp arrays. The idea is that we collect the sorted uniques for each chunk + # as we go, speeding up subsequent concatenation and sorting. Another option might + # be to use a fixed length (masked) sparse array with an extra dimension, but such + # an array would likely use more memory, and the sort and concatenation steps + # would likely be slower. + def _build_storage_array(shape): + size = np.array(shape).prod() + storage_array = xp.empty(size, dtype=object) + # Assign empty arrays to each element + for i in range(size): + storage_array[i] = xp.array([], dtype=data.dtype) + # Reshape to the desired grid shape + return storage_array.reshape(shape) + + # We're going to use dask reduction, so define chunk, combine, aggregate functions. + def chunk_uniques(chunk, axis=None, keepdims=False): + """ + Get the unique values along the required axis for a chunk. 
Adapt the approach + described at https://stackoverflow.com/questions/46893369 + """ + if data.ndim == 1: + uniques = xp.empty([1], dtype=object) + uniques[0] = _get_uniques_1d(chunk) + return uniques + chunk = xp.sort(chunk, axis=-1) + if chunk.shape[-1] == 1: + uniques_bool = xp.ones_like(chunk, dtype=bool) + else: + uniques_bool = xp.not_equal(chunk[..., :-1], chunk[..., 1:]) + # Pad start with true as first element always unique + pad = xp.ones_like(uniques_bool[..., :1], dtype=bool) + uniques_bool = xp.concatenate([pad, uniques_bool], axis=-1) + # Store the uniques in an object array so we can use a dask reduction + uniques = _build_storage_array(chunk.shape[:-1]) + + for idx in product(*[range(s) for s in chunk.shape[:-1]]): + uniques[idx] = chunk[idx][uniques_bool[idx]] + return uniques + + def _get_uniques_1d(array): + # Use the same vectorized style to get uniques from 1d array. + if len(array) < 2: + return array + array = xp.sort(array) + uniques_bool = xp.not_equal(array[:-1], array[1:]) + # Pad start with true as first element always unique + uniques_bool = xp.concatenate([xp.array([True]), uniques_bool]) + return array[uniques_bool] + + # Sometimes combine will return nested lists of arrays, so we need a flattener. + def _flatten_to_arrays(nested): + result = [] + + def append_arrays(x): + if isinstance(x, np.ndarray): + result.append(x) + elif isinstance(x, (list, tuple)): + for y in x: + append_arrays(y) + else: + raise ValueError(f"Unexpected type in nested structure: {type(x)}") + + append_arrays(nested) + return result + + def _merge_unique_arrays(arrays_input): + # Sometimes combine will return nested lists of arrays, so flatten first + arrays = _flatten_to_arrays(arrays_input) + # If single array, return it + if len(arrays) == 1: + return arrays[0] + # Merge multiple arrays + result = _build_storage_array(arrays[0].shape) + for idx in product(*[range(s) for s in result.shape]): + combined_vals = xp.concatenate([arr[idx] for arr in arrays], axis=0) + result[idx] = _get_uniques_1d(combined_vals) + return result + + def combine_uniques(uniques_list, axis=None, keepdims=False): + return _merge_unique_arrays(uniques_list) + + def aggregate_uniques(combined, axis=None, keepdims=False): + # First flatten and merge final list + combined = _flatten_to_arrays(combined) + combined = _merge_unique_arrays(combined) + unique_counts = _build_storage_array(combined.shape) + for idx in product(*[range(s) for s in combined.shape]): + unique_counts[idx] = len(combined[idx]) + return unique_counts + + meta_shape = (0,) * (data.ndim - 1) + meta_array = xp.empty(meta_shape, dtype=object) + + return dask.array.reduction( + data, + chunk=chunk_uniques, + combine=combine_uniques, + aggregate=aggregate_uniques, + dtype=object, + concatenate=False, + meta=meta_array, + axis=-1, + keepdims=False, + ) + + +def _factorize(data): + """Helper function for nunique to factorize mixed type arrays to float.""" + if not isinstance(data, np.ndarray): + message = "nunique with object dtype only implemented for np.ndarray." 
+ raise NotImplementedError(message) + data = pd.factorize(data.reshape(-1))[0].reshape(data.shape) + data = data.astype(float) + data[data == -1] = np.nan + return data + + +def _permute_dims(data, axes): + """Helper function to get a suitable permute dims function.""" + xp = get_array_namespace(data) + if hasattr(xp, "permute_dims"): + return xp.permute_dims(data, axes) + elif hasattr(xp, "transpose"): + return xp.transpose(data, axes) + else: + raise NotImplementedError(f"Unknown transpose method for namespace {xp}") + + +def nunique(data, axis=None, skipna=True, equalna=True): + """ + Count the number of unique values in this array along the given dimensions + """ + xp = get_array_namespace(data) + + if axis is None: + axis = list(range(data.ndim)) + elif isinstance(axis, (int, tuple)): + axis = [axis] if isinstance(axis, int) else list(axis) + if not axis: + # Return unchanged so downstream aggregation functions work as expected. + return data + # Normalize negative axes + axis = [ax % data.ndim for ax in axis] + shape = data.shape + + # If mixed type array, convert to float first + if is_duck_array(data) and data.dtype == np.object_: + data = _factorize(data) + + # Move axes to be aggregated to the end and stack + new_order = [i for i in range(len(shape)) if i not in axis] + axis + new_shape = [s for i, s in enumerate(shape) if i not in axis] + [-1] + data = xp.reshape(_permute_dims(data, new_order), new_shape) + + if is_duck_dask_array(data): + unique_counts = _dask_nunique(data) + else: + # If not using dask, get counts using the approach described at + # https://stackoverflow.com/questions/46893369 + sorted_data = xp.sort(data, axis=-1) + unique_counts = xp.not_equal(sorted_data[..., :-1], sorted_data[..., 1:]) + unique_counts = xp.sum(unique_counts, axis=-1) + 1 + + # Subtract of na values as required + if skipna or (not skipna and equalna): + na_counts = isnull(data).astype(int) + na_counts = xp.sum(na_counts, axis=-1) + if not skipna and equalna: + na_counts = xp.clip(na_counts - 1, 0, None) + unique_counts = unique_counts - na_counts + + return unique_counts + + def sum_where(data, axis=None, dtype=None, where=None): xp = get_array_namespace(data) if where is not None: diff --git a/xarray/namedarray/_aggregations.py b/xarray/namedarray/_aggregations.py index c5726ef9251..fa564ebded2 100644 --- a/xarray/namedarray/_aggregations.py +++ b/xarray/namedarray/_aggregations.py @@ -1,5 +1,4 @@ """Mixin classes with reduction operations.""" - # This file was generated using xarray.util.generate_aggregations. Do not edit manually. from __future__ import annotations @@ -352,6 +351,10 @@ def mean( :ref:`agg` User guide on reduction or aggregation operations. + Notes + ----- + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + Examples -------- >>> from xarray.namedarray.core import NamedArray @@ -426,7 +429,7 @@ def prod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -509,7 +512,7 @@ def sum( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. 
Examples -------- @@ -589,7 +592,7 @@ def std( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -669,7 +672,7 @@ def var( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -745,7 +748,7 @@ def median( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -772,6 +775,92 @@ def median( **kwargs, ) + def nunique( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + equalna: bool | None = True, + **kwargs: Any, + ) -> Self: + """ + Reduce this NamedArray's data by applying ``nunique`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If "..." or None, will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). + equalna : bool or None, default: True + If ``skipna == False``, ``equalna`` determines whether null values + are counted as distinct values or not. Set ``equalna = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equalna = False`` + for consistency with the `Python array API `_. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``nunique`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : NamedArray + New NamedArray with ``nunique`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + pandas.DataFrame.nunique + Dataset.nunique + DataArray.nunique + :ref:`agg` + User guide on reduction or aggregation operations. + + Notes + ----- + Note that identifying unique values on very large + arrays is slow and memory intensive when there are many unique values. + For such arrays, consider lowering the precision, e.g. rounding floats + then converting them to integers, before searching for unique values. + For dask arrays, performance is improved when chunksizes are largest on + the dimension(s) being reduced. + + Examples + -------- + >>> from xarray.namedarray.core import NamedArray + >>> na = NamedArray("x", np.array([1, 2, 3, 0, 2, np.nan])) + >>> na + <xarray.NamedArray (x: 6)> Size: 48B + array([ 1., 2., 3., 0., 2., nan]) + + >>> na.nunique() + <xarray.NamedArray ()> Size: 8B + array(5) + + Use ``skipna`` to control whether NaNs are ignored. + + >>> na.nunique(skipna=False) + <xarray.NamedArray ()> Size: 8B + array(5) + + Use ``equalna`` to control whether NaNs are counted as distinct values.
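+ (The two settings coincide when at most one NaN falls along the reduced dimension(s), as in the example below; they only differ when several NaNs are reduced together.)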
+ + >>> na.nunique(skipna=False, equalna=False) + <xarray.NamedArray ()> Size: 8B + array(5) + """ + return self.reduce( + duck_array_ops.nunique, + dim=dim, + skipna=skipna, + equalna=equalna, + **kwargs, + ) + def cumsum( self, dim: Dims = None, @@ -815,7 +904,7 @@ def cumsum( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -889,7 +978,7 @@ def cumprod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 5eec7b8a2fd..d0bab53330d 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4916,6 +4916,45 @@ def line(x, a, b): assert_allclose(fit.curvefit_coefficients, expected) + + @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("dim", ["c", None, ("b", "c")]) + def test_nunique(self, skipna, dim): + x = np.array( + [ + [ + [np.nan, np.nan, 2.0, np.nan], + [np.nan, 5.0, 6.0, np.nan], + [8.0, 9.0, 10.0, np.nan], + ], + [ + [np.nan, 13.0, 14.0, 15.0], + [np.nan, 17.0, 18.0, np.nan], + [np.nan, 21.0, np.nan, np.nan], + ], + ] + ) + coords = { + "a": range(x.shape[0]), + "b": range(x.shape[1]), + "c": range(x.shape[2]), + } + da = DataArray(x, coords=coords) + + coords_1 = {"a": range(x.shape[0]), "b": range(x.shape[1])} + coords_3 = {"a": range(x.shape[0])} + + expected_results = { + (True, "c"): DataArray([[1, 2, 3], [3, 2, 1]], coords=coords_1), + (True, None): DataArray(12), + (True, ("b", "c")): DataArray([6, 6], coords=coords_3), + (False, "c"): DataArray([[2, 3, 4], [4, 3, 2]], coords=coords_1), + (False, None): DataArray(13), + (False, ("b", "c")): DataArray([7, 7], coords=coords_3), + } + + result = da.nunique(dim=dim, skipna=skipna) + assert_identical(result, expected_results[(skipna, dim)]) + class TestReduce: @pytest.fixture(autouse=True) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index e677430dfbf..209920b8f0d 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -7482,6 +7482,67 @@ def test_query(self, backend, engine, parser) -> None: # pytest tests — new tests should go here, rather than in the class.
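The counting rule these tests pin down mirrors the non-dask path of ``duck_array_ops.nunique`` above: sort along the reduced axis, count positions where neighbouring values differ, then correct the count for NaNs according to ``skipna``/``equalna``. Below is a minimal NumPy-only sketch of that rule for a single trailing axis of a float array; the function name is illustrative, and the real implementation additionally permutes and reshapes axes, factorizes object dtypes, and dispatches to dask.

import numpy as np

def nunique_last_axis(a, skipna=True, equalna=True):
    # Sort along the last axis, then count positions where neighbouring values
    # differ; NaNs sort to the end and compare unequal to everything, including
    # other NaNs, so each NaN initially counts as its own value.
    s = np.sort(a, axis=-1)
    counts = np.not_equal(s[..., :-1], s[..., 1:]).sum(axis=-1) + 1
    n_nan = np.isnan(a).sum(axis=-1)
    if skipna:
        counts = counts - n_nan  # drop every NaN from the count
    elif equalna:
        # all NaNs along the axis collapse to a single distinct value
        counts = counts - np.clip(n_nan - 1, 0, None)
    # skipna=False, equalna=False: nothing to adjust, each NaN stays distinct
    return counts

x = np.array([1.0, 2.0, 3.0, 0.0, 2.0, np.nan])
print(nunique_last_axis(x, skipna=True))                  # 4
print(nunique_last_axis(x, skipna=False))                 # 5 (all NaNs count once)
print(nunique_last_axis(x, skipna=False, equalna=False))  # 5 (each NaN counts separately)

The DataArray, Dataset and DataTree tests around this point encode exactly these three behaviours against hand-counted expectations.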
+@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("dim", [("c", "dim_0", "dim_1"), None, ("a", "b")]) +def test_nunique(skipna, dim): + # Create test data + x = np.array( + [ + [ + [np.nan, np.nan, 2.0, np.nan], + [np.nan, 5.0, 6.0, np.nan], + [8.0, 9.0, 10.0, np.nan], + ], + [ + [np.nan, 13.0, 14.0, 15.0], + [np.nan, 17.0, 18.0, np.nan], + [np.nan, 21.0, np.nan, np.nan], + ], + ] + ) + coords = {"a": range(x.shape[0]), "b": range(x.shape[1]), "c": range(x.shape[2])} + da_1 = DataArray(x, coords=coords) + da_2 = DataArray(x) + ds = Dataset({"da_1": da_1, "da_2": da_2}) + + # Specify the coordinates and arrays we expect for each test case + coords_1 = {"a": range(x.shape[0]), "b": range(x.shape[1])} + coords_3 = {"c": range(x.shape[2])} + arr_1 = np.array([[1, 2, 3], [3, 2, 1]]) + arr_3 = np.array([1, 5, 5, 1]) + expected_results = { + (True, ("c", "dim_0", "dim_1")): (arr_1, coords_1, arr_3, ["dim_2"]), + (True, None): (12, None, 12, None), + (True, ("a", "b")): (arr_3, coords_3, x, None), + (False, ("c", "dim_0", "dim_1")): (arr_1 + 1, coords_1, arr_3 + 1, ["dim_2"]), + (False, None): (13, None, 13, None), + (False, ("a", "b")): (arr_3 + 1, coords_3, x, None), + } + + # Get the expected result for the current parameters + expected_result = expected_results[(skipna, dim)] + expected_ds = Dataset( + { + "da_1": DataArray(expected_result[0], coords=expected_result[1]), + "da_2": DataArray(expected_result[2], dims=expected_result[3]), + } + ) + + # Get the actual result and compare + result = ds.nunique(dim=dim, skipna=skipna) + assert_identical(result, expected_ds) + + +@pytest.mark.parametrize("skipna", [True, False]) +def test_nunique_pandas(skipna): + get_col = lambda: np.random.randint(0, 100, size=100) + get_da = lambda: xr.DataArray(get_col(), coords={"x": np.arange(100)}) + ds = xr.Dataset({"a": get_da(), "b": get_da(), "c": get_da(), "d": get_da()}) + xr_result = ds.nunique(skipna=skipna).to_array().values + pd_result = ds.to_dataframe().nunique(dropna=skipna).values + assert_array_equal(xr_result, pd_result) + + @pytest.mark.parametrize("parser", ["pandas", "python"]) def test_eval(ds, parser) -> None: """Currently much more minimal testing that `query` above, and much of the setup diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index 0cd888f5782..21e1a6e0435 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -2312,6 +2312,18 @@ def test_subtree(self) -> None: actual = tree.children["child"].mean() assert_identical(expected, actual) + def test_nunique(self) -> None: + arr = np.array([[1, 2, 2], [3, 3, 3]]) + da = xr.DataArray(arr, coords={"x": [0, 1], "y": [0, 1, 2]}) + ds = xr.Dataset({"a": da}) + dt = DataTree.from_dict({"root": ds, "root/child": 2 * ds}) + expected_da = xr.DataArray(np.array([2, 1]), coords={"x": [0, 1]}) + expected_ds = xr.Dataset({"a": expected_da}) + expected_dt = DataTree.from_dict( + {"root": expected_ds, "root/child": expected_ds} + ) + assert_identical(expected_dt, dt.nunique(dim="y")) + class TestOps: def test_unary_op(self) -> None: diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 83c7c2bb207..4390e5d17c8 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -24,6 +24,7 @@ least_squares, mean, np_timedelta64_to_float, + nunique, pd_timedelta_to_float, push, py_timedelta_to_float, @@ -165,6 +166,45 @@ def test_count(self): assert 1 == count(np.datetime64("2000-01-01")) + 
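The dask path added above in ``_dask_nunique`` (and exercised lazily by ``test_nunique_dask_lazy`` below) is built on ``dask.array.reduction``'s chunk/combine/aggregate protocol: each chunk reports its sorted uniques, partial results are merged pairwise, and only the final aggregate turns the merged uniques into a count. The following is a stripped-down sketch of that protocol for a 1-D NumPy-backed dask array; helper names are illustrative, and the PR's version additionally keeps an object "storage grid" of uniques for every retained position.

import numpy as np
import dask.array

def chunk_uniques(block, axis=None, keepdims=False):
    # Per-chunk step: report the sorted uniques of this block, wrapped in a
    # length-1 object array so it looks like a keepdims-reduced block to dask.
    out = np.empty((1,), dtype=object)
    out[0] = np.unique(block)
    return out

def _flatten(parts):
    # With concatenate=False, combine/aggregate receive (possibly nested) lists.
    flat = []
    for p in parts:
        if isinstance(p, (list, tuple)):
            flat.extend(_flatten(p))
        else:
            flat.append(p)
    return flat

def combine_uniques(parts, axis=None, keepdims=False):
    # Tree step: merge partial unique sets, keeping them sorted and deduplicated.
    out = np.empty((1,), dtype=object)
    out[0] = np.unique(np.concatenate([p[0] for p in _flatten(parts)]))
    return out

def aggregate_uniques(parts, axis=None, keepdims=False):
    # Final step: only now turn the merged uniques into a count.
    return np.array(len(combine_uniques(parts)[0]))

x = dask.array.from_array(np.array([1.0, 2.0, 3.0, 0.0, 2.0, 7.0]), chunks=2)
counted = dask.array.reduction(
    x,
    chunk=chunk_uniques,
    combine=combine_uniques,
    aggregate=aggregate_uniques,
    dtype=object,
    concatenate=False,
    meta=np.empty((), dtype=object),
    axis=0,
    keepdims=False,
)
print(counted.compute())  # 5 distinct values, computed lazily

Because only per-chunk unique sets travel through the graph, the reduction stays lazy (which is what ``test_nunique_dask_lazy`` asserts), and larger chunks along the reduced dimension mean fewer merge steps, which is the reasoning behind the chunking advice in the docstring Notes.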
@pytest.mark.parametrize("equalna", [True, False]) + @pytest.mark.parametrize("mixed_type", [True, False]) + @pytest.mark.parametrize("string_array", [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("axis", [2, None, (1, 2)]) + def test_nunique(self, axis, skipna, equalna, string_array, mixed_type): + expected_results = { + (True, True, 2): np.array([[1, 2, 3], [3, 2, 1]]), + (True, True, None): np.array(12), + (True, True, (1, 2)): np.array([6, 6]), + (True, False, 2): np.array([[2, 3, 4], [4, 3, 2]]), + (True, False, None): np.array(13), + (True, False, (1, 2)): np.array([7, 7]), + (False, True, 2): np.array([[1, 2, 3], [3, 2, 1]]), + (False, True, None): np.array(12), + (False, True, (1, 2)): np.array([6, 6]), + (False, False, 2): np.array([[4, 4, 4], [4, 4, 4]]), + (False, False, None): np.array(24), + (False, False, (1, 2)): np.array([12, 12]), + } + x = self.x.copy() + if string_array: + # Convert to str + x = x.astype(str) + # Convert to object and put nans back in + x = x.astype(object) + x[x == "nan"] = np.nan + if mixed_type: + x = x.astype(object) + x[(x == 10.0) | (x == "10.0")] = True + x[(x == 2.0) | (x == "2.0")] = np.sum + # Object arrays currently only supported for np.ndarray + if (mixed_type or string_array) and not isinstance(x, np.ndarray): + with pytest.raises(NotImplementedError): + nunique(x, axis=axis, skipna=skipna, equalna=equalna) + return + result = nunique(x, axis=axis, skipna=skipna, equalna=equalna) + assert_array_equal(result, expected_results[(equalna, skipna, axis)]) + def test_where_type_promotion(self): result = where(np.array([True, False]), np.array([1, 2]), np.array(["a", "b"])) assert_array_equal(result, np.array([1, "b"], dtype=object)) @@ -263,6 +303,10 @@ def setUp(self): chunks=(2, 1, 2), ) + def test_nunique_dask_lazy(self): + with raise_if_dask_computes(): + nunique(self.x, axis=0) + def test_cumsum_1d(): inputs = np.array([0, 1, 2, 3]) diff --git a/xarray/util/generate_aggregations.py b/xarray/util/generate_aggregations.py index e386b96f63d..562c5826557 100644 --- a/xarray/util/generate_aggregations.py +++ b/xarray/util/generate_aggregations.py @@ -194,6 +194,12 @@ def {method}( have a sentinel missing value (int) or ``skipna=True`` has not been implemented (object, datetime64 or timedelta64).""" +_EQUALNA_DOCSTRING = """equalna : bool or None, default: True + If ``skipna == False``, ``equalna`` determines whether null values + are counted as distinct values or not. Set ``equalna = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equalna = False`` + for consistency with the `Python array API `_.""" + _MINCOUNT_DOCSTRING = """min_count : int or None, optional The required number of valid values to perform the operation. If fewer than min_count non-NA values are present the result will be @@ -226,6 +232,12 @@ def {method}( _CUM_NOTES = """Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated in the future.""" +_NUNIQUE_NOTES = """Note that identifying unique values on very large +arrays is slow and memory intensive when there are many unique values. +For such arrays, consider lowering the precision, e.g. rounding floats +then converting them to integers, before searching for unique values. 
+For dask arrays, performance is improved when chunksizes are largest on +the dimension(s) being reduced.""" class ExtraKwarg(NamedTuple): @@ -239,28 +251,45 @@ class ExtraKwarg(NamedTuple): docs=_SKIPNA_DOCSTRING, kwarg="skipna: bool | None = None,", call="skipna=skipna,", - example="""\n - Use ``skipna`` to control whether NaNs are ignored. - - >>> {calculation}(skipna=False)""", + example=( + "\n \n" + " Use ``skipna`` to control whether NaNs are ignored.\n" + " \n" + " >>> {calculation}(skipna=False)" + ), +) +equalna = ExtraKwarg( + docs=_EQUALNA_DOCSTRING, + kwarg="equalna: bool | None = True,", + call="equalna=equalna,", + example=( + "\n \n" + " Use ``equalna`` to control whether NaNs are counted as distinct values.\n" + " \n" + " >>> {calculation}(skipna=False, equalna=False)" + ), ) min_count = ExtraKwarg( docs=_MINCOUNT_DOCSTRING, kwarg="min_count: int | None = None,", call="min_count=min_count,", - example="""\n - Specify ``min_count`` for finer control over when NaNs are ignored. - - >>> {calculation}(skipna=True, min_count=2)""", + example=( + "\n \n" + " Specify ``min_count`` for finer control over when NaNs are ignored.\n" + " \n" + " >>> {calculation}(skipna=True, min_count=2)" + ), ) ddof = ExtraKwarg( docs=_DDOF_DOCSTRING, kwarg="ddof: int = 0,", call="ddof=ddof,", - example="""\n - Specify ``ddof=1`` for an unbiased estimate. - - >>> {calculation}(skipna=True, ddof=1)""", + example=( + "\n \n" + " Specify ``ddof=1`` for an unbiased estimate.\n" + " \n" + " >>> {calculation}(skipna=True, ddof=1)" + ), ) @@ -424,11 +453,11 @@ def generate_example(self, method): else: extra_examples = "" + blank_line = 8 * " " return f""" Examples --------{created} - >>> {self.datastructure.example_var_name} - + >>> {self.datastructure.example_var_name}\n{blank_line} >>> {calculation}(){extra_examples}""" @@ -444,7 +473,12 @@ def generate_code(self, method, has_keep_attrs): # median isn't enabled yet, because it would break if a single group was present in multiple # chunks. The non-flox code path will just rechunk every group to a single chunk and execute the median - method_is_not_flox_supported = method.name in ("median", "cumsum", "cumprod") + method_is_not_flox_supported = method.name in ( + "median", + "cumsum", + "cumprod", + "nunique", + ) if method_is_not_flox_supported: indent = 12 else: @@ -530,6 +564,12 @@ def generate_code(self, method, has_keep_attrs): Method( "median", extra_kwargs=(skipna,), numeric_only=True, min_flox_version="0.9.2" ), + Method( + "nunique", + extra_kwargs=(skipna, equalna), + see_also_modules=("pandas.DataFrame",), + additional_notes=_NUNIQUE_NOTES, + ), # Cumulatives: Method( "cumsum",