diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index bebd928924214..3f37610a11064 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1192,6 +1192,7 @@ Indexing - Bug in :meth:`Index.get_indexer` and similar methods when ``NaN`` is located at or after position 128 (:issue:`58924`) - Bug in :meth:`MultiIndex.insert` when a new value inserted to a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`) - Bug in :meth:`Series.__setitem__` when assigning boolean series with boolean indexer will raise ``LossySetitemError`` (:issue:`57338`) +- Bug in :meth:`Series.mask` unexpectedly filling ``pd.NA`` (:issue:`60729`) - Bug in indexing ``obj.loc[start:stop]`` with a :class:`DatetimeIndex` and :class:`Timestamp` endpoints with higher resolution than the index (:issue:`63262`) - Bug in printing :attr:`Index.names` and :attr:`MultiIndex.levels` would not escape single quotes (:issue:`60190`) - Bug in reindexing of :class:`DataFrame` with :class:`PeriodDtype` columns in case of consolidated block (:issue:`60980`, :issue:`60273`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 79f682988a148..8e0bfe56131c5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9631,6 +9631,7 @@ def _where( # align the cond to same shape as myself cond = common.apply_if_callable(cond, self) if isinstance(cond, NDFrame): + cond = cond.fillna(True) # CoW: Make sure reference is not kept alive if cond.ndim == 1 and self.ndim == 2: cond = cond._constructor_expanddim( @@ -9645,6 +9646,7 @@ def _where( if cond.shape != self.shape: raise ValueError("Array conditional must be same shape as self") cond = self._constructor(cond, **self._construct_axes_dict(), copy=False) + cond = cond.fillna(True) # make sure we are boolean fill_value = bool(inplace) @@ -9992,6 +9994,18 @@ def mask( if not hasattr(cond, "__invert__"): cond = np.array(cond) + # GH 60772 + na_msg = "Cannot mask with 
non-boolean array containing NA / NaN values" + if isinstance(cond, np.ndarray): + if not lib.is_bool_array(cond): + raise ValueError(na_msg) + elif isinstance(cond, ABCDataFrame): + if not all(is_bool_dtype(blk.dtype) for blk in cond._mgr.blocks): + raise ValueError(na_msg) + elif isinstance(cond, ABCSeries): + if not is_bool_dtype(cond): + raise ValueError(na_msg) + return self._where( ~cond, other=other, diff --git a/pandas/tests/series/indexing/test_mask.py b/pandas/tests/series/indexing/test_mask.py index 3c21cd0d5ca64..75c44e6d56399 100644 --- a/pandas/tests/series/indexing/test_mask.py +++ b/pandas/tests/series/indexing/test_mask.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import Series import pandas._testing as tm @@ -67,3 +69,31 @@ def test_mask_inplace(): rs = s.copy() rs.mask(cond, -s, inplace=True) tm.assert_series_equal(rs, s.mask(cond, -s)) + + +@pytest.mark.parametrize( + "dtype", + [ + "Int64", + pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], +) +@pytest.mark.parametrize("cond_type", ["series", "list", "numpy"]) +def test_mask_na(dtype, cond_type): + # We should not be filling pd.NA. 
See GH#60729 + series = Series([None, 1, 2, None, 3, 4, None], dtype=dtype) + cond = series <= 2 + + if cond_type == "list": + cond = cond.to_list() + elif cond_type == "numpy": + cond = cond.to_numpy() + + if isinstance(cond, Series): + result = series.mask(cond, -99) + expected = Series([None, -99, -99, None, 3, 4, None], dtype=dtype) + tm.assert_series_equal(result, expected) + else: + msg = "Cannot mask with non-boolean array containing NA / NaN values" + with pytest.raises(ValueError, match=msg): + series.mask(cond) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index ec8c15714bf82..52bebfe6a2520 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_integer import pandas as pd @@ -445,3 +447,25 @@ def test_where_datetimelike_categorical(tz_naive_fixture): res = pd.DataFrame(lvals).where(mask[:, None], pd.DataFrame(rvals)) tm.assert_frame_equal(res, pd.DataFrame(dr)) + + +@pytest.mark.parametrize( + "dtype", + [ + "Int64", + pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], +) +@pytest.mark.parametrize("cond_type", ["series", "list", "numpy"]) +def test_where_na(dtype, cond_type): + series = Series([None, 1, 2, None, 3, 4, None], dtype=dtype) + expected = Series([None, 1, 2, None, -99, -99, None], dtype=dtype) + cond = series <= 2 + + if cond_type == "list": + cond = cond.to_list() + elif cond_type == "numpy": + cond = cond.to_numpy() + + result = series.where(cond, -99) + tm.assert_series_equal(result, expected)