Skip to content

Fix str dtype -> IntegerDtype conversions #43949

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,7 @@ Conversion
^^^^^^^^^^
- Bug in :class:`UInt64Index` constructor when passing a list containing both positive integers small enough to cast to int64 and integers too large to hold in int64 (:issue:`42201`)
- Bug in :class:`Series` constructor returning 0 for missing values with dtype ``int64`` and ``False`` for dtype ``bool`` (:issue:`43017`, :issue:`43018`)
- Bug in :class:`IntegerDtype` not allowing coercion from string dtype (:issue:`25472`)
- Bug in :func:`to_datetime` raising ``TypeError`` when ``arg`` is an ``xr.DataArray`` and ``unit="ns"`` is specified (:issue:`44053`)
-

Expand Down
15 changes: 10 additions & 5 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
is_integer_dtype,
is_list_like,
is_object_dtype,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.missing import isna
Expand Down Expand Up @@ -124,12 +125,10 @@ def safe_cast(values, dtype, copy: bool):
Safely cast the values to the dtype if they
are equivalent, meaning floats must be equivalent to the
ints.

"""
try:
return values.astype(dtype, casting="safe", copy=copy)
except TypeError as err:

casted = values.astype(dtype, copy=copy)
if (casted == values).all():
return casted
Expand All @@ -143,7 +142,7 @@ def coerce_to_array(
values, dtype, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
"""
Coerce the input values array to numpy arrays with a mask
Coerce the input values array to numpy arrays with a mask.

Parameters
----------
Expand Down Expand Up @@ -187,7 +186,8 @@ def coerce_to_array(
return values, mask

values = np.array(values, copy=copy)
if is_object_dtype(values):
inferred_type = None
if is_object_dtype(values) or is_string_dtype(values):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "empty":
values = np.empty(len(values))
Expand All @@ -198,6 +198,8 @@ def coerce_to_array(
"mixed-integer",
"integer-na",
"mixed-integer-float",
"string",
"unicode",
]:
raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")

Expand Down Expand Up @@ -230,7 +232,10 @@ def coerce_to_array(
if mask.any():
values = values.copy()
values[mask] = 1
values = safe_cast(values, dtype, copy=False)
if inferred_type in ("string", "unicode"):
# casts from str are always safe since they raise
# a ValueError if the str cannot be parsed into an int
values = values.astype(dtype, copy=copy)
else:
values = safe_cast(values, dtype, copy=False)

Expand Down
14 changes: 7 additions & 7 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,14 @@ def ensure_python_int(value: int | np.integer) -> int:


def classes(*klasses) -> Callable:
    """
    Evaluate if the tipo is a subclass of the klasses.

    Parameters
    ----------
    *klasses : type
        Classes that a candidate is tested against.

    Returns
    -------
    Callable
        Predicate taking a class ``tipo`` and returning
        ``issubclass(tipo, klasses)``.
    """
    # ``klasses`` is already a tuple, which ``issubclass`` accepts directly
    return lambda tipo: issubclass(tipo, klasses)


def classes_and_not_datetimelike(*klasses) -> Callable:
"""
evaluate if the tipo is a subclass of the klasses
and not a datetimelike
Evaluate if the tipo is a subclass of the klasses
and not a datetimelike.
"""
return lambda tipo: (
issubclass(tipo, klasses)
Expand Down Expand Up @@ -674,7 +674,7 @@ def is_integer_dtype(arr_or_dtype) -> bool:
"""
Check whether the provided array or dtype is of an integer dtype.

Unlike in `in_any_int_dtype`, timedelta64 instances will return False.
Unlike in `is_any_int_dtype`, timedelta64 instances will return False.

The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered
as integer by this function.
Expand Down Expand Up @@ -726,7 +726,7 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool:
"""
Check whether the provided array or dtype is of a signed integer dtype.

Unlike in `in_any_int_dtype`, timedelta64 instances will return False.
Unlike in `is_any_int_dtype`, timedelta64 instances will return False.

The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered
as integer by this function.
Expand Down Expand Up @@ -1521,7 +1521,7 @@ def is_complex_dtype(arr_or_dtype) -> bool:

def _is_dtype(arr_or_dtype, condition) -> bool:
"""
Return a boolean if the condition is satisfied for the arr_or_dtype.
Return true if the condition is satisfied for the arr_or_dtype.

Parameters
----------
Expand Down Expand Up @@ -1580,7 +1580,7 @@ def get_dtype(arr_or_dtype) -> DtypeObj:

def _is_dtype_type(arr_or_dtype, condition) -> bool:
"""
Return a boolean if the condition is satisfied for the arr_or_dtype.
Return true if the condition is satisfied for the arr_or_dtype.

Parameters
----------
Expand Down
21 changes: 18 additions & 3 deletions pandas/tests/arrays/integer/test_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ def test_from_dtype_from_float(data):


def test_conversions(data_missing):

# astype to object series
df = pd.DataFrame({"A": data_missing})
result = df["A"].astype("object")
Expand Down Expand Up @@ -123,7 +122,6 @@ def test_to_integer_array_none_is_nan(a, b):
"values",
[
["foo", "bar"],
["1", "2"],
"foo",
1,
1.0,
Expand All @@ -137,13 +135,14 @@ def test_to_integer_array_error(values):
# error in converting existing arrays to IntegerArrays
msg = (
r"(:?.* cannot be converted to an IntegerDtype)"
r"|(invalid literal for int\(\) with base 10: .*)"
r"|(:?values must be a 1D list-like)"
r"|(Cannot pass scalar)"
)
with pytest.raises((ValueError, TypeError), match=msg):
pd.array(values, dtype="Int64")

with pytest.raises(TypeError, match=msg):
with pytest.raises((ValueError, TypeError), match=msg):
IntegerArray._from_sequence(values)


Expand Down Expand Up @@ -181,6 +180,22 @@ def test_to_integer_array_float():
assert result.dtype == Int64Dtype()


def test_to_integer_array_str():
    # strings that parse as integers are coerced; None becomes a missing value
    invalid_literal = r"invalid literal for int\(\) with base 10: .*"

    converted = IntegerArray._from_sequence(["1", "2", None])
    tm.assert_extension_array_equal(
        converted, pd.array([1, 2, np.nan], dtype="Int64")
    )

    # an empty string and float-looking strings are not valid int literals
    for bad_values in (["1", "2", ""], ["1.5", "2.0"]):
        with pytest.raises(ValueError, match=invalid_literal):
            IntegerArray._from_sequence(bad_values)


@pytest.mark.parametrize(
"bool_values, int_values, target_dtype, expected_dtype",
[
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,3 +314,23 @@ def test_dtype_multi_index(all_parsers):
)

tm.assert_frame_equal(result, expected)


def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
    # GH 25472
    # each column has exactly one empty field; it is expected to come back
    # as pd.NA under the requested nullable integer dtype
    data = """a,b,c
,3,5
1,,6
2,4,"""
    column_values = {
        "a": [pd.NA, 1, 2],
        "b": [3, pd.NA, 4],
        "c": [5, 6, pd.NA],
    }
    expected = DataFrame(
        {
            name: pd.array(cells, dtype=any_int_ea_dtype)
            for name, cells in column_values.items()
        }
    )

    actual = all_parsers.read_csv(StringIO(data), dtype=any_int_ea_dtype)
    tm.assert_frame_equal(actual, expected)