Skip to content

Commit c556062

Browse files
authored
Fix str dtype -> IntegerDtype conversions (#43949)
1 parent 01cc1ee commit c556062

File tree

5 files changed

+56
-15
lines changed

5 files changed

+56
-15
lines changed

Diff for: doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,7 @@ Conversion
470470
^^^^^^^^^^
471471
- Bug in :class:`UInt64Index` constructor when passing a list containing both positive integers small enough to cast to int64 and integers too large to hold in int64 (:issue:`42201`)
472472
- Bug in :class:`Series` constructor returning 0 for missing values with dtype ``int64`` and ``False`` for dtype ``bool`` (:issue:`43017`, :issue:`43018`)
473+
- Bug in :class:`IntegerDtype` not allowing coercion from string dtype (:issue:`25472`)
473474
- Bug in :func:`to_datetime` raising ``TypeError`` when ``arg`` is an ``xr.DataArray`` and ``unit="ns"`` is specified (:issue:`44053`)
474475
-
475476

Diff for: pandas/core/arrays/integer.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
is_integer_dtype,
3333
is_list_like,
3434
is_object_dtype,
35+
is_string_dtype,
3536
pandas_dtype,
3637
)
3738
from pandas.core.dtypes.missing import isna
@@ -124,12 +125,10 @@ def safe_cast(values, dtype, copy: bool):
124125
Safely cast the values to the dtype if they
125126
are equivalent, meaning floats must be equivalent to the
126127
ints.
127-
128128
"""
129129
try:
130130
return values.astype(dtype, casting="safe", copy=copy)
131131
except TypeError as err:
132-
133132
casted = values.astype(dtype, copy=copy)
134133
if (casted == values).all():
135134
return casted
@@ -143,7 +142,7 @@ def coerce_to_array(
143142
values, dtype, mask=None, copy: bool = False
144143
) -> tuple[np.ndarray, np.ndarray]:
145144
"""
146-
Coerce the input values array to numpy arrays with a mask
145+
Coerce the input values array to numpy arrays with a mask.
147146
148147
Parameters
149148
----------
@@ -187,7 +186,8 @@ def coerce_to_array(
187186
return values, mask
188187

189188
values = np.array(values, copy=copy)
190-
if is_object_dtype(values):
189+
inferred_type = None
190+
if is_object_dtype(values) or is_string_dtype(values):
191191
inferred_type = lib.infer_dtype(values, skipna=True)
192192
if inferred_type == "empty":
193193
values = np.empty(len(values))
@@ -198,6 +198,8 @@ def coerce_to_array(
198198
"mixed-integer",
199199
"integer-na",
200200
"mixed-integer-float",
201+
"string",
202+
"unicode",
201203
]:
202204
raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")
203205

@@ -230,7 +232,10 @@ def coerce_to_array(
230232
if mask.any():
231233
values = values.copy()
232234
values[mask] = 1
233-
values = safe_cast(values, dtype, copy=False)
235+
if inferred_type in ("string", "unicode"):
236+
# casts from str are always safe since they raise
237+
# a ValueError if the str cannot be parsed into an int
238+
values = values.astype(dtype, copy=copy)
234239
else:
235240
values = safe_cast(values, dtype, copy=False)
236241

Diff for: pandas/core/dtypes/common.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -143,14 +143,14 @@ def ensure_python_int(value: int | np.integer) -> int:
143143

144144

145145
def classes(*klasses) -> Callable:
146-
"""evaluate if the tipo is a subclass of the klasses"""
146+
"""Evaluate if the tipo is a subclass of the klasses."""
147147
return lambda tipo: issubclass(tipo, klasses)
148148

149149

150150
def classes_and_not_datetimelike(*klasses) -> Callable:
151151
"""
152-
evaluate if the tipo is a subclass of the klasses
153-
and not a datetimelike
152+
Evaluate if the tipo is a subclass of the klasses
153+
and not a datetimelike.
154154
"""
155155
return lambda tipo: (
156156
issubclass(tipo, klasses)
@@ -674,7 +674,7 @@ def is_integer_dtype(arr_or_dtype) -> bool:
674674
"""
675675
Check whether the provided array or dtype is of an integer dtype.
676676
677-
Unlike in `in_any_int_dtype`, timedelta64 instances will return False.
677+
Unlike in `is_any_int_dtype`, timedelta64 instances will return False.
678678
679679
The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered
680680
as integer by this function.
@@ -726,7 +726,7 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool:
726726
"""
727727
Check whether the provided array or dtype is of a signed integer dtype.
728728
729-
Unlike in `in_any_int_dtype`, timedelta64 instances will return False.
729+
Unlike in `is_any_int_dtype`, timedelta64 instances will return False.
730730
731731
The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered
732732
as integer by this function.
@@ -1521,7 +1521,7 @@ def is_complex_dtype(arr_or_dtype) -> bool:
15211521

15221522
def _is_dtype(arr_or_dtype, condition) -> bool:
15231523
"""
1524-
Return a boolean if the condition is satisfied for the arr_or_dtype.
1524+
Return True if the condition is satisfied for the arr_or_dtype.
15251525
15261526
Parameters
15271527
----------
@@ -1580,7 +1580,7 @@ def get_dtype(arr_or_dtype) -> DtypeObj:
15801580

15811581
def _is_dtype_type(arr_or_dtype, condition) -> bool:
15821582
"""
1583-
Return a boolean if the condition is satisfied for the arr_or_dtype.
1583+
Return True if the condition is satisfied for the arr_or_dtype.
15841584
15851585
Parameters
15861586
----------

Diff for: pandas/tests/arrays/integer/test_construction.py

+18-3
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ def test_from_dtype_from_float(data):
4444

4545

4646
def test_conversions(data_missing):
47-
4847
# astype to object series
4948
df = pd.DataFrame({"A": data_missing})
5049
result = df["A"].astype("object")
@@ -123,7 +122,6 @@ def test_to_integer_array_none_is_nan(a, b):
123122
"values",
124123
[
125124
["foo", "bar"],
126-
["1", "2"],
127125
"foo",
128126
1,
129127
1.0,
@@ -137,13 +135,14 @@ def test_to_integer_array_error(values):
137135
# error in converting existing arrays to IntegerArrays
138136
msg = (
139137
r"(:?.* cannot be converted to an IntegerDtype)"
138+
r"|(invalid literal for int\(\) with base 10: .*)"
140139
r"|(:?values must be a 1D list-like)"
141140
r"|(Cannot pass scalar)"
142141
)
143142
with pytest.raises((ValueError, TypeError), match=msg):
144143
pd.array(values, dtype="Int64")
145144

146-
with pytest.raises(TypeError, match=msg):
145+
with pytest.raises((ValueError, TypeError), match=msg):
147146
IntegerArray._from_sequence(values)
148147

149148

@@ -181,6 +180,22 @@ def test_to_integer_array_float():
181180
assert result.dtype == Int64Dtype()
182181

183182

183+
def test_to_integer_array_str():
184+
result = IntegerArray._from_sequence(["1", "2", None])
185+
expected = pd.array([1, 2, np.nan], dtype="Int64")
186+
tm.assert_extension_array_equal(result, expected)
187+
188+
with pytest.raises(
189+
ValueError, match=r"invalid literal for int\(\) with base 10: .*"
190+
):
191+
IntegerArray._from_sequence(["1", "2", ""])
192+
193+
with pytest.raises(
194+
ValueError, match=r"invalid literal for int\(\) with base 10: .*"
195+
):
196+
IntegerArray._from_sequence(["1.5", "2.0"])
197+
198+
184199
@pytest.mark.parametrize(
185200
"bool_values, int_values, target_dtype, expected_dtype",
186201
[

Diff for: pandas/tests/io/parser/dtypes/test_dtypes_basic.py

+20
Original file line numberDiff line numberDiff line change
@@ -314,3 +314,23 @@ def test_dtype_multi_index(all_parsers):
314314
)
315315

316316
tm.assert_frame_equal(result, expected)
317+
318+
319+
def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
320+
# GH 25472
321+
parser = all_parsers
322+
dtype = any_int_ea_dtype
323+
324+
data = """a,b,c
325+
,3,5
326+
1,,6
327+
2,4,"""
328+
expected = DataFrame(
329+
{
330+
"a": pd.array([pd.NA, 1, 2], dtype=dtype),
331+
"b": pd.array([3, pd.NA, 4], dtype=dtype),
332+
"c": pd.array([5, 6, pd.NA], dtype=dtype),
333+
}
334+
)
335+
actual = parser.read_csv(StringIO(data), dtype=dtype)
336+
tm.assert_frame_equal(actual, expected)

0 commit comments

Comments
 (0)