pandas-dev · jreback · Oct 19, 2021 · Oct 9, 2021 · Oct 10, 2021 · Oct 11, 2021
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -470,6 +470,7 @@ Conversion
 ^^^^^^^^^^
 - Bug in :class:`UInt64Index` constructor when passing a list containing both positive integers small enough to cast to int64 and integers too large too hold in int64 (:issue:`42201`)
 - Bug in :class:`Series` constructor returning 0 for missing values with dtype ``int64`` and ``False`` for dtype ``bool`` (:issue:`43017`, :issue:`43018`)
+- Bug in :class:`IntegerDtype` not allowing coercion from string dtype (:issue:`25472`)
 - Bug in :func:`to_datetime` with ``arg:xr.DataArray`` and ``unit="ns"`` specified raises TypeError (:issue:`44053`)
 -
 

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -32,6 +32,7 @@
     is_integer_dtype,
     is_list_like,
     is_object_dtype,
+    is_string_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.missing import isna
@@ -124,12 +125,10 @@ def safe_cast(values, dtype, copy: bool):
     Safely cast the values to the dtype if they
     are equivalent, meaning floats must be equivalent to the
     ints.
-
     """
     try:
         return values.astype(dtype, casting="safe", copy=copy)
     except TypeError as err:
-
         casted = values.astype(dtype, copy=copy)
         if (casted == values).all():
             return casted
@@ -143,7 +142,7 @@ def coerce_to_array(
     values, dtype, mask=None, copy: bool = False
 ) -> tuple[np.ndarray, np.ndarray]:
     """
-    Coerce the input values array to numpy arrays with a mask
+    Coerce the input values array to numpy arrays with a mask.
 
     Parameters
     ----------
@@ -187,7 +186,8 @@ def coerce_to_array(
         return values, mask
 
     values = np.array(values, copy=copy)
-    if is_object_dtype(values):
+    inferred_type = None
+    if is_object_dtype(values) or is_string_dtype(values):
         inferred_type = lib.infer_dtype(values, skipna=True)
         if inferred_type == "empty":
             values = np.empty(len(values))
@@ -198,6 +198,8 @@ def coerce_to_array(
             "mixed-integer",
             "integer-na",
             "mixed-integer-float",
+            "string",
+            "unicode",
         ]:
             raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")
 
@@ -230,7 +232,10 @@ def coerce_to_array(
     if mask.any():
         values = values.copy()
         values[mask] = 1
-        values = safe_cast(values, dtype, copy=False)
+    if inferred_type in ("string", "unicode"):
+        # casts from str are always safe since they raise
+        # a ValueError if the str cannot be parsed into an int
+        values = values.astype(dtype, copy=copy)
     else:
         values = safe_cast(values, dtype, copy=False)
 

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -143,14 +143,14 @@ def ensure_python_int(value: int | np.integer) -> int:
 
 
 def classes(*klasses) -> Callable:
-    """evaluate if the tipo is a subclass of the klasses"""
+    """Evaluate if the tipo is a subclass of the klasses."""
     return lambda tipo: issubclass(tipo, klasses)
 
 
 def classes_and_not_datetimelike(*klasses) -> Callable:
     """
-    evaluate if the tipo is a subclass of the klasses
-    and not a datetimelike
+    Evaluate if the tipo is a subclass of the klasses
+    and not a datetimelike.
     """
     return lambda tipo: (
         issubclass(tipo, klasses)
@@ -674,7 +674,7 @@ def is_integer_dtype(arr_or_dtype) -> bool:
     """
     Check whether the provided array or dtype is of an integer dtype.
 
-    Unlike in `in_any_int_dtype`, timedelta64 instances will return False.
+    Unlike in `is_any_int_dtype`, timedelta64 instances will return False.
 
     The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered
     as integer by this function.
@@ -726,7 +726,7 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool:
     """
     Check whether the provided array or dtype is of a signed integer dtype.
 
-    Unlike in `in_any_int_dtype`, timedelta64 instances will return False.
+    Unlike in `is_any_int_dtype`, timedelta64 instances will return False.
 
     The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered
     as integer by this function.
@@ -1521,7 +1521,7 @@ def is_complex_dtype(arr_or_dtype) -> bool:
 
 def _is_dtype(arr_or_dtype, condition) -> bool:
     """
-    Return a boolean if the condition is satisfied for the arr_or_dtype.
+    Return True if the condition is satisfied for the arr_or_dtype.
 
     Parameters
     ----------
@@ -1580,7 +1580,7 @@ def get_dtype(arr_or_dtype) -> DtypeObj:
 
 def _is_dtype_type(arr_or_dtype, condition) -> bool:
     """
-    Return a boolean if the condition is satisfied for the arr_or_dtype.
+    Return True if the condition is satisfied for the arr_or_dtype.
 
     Parameters
     ----------

diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py
@@ -44,7 +44,6 @@ def test_from_dtype_from_float(data):
 
 
 def test_conversions(data_missing):
-
     # astype to object series
     df = pd.DataFrame({"A": data_missing})
     result = df["A"].astype("object")
@@ -123,7 +122,6 @@ def test_to_integer_array_none_is_nan(a, b):
     "values",
     [
         ["foo", "bar"],
-        ["1", "2"],
         "foo",
         1,
         1.0,
@@ -137,13 +135,14 @@ def test_to_integer_array_error(values):
     # error in converting existing arrays to IntegerArrays
     msg = (
         r"(:?.* cannot be converted to an IntegerDtype)"
+        r"|(invalid literal for int\(\) with base 10: .*)"
         r"|(:?values must be a 1D list-like)"
         r"|(Cannot pass scalar)"
     )
     with pytest.raises((ValueError, TypeError), match=msg):
         pd.array(values, dtype="Int64")
 
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises((ValueError, TypeError), match=msg):
         IntegerArray._from_sequence(values)
 
 
@@ -181,6 +180,22 @@ def test_to_integer_array_float():
     assert result.dtype == Int64Dtype()
 
 
+def test_to_integer_array_str():
+    result = IntegerArray._from_sequence(["1", "2", None])
+    expected = pd.array([1, 2, np.nan], dtype="Int64")
+    tm.assert_extension_array_equal(result, expected)
+
+    with pytest.raises(
+        ValueError, match=r"invalid literal for int\(\) with base 10: .*"
+    ):
+        IntegerArray._from_sequence(["1", "2", ""])
+
+    with pytest.raises(
+        ValueError, match=r"invalid literal for int\(\) with base 10: .*"
+    ):
+        IntegerArray._from_sequence(["1.5", "2.0"])
+
+
 @pytest.mark.parametrize(
     "bool_values, int_values, target_dtype, expected_dtype",
     [

diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -314,3 +314,23 @@ def test_dtype_multi_index(all_parsers):
     )
 
     tm.assert_frame_equal(result, expected)
+
+
+def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
+    # GH 25472
+    parser = all_parsers
+    dtype = any_int_ea_dtype
+
+    data = """a,b,c
+,3,5
+1,,6
+2,4,"""
+    expected = DataFrame(
+        {
+            "a": pd.array([pd.NA, 1, 2], dtype=dtype),
+            "b": pd.array([3, pd.NA, 4], dtype=dtype),
+            "c": pd.array([5, 6, pd.NA], dtype=dtype),
+        }
+    )
+    actual = parser.read_csv(StringIO(data), dtype=dtype)
+    tm.assert_frame_equal(actual, expected)