CLN: Remove deprecated read_*(date_parser=) (#58624)

mroeschke · web-flow · commit ad06bbb8a9d6 · 2024-05-07T21:47:58.000-04:00
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -279,19 +279,6 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default
 keep_date_col : boolean, default ``False``
   If ``True`` and parse_dates specifies combining multiple columns then keep the
   original columns.
-date_parser : function, default ``None``
-  Function to use for converting a sequence of string columns to an array of
-  datetime instances. The default uses ``dateutil.parser.parser`` to do the
-  conversion. pandas will try to call date_parser in three different ways,
-  advancing to the next if an exception occurs: 1) Pass one or more arrays (as
-  defined by parse_dates) as arguments; 2) concatenate (row-wise) the string
-  values from the columns defined by parse_dates into a single array and pass
-  that; and 3) call date_parser once for each row using one or more strings
-  (corresponding to the columns defined by parse_dates) as arguments.
-
-  .. deprecated:: 2.0.0
-   Use ``date_format`` instead, or read in as ``object`` and then apply
-   :func:`to_datetime` as-needed.
 date_format : str or dict of column -> format, default ``None``
    If used in conjunction with ``parse_dates``, will parse dates according to this
    format. For anything more complex,
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -254,6 +254,7 @@ Removal of prior version deprecations/changes
 - Enforced deprecation of :meth:`offsets.Tick.delta`, use ``pd.Timedelta(obj)`` instead (:issue:`55498`)
 - Enforced deprecation of ``axis=None`` acting the same as ``axis=0`` in the DataFrame reductions ``sum``, ``prod``, ``std``, ``var``, and ``sem``, passing ``axis=None`` will now reduce over both axes; this is particularly the case when doing e.g. ``numpy.sum(df)`` (:issue:`21597`)
 - Enforced deprecation of ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock`` (:issue:`58467`)
+- Enforced deprecation of ``date_parser`` in :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_excel` in favour of ``date_format`` (:issue:`50601`)
 - Enforced deprecation of ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead. (:issue:`52550`)
 - Enforced deprecation of argument ``infer_datetime_format`` in :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`)
 - Enforced deprecation of non-standard (``np.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series`) argument to :func:`api.extensions.take` (:issue:`52981`)
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -240,20 +240,6 @@
     For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``.
 
     Note: A fast-path exists for iso8601-formatted dates.
-date_parser : function, optional
-    Function to use for converting a sequence of string columns to an array of
-    datetime instances. The default uses ``dateutil.parser.parser`` to do the
-    conversion. Pandas will try to call `date_parser` in three different ways,
-    advancing to the next if an exception occurs: 1) Pass one or more arrays
-    (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
-    string values from the columns defined by `parse_dates` into a single array
-    and pass that; and 3) call `date_parser` once for each row using one or
-    more strings (corresponding to the columns defined by `parse_dates`) as
-    arguments.
-
-    .. deprecated:: 2.0.0
-       Use ``date_format`` instead, or read in as ``object`` and then apply
-       :func:`to_datetime` as-needed.
 date_format : str or dict of column -> format, default ``None``
    If used in conjunction with ``parse_dates``, will parse dates according to this
    format. For anything more complex,
@@ -398,7 +384,6 @@ def read_excel(
     na_filter: bool = ...,
     verbose: bool = ...,
     parse_dates: list | dict | bool = ...,
-    date_parser: Callable | lib.NoDefault = ...,
     date_format: dict[Hashable, str] | str | None = ...,
     thousands: str | None = ...,
     decimal: str = ...,
@@ -436,7 +421,6 @@ def read_excel(
     na_filter: bool = ...,
     verbose: bool = ...,
     parse_dates: list | dict | bool = ...,
-    date_parser: Callable | lib.NoDefault = ...,
     date_format: dict[Hashable, str] | str | None = ...,
     thousands: str | None = ...,
     decimal: str = ...,
@@ -474,7 +458,6 @@ def read_excel(
     na_filter: bool = True,
     verbose: bool = False,
     parse_dates: list | dict | bool = False,
-    date_parser: Callable | lib.NoDefault = lib.no_default,
     date_format: dict[Hashable, str] | str | None = None,
     thousands: str | None = None,
     decimal: str = ".",
@@ -521,7 +504,6 @@ def read_excel(
             na_filter=na_filter,
             verbose=verbose,
             parse_dates=parse_dates,
-            date_parser=date_parser,
             date_format=date_format,
             thousands=thousands,
             decimal=decimal,
@@ -726,7 +708,6 @@ def parse(
         na_values=None,
         verbose: bool = False,
         parse_dates: list | dict | bool = False,
-        date_parser: Callable | lib.NoDefault = lib.no_default,
         date_format: dict[Hashable, str] | str | None = None,
         thousands: str | None = None,
         decimal: str = ".",
@@ -795,7 +776,6 @@ def parse(
                 false_values=false_values,
                 na_values=na_values,
                 parse_dates=parse_dates,
-                date_parser=date_parser,
                 date_format=date_format,
                 thousands=thousands,
                 decimal=decimal,
@@ -829,7 +809,6 @@ def _parse_sheet(
         false_values: Iterable[Hashable] | None = None,
         na_values=None,
         parse_dates: list | dict | bool = False,
-        date_parser: Callable | lib.NoDefault = lib.no_default,
         date_format: dict[Hashable, str] | str | None = None,
         thousands: str | None = None,
         decimal: str = ".",
@@ -942,7 +921,6 @@ def _parse_sheet(
                 na_values=na_values,
                 skip_blank_lines=False,  # GH 39808
                 parse_dates=parse_dates,
-                date_parser=date_parser,
                 date_format=date_format,
                 thousands=thousands,
                 decimal=decimal,
@@ -1648,7 +1626,6 @@ def parse(
         nrows: int | None = None,
         na_values=None,
         parse_dates: list | dict | bool = False,
-        date_parser: Callable | lib.NoDefault = lib.no_default,
         date_format: str | dict[Hashable, str] | None = None,
         thousands: str | None = None,
         comment: str | None = None,
@@ -1737,20 +1714,6 @@ def parse(
             ``pd.to_datetime`` after ``pd.read_excel``.
 
             Note: A fast-path exists for iso8601-formatted dates.
-        date_parser : function, optional
-            Function to use for converting a sequence of string columns to an array of
-            datetime instances. The default uses ``dateutil.parser.parser`` to do the
-            conversion. Pandas will try to call `date_parser` in three different ways,
-            advancing to the next if an exception occurs: 1) Pass one or more arrays
-            (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
-            string values from the columns defined by `parse_dates` into a single array
-            and pass that; and 3) call `date_parser` once for each row using one or
-            more strings (corresponding to the columns defined by `parse_dates`) as
-            arguments.
-
-            .. deprecated:: 2.0.0
-               Use ``date_format`` instead, or read in as ``object`` and then apply
-               :func:`to_datetime` as-needed.
         date_format : str or dict of column -> format, default ``None``
            If used in conjunction with ``parse_dates``, will parse dates
            according to this format. For anything more complex,
@@ -1810,7 +1773,6 @@ def parse(
             nrows=nrows,
             na_values=na_values,
             parse_dates=parse_dates,
-            date_parser=date_parser,
             date_format=date_format,
             thousands=thousands,
             comment=comment,
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -3,7 +3,6 @@
 from collections import defaultdict
 from copy import copy
 import csv
-import datetime
 from enum import Enum
 import itertools
 from typing import (
@@ -127,7 +126,6 @@ def __init__(self, kwds) -> None:
 
         self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
         self._parse_date_cols: Iterable = []
-        self.date_parser = kwds.pop("date_parser", lib.no_default)
         self.date_format = kwds.pop("date_format", None)
         self.dayfirst = kwds.pop("dayfirst", False)
         self.keep_date_col = kwds.pop("keep_date_col", False)
@@ -146,7 +144,6 @@ def __init__(self, kwds) -> None:
         self.cache_dates = kwds.pop("cache_dates", True)
 
         self._date_conv = _make_date_converter(
-            date_parser=self.date_parser,
             date_format=self.date_format,
             dayfirst=self.dayfirst,
             cache_dates=self.cache_dates,
@@ -1120,84 +1117,39 @@ def _get_empty_meta(
 
 
 def _make_date_converter(
-    date_parser=lib.no_default,
     dayfirst: bool = False,
     cache_dates: bool = True,
     date_format: dict[Hashable, str] | str | None = None,
 ):
-    if date_parser is not lib.no_default:
-        warnings.warn(
-            "The argument 'date_parser' is deprecated and will "
-            "be removed in a future version. "
-            "Please use 'date_format' instead, or read your data in as 'object' dtype "
-            "and then call 'to_datetime'.",
-            FutureWarning,
-            stacklevel=find_stack_level(),
-        )
-    if date_parser is not lib.no_default and date_format is not None:
-        raise TypeError("Cannot use both 'date_parser' and 'date_format'")
-
-    def unpack_if_single_element(arg):
-        # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
-        if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1:
-            return arg[0]
-        return arg
-
     def converter(*date_cols, col: Hashable):
         if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm":
             return date_cols[0]
+        # TODO: Can we remove concat_date_cols after deprecation of parsing
+        # multiple cols?
+        strs = parsing.concat_date_cols(date_cols)
+        date_fmt = (
+            date_format.get(col) if isinstance(date_format, dict) else date_format
+        )
 
-        if date_parser is lib.no_default:
-            strs = parsing.concat_date_cols(date_cols)
-            date_fmt = (
-                date_format.get(col) if isinstance(date_format, dict) else date_format
+        str_objs = ensure_object(strs)
+        try:
+            result = tools.to_datetime(
+                str_objs,
+                format=date_fmt,
+                utc=False,
+                dayfirst=dayfirst,
+                cache=cache_dates,
             )
+        except (ValueError, TypeError):
+            # test_usecols_with_parse_dates4
+            # test_multi_index_parse_dates
+            return str_objs
 
-            str_objs = ensure_object(strs)
-            try:
-                result = tools.to_datetime(
-                    str_objs,
-                    format=date_fmt,
-                    utc=False,
-                    dayfirst=dayfirst,
-                    cache=cache_dates,
-                )
-            except (ValueError, TypeError):
-                # test_usecols_with_parse_dates4
-                return str_objs
-
-            if isinstance(result, DatetimeIndex):
-                arr = result.to_numpy()
-                arr.flags.writeable = True
-                return arr
-            return result._values
-        else:
-            try:
-                pre_parsed = date_parser(
-                    *(unpack_if_single_element(arg) for arg in date_cols)
-                )
-                try:
-                    result = tools.to_datetime(
-                        pre_parsed,
-                        cache=cache_dates,
-                    )
-                except (ValueError, TypeError):
-                    # test_read_csv_with_custom_date_parser
-                    result = pre_parsed
-                if isinstance(result, datetime.datetime):
-                    raise Exception("scalar parser")
-                return result
-            except Exception:
-                # e.g. test_datetime_fractional_seconds
-                pre_parsed = parsing.try_parse_dates(
-                    parsing.concat_date_cols(date_cols),
-                    parser=date_parser,
-                )
-                try:
-                    return tools.to_datetime(pre_parsed)
-                except (ValueError, TypeError):
-                    # TODO: not reached in tests 2023-10-27; needed?
-                    return pre_parsed
+        if isinstance(result, DatetimeIndex):
+            arr = result.to_numpy()
+            arr.flags.writeable = True
+            return arr
+        return result._values
 
     return converter
 
@@ -1230,7 +1182,6 @@ def converter(*date_cols, col: Hashable):
     "parse_dates": False,
     "keep_date_col": False,
     "dayfirst": False,
-    "date_parser": lib.no_default,
     "date_format": None,
     "usecols": None,
     # 'iterator': False,
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -119,7 +119,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
         skip_blank_lines: bool
         parse_dates: bool | Sequence[Hashable] | None
         keep_date_col: bool | lib.NoDefault
-        date_parser: Callable | lib.NoDefault
         date_format: str | dict[Hashable, str] | None
         dayfirst: bool
         cache_dates: bool
@@ -306,8 +305,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
     The behavior is as follows:
 
     * ``bool``. If ``True`` -> try parsing the index.
-    * ``None``. Behaves like ``True`` if ``date_parser`` or ``date_format`` are
-      specified.
+    * ``None``. Behaves like ``True`` if ``date_format`` is specified.
     * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3
       each as a separate date column.
     * ``list`` of ``list``. e.g.  If ``[[1, 3]]`` -> combine columns 1 and 3 and parse
@@ -325,20 +323,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
 keep_date_col : bool, default False
     If ``True`` and ``parse_dates`` specifies combining multiple columns then
     keep the original columns.
-date_parser : Callable, optional
-    Function to use for converting a sequence of string columns to an array of
-    ``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the
-    conversion. pandas will try to call ``date_parser`` in three different ways,
-    advancing to the next if an exception occurs: 1) Pass one or more arrays
-    (as defined by ``parse_dates``) as arguments; 2) concatenate (row-wise) the
-    string values from the columns defined by ``parse_dates`` into a single array
-    and pass that; and 3) call ``date_parser`` once for each row using one or
-    more strings (corresponding to the columns defined by ``parse_dates``) as
-    arguments.
-
-    .. deprecated:: 2.0.0
-       Use ``date_format`` instead, or read in as ``object`` and then apply
-       :func:`~pandas.to_datetime` as-needed.
 date_format : str or dict of column -> format, optional
     Format to use for parsing dates when used in conjunction with ``parse_dates``.
     The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See
@@ -624,13 +608,10 @@ def _read(
     filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
 ) -> DataFrame | TextFileReader:
     """Generic reader of line files."""
-    # if we pass a date_parser and parse_dates=False, we should not parse the
+    # if we pass a date_format and parse_dates=False, we should not parse the
     # dates GH#44366
     if kwds.get("parse_dates", None) is None:
-        if (
-            kwds.get("date_parser", lib.no_default) is lib.no_default
-            and kwds.get("date_format", None) is None
-        ):
+        if kwds.get("date_format", None) is None:
             kwds["parse_dates"] = False
         else:
             kwds["parse_dates"] = True
@@ -749,7 +730,6 @@ def read_csv(
     # Datetime Handling
     parse_dates: bool | Sequence[Hashable] | None = None,
     keep_date_col: bool | lib.NoDefault = lib.no_default,
-    date_parser: Callable | lib.NoDefault = lib.no_default,
     date_format: str | dict[Hashable, str] | None = None,
     dayfirst: bool = False,
     cache_dates: bool = True,
@@ -928,7 +908,6 @@ def read_table(
     # Datetime Handling
     parse_dates: bool | Sequence[Hashable] | None = None,
     keep_date_col: bool | lib.NoDefault = lib.no_default,
-    date_parser: Callable | lib.NoDefault = lib.no_default,
     date_format: str | dict[Hashable, str] | None = None,
     dayfirst: bool = False,
     cache_dates: bool = True,
@@ -1638,9 +1617,6 @@ def TextParser(*args, **kwds) -> TextFileReader:
         Comment out remainder of line
     parse_dates : bool, default False
     keep_date_col : bool, default False
-    date_parser : function, optional
-
-        .. deprecated:: 2.0.0
     date_format : str or dict of column -> format, default ``None``
 
         .. versionadded:: 2.0.0
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
@@ -295,19 +295,6 @@ def test_read_excel_parse_dates(self, tmp_excel):
         res = pd.read_excel(tmp_excel, parse_dates=["date_strings"], index_col=0)
         tm.assert_frame_equal(df, res)
 
-        date_parser = lambda x: datetime.strptime(x, "%m/%d/%Y")
-        with tm.assert_produces_warning(
-            FutureWarning,
-            match="use 'date_format' instead",
-            raise_on_extra_warnings=False,
-        ):
-            res = pd.read_excel(
-                tmp_excel,
-                parse_dates=["date_strings"],
-                date_parser=date_parser,
-                index_col=0,
-            )
-        tm.assert_frame_equal(df, res)
         res = pd.read_excel(
             tmp_excel, parse_dates=["date_strings"], date_format="%m/%d/%Y", index_col=0
         )
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py