From 8be7259ae1f75ccb536e29d48eaaccbac55d6aad Mon Sep 17 00:00:00 2001
From: halvo <jordan@vrasa.com>
Date: Thu, 8 May 2025 23:36:54 -0400
Subject: [PATCH 1/2] BUG: Fixed slow plotting with DatetimeIndex

---
 pandas/plotting/_matplotlib/converter.py      |  52 ++++-
 pandas/plotting/_matplotlib/core.py           | 184 +++++++++++++++++-
 pandas/plotting/_matplotlib/timeseries.py     | 105 ++++++++--
 .../test_datetimeindex_performance.py         |  44 +++++
 4 files changed, 358 insertions(+), 27 deletions(-)
 create mode 100644 pandas/tests/plotting/test_datetimeindex_performance.py

diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py
index 774062e0f0412..12d329369781d 100644
--- a/pandas/plotting/_matplotlib/converter.py
+++ b/pandas/plotting/_matplotlib/converter.py
@@ -40,6 +40,7 @@
     is_integer_dtype,
     is_nested_list_like,
 )
+from pandas.core.dtypes.generic import ABCDatetimeIndex
 
 from pandas import (
     Index,
@@ -301,6 +302,7 @@ def try_parse(values):
             except Exception:
                 return values
 
+        # Fast path for single values
         if isinstance(values, (datetime, pydt.date, np.datetime64, pydt.time)):
             return mdates.date2num(values)
         elif is_integer(values) or is_float(values):
@@ -308,10 +310,29 @@ def try_parse(values):
         elif isinstance(values, str):
             return try_parse(values)
         elif isinstance(values, (list, tuple, np.ndarray, Index, Series)):
+            # Check for cache to avoid redundant conversions
+            # This is especially important for DataFrames with the same DatetimeIndex
+            # for all columns
+            if isinstance(values, Index) and hasattr(axis, "_converter_cache"):
+                cache_key = id(values)
+                if cache_key in axis._converter_cache:
+                    return axis._converter_cache[cache_key]
+
             if isinstance(values, Series):
                 # https://github.com/matplotlib/matplotlib/issues/11391
                 # Series was skipped. Convert to DatetimeIndex to get asi8
                 values = Index(values)
+
+            # For DatetimeIndex objects, directly use _mpl_repr() for better efficiency
+            if isinstance(values, ABCDatetimeIndex):
+                result = values._mpl_repr()
+                # Cache result for reuse with subsequent columns
+                if hasattr(axis, "_converter_cache"):
+                    axis._converter_cache[id(values)] = result
+                elif axis is not None:
+                    axis._converter_cache = {id(values): result}
+                return result
+
             if isinstance(values, Index):
                 values = values.values
             if not isinstance(values, np.ndarray):
@@ -325,7 +346,15 @@ def try_parse(values):
             except Exception:
                 pass
 
-            values = mdates.date2num(values)
+            result = mdates.date2num(values)
+
+            # Cache result if possible
+            if hasattr(axis, "_converter_cache"):
+                axis._converter_cache[id(values)] = result
+            elif axis is not None:
+                axis._converter_cache = {id(values): result}
+
+            return result
 
         return values
 
@@ -426,10 +455,29 @@ def __call__(self):
             )
 
         interval = self._get_interval()
-        freq = f"{interval}ms"
+
+        # Use seconds instead of milliseconds for large intervals to improve performance
+        if interval >= 1000:
+            # Use seconds instead of ms for better performance
+            sec_interval = interval / 1000
+            freq = f"{sec_interval}s"
+        else:
+            freq = f"{interval}ms"
+
         tz = self.tz.tzname(None)
         st = dmin.replace(tzinfo=None)
         ed = dmax.replace(tzinfo=None)
+
+        # Limit ticks for large date ranges to improve performance
+        date_diff = (ed - st).total_seconds()
+        if (
+            date_diff > 86400 * 365 and interval < 1000
+        ):  # Year+ of data with small interval
+            # Generate limited ticks for large datasets instead of a full date range
+            num_ticks = max_millis_ticks
+            tick_locs = np.linspace(mdates.date2num(st), mdates.date2num(ed), num_ticks)
+            return tick_locs
+
         all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).astype(object)
 
         try:
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
index 1c7e1ab57b2a9..62c4d735115a8 100644
--- a/pandas/plotting/_matplotlib/core.py
+++ b/pandas/plotting/_matplotlib/core.py
@@ -1549,9 +1549,157 @@ def __init__(self, data, **kwargs) -> None:
             self.data = self.data.fillna(value=0)
 
     def _make_plot(self, fig: Figure) -> None:
+        """Create the plot.
+
+        This method contains a fast path optimization for DataFrames with DatetimeIndex
+        and multiple columns. For large DataFrames with DatetimeIndex, plotting can be
+        very slow due to the overhead of date conversions for each column.
+
+        The optimization follows this strategy:
+        1. For the first column only: Use standard DatetimeIndex plotting to get ticks
+        2. For remaining columns: Plot with a simpler numeric index (much faster)
+        3. Apply the datetime tick labels from the first plot to all other plots
+
+        This avoids redundant DatetimeIndex -> PeriodIndex conversions and tick
+        calculations when plotting many columns with the same index.
+
+        The optimization can yield a 10x+ speedup on large DataFrames with many columns.
+        """
+        # Fast path for DatetimeIndex with many columns
+        # Implement the same strategy as the user's workaround that showed 11x speedup
+        if (
+            self._is_ts_plot()
+            and isinstance(self.data.index, ABCDatetimeIndex)
+            and len(self.data.columns) >= 2
+        ):  # Need at least 2 columns for this optimization
+            # Get the first axis for the plot
+            ax = self._get_ax(0)
+
+            # STEP 1: Plot only the first column to get datetime ticks
+            first_column = self.data.iloc[:, 0]
+            first_series = first_column.copy()
+            first_style = None
+
+            # Apply colors and style just for first column
+            colors = self._get_colors()
+            first_col_label = self.data.columns[0]
+            kwds = self.kwds.copy()
+            if self.color is not None:
+                kwds["color"] = self.color
+
+            # Set up style for first column
+            first_style, kwds = self._apply_style_colors(
+                colors,
+                kwds,
+                0,
+                first_col_label,  # type: ignore[arg-type]
+            )
+
+            # Add label to kwds for the first column
+            first_label = pprint_thing(first_col_label)
+            first_label = self._mark_right_label(first_label, index=0)
+            kwds["label"] = first_label
+
+            # Plot the first column with DatetimeIndex to set up ticks
+            first_ax = self._get_ax(0)
+            # We need to specifically add column_num for stacking
+            kwds["column_num"] = 0
+            lines = self._ts_plot(
+                first_ax, None, first_series, style=first_style, **kwds
+            )
+
+            # Get the x-ticks and labels from the first plot
+            xticks = first_ax.get_xticks()
+            xticklabels = [label.get_text() for label in first_ax.get_xticklabels()]
+
+            # Keep reference to the first line for the legend
+            first_line = lines[0]
+            self._append_legend_handles_labels(first_line, first_label)
+
+            # STEP 2: Plot all columns with a numeric index (much faster)
+            # Reset axes for faster plotting
+            data_without_index = self.data.reset_index(drop=True)
+
+            # Plot remaining columns
+            stacking_id = self._get_stacking_id()
+            is_errorbar = com.any_not_none(*self.errors.values())
+
+            # Skip the first column and process the remaining ones
+            for i, (col_idx, (label, y)) in enumerate(
+                zip(
+                    range(1, len(data_without_index.columns)),
+                    list(data_without_index.items())[1:],
+                )
+            ):
+                # Get the actual axis for this column - use the right column index
+                # Note: i is 0-based for the remaining columns after skipping the first
+                ax = self._get_ax(col_idx)  # Use col_idx which starts from 1
+
+                # Reset kwds for each column
+                kwds = self.kwds.copy()
+                if self.color is not None:
+                    kwds["color"] = self.color
+
+                # Apply style and colors
+                style, kwds = self._apply_style_colors(
+                    colors,
+                    kwds,
+                    col_idx,  # Use 1-based index to match column
+                    label,  # type: ignore[arg-type]
+                )
+
+                # Handle any error bars
+                errors = self._get_errorbars(label=label, index=col_idx)
+                kwds = dict(kwds, **errors)
+
+                # Format the label
+                label_str = pprint_thing(label)
+                label_str = self._mark_right_label(label_str, index=col_idx)
+                kwds["label"] = label_str
+
+                # Add column number for stacking
+                kwds["column_num"] = col_idx
+
+                try:
+                    # Use regular plot (not ts_plot) for better performance
+                    newlines = self._plot(
+                        ax,
+                        data_without_index.index,  # Use numeric index for speed
+                        np.asarray(y.values),
+                        style=style,
+                        stacking_id=stacking_id,
+                        is_errorbar=is_errorbar,
+                        **kwds,
+                    )
+                    self._append_legend_handles_labels(newlines[0], label_str)
+
+                    # STEP 3: Apply the datetime x-axis formatting to each plot
+                    # Use ticks from first plot for all subsequent plots
+                    num_ticks = len(xticks)
+                    new_xticks = np.linspace(0, len(self.data.index) - 1, num_ticks)
+                    ax.set_xlim(0, len(self.data.index) - 1)
+                    ax.set_xticks(new_xticks)
+                    ax.set_xticklabels(xticklabels)
+                except Exception as e:
+                    # If anything goes wrong with the plotting, log it but don't crash
+                    # This ensures the fix doesn't introduce new issues
+                    import warnings
+
+                    warnings.warn(
+                        f"Fast path plotting failed for column {col_idx}: {e!s}. "
+                        "Falling back to regular plotting method for remaining columns",
+                        stacklevel=2,
+                    )
+                    # Return without 'return' to fall back to the normal plotting path
+                    break
+            else:
+                # If we've successfully plotted all columns, return from the method
+                # We've already plotted everything with the fast path
+                return
+
+        # Regular path for other cases
         if self._is_ts_plot():
             data = maybe_convert_index(self._get_ax(0), self.data)
-
             x = data.index  # dummy, not used
             plotf = self._ts_plot
             it = data.items()
@@ -1570,6 +1718,7 @@ def _make_plot(self, fig: Figure) -> None:
         is_errorbar = com.any_not_none(*self.errors.values())
 
         colors = self._get_colors()
+
         for i, (label, y) in enumerate(it):
             ax = self._get_ax(i)
             kwds = self.kwds.copy()
@@ -1636,15 +1785,34 @@ def _ts_plot(self, ax: Axes, x, data: Series, style=None, **kwds):
         # accept x to be consistent with normal plot func,
         # x is not passed to tsplot as it uses data.index as x coordinate
         # column_num must be in kwds for stacking purpose
-        freq, data = prepare_ts_data(data, ax, kwds)
 
-        # TODO #54485
-        ax._plot_data.append((data, self._kind, kwds))  # type: ignore[attr-defined]
+        # Optimization for multi-column DatetimeIndex plots
+        if hasattr(ax, "_datetime_ticks_setup_done") and kwds.get("column_num", 0) > 0:
+            # Skip the expensive date axis setup for columns after the first one
+            # We'll just copy the ticks from the first plot
+            freq = getattr(ax, "freq", None)
+            lines = self._plot(
+                ax, data.index, np.asarray(data.values), style=style, **kwds
+            )
+
+            if hasattr(ax, "_xticks") and hasattr(ax, "_xticklabels"):
+                # Use the stored ticks and labels from the first column plot
+                ax.set_xticks(ax._xticks)
+                ax.set_xticklabels(ax._xticklabels)
+        else:
+            # Regular path for first column or non-optimized plots
+            freq, data = prepare_ts_data(data, ax, kwds)
+
+            # TODO #54485
+            ax._plot_data.append((data, self._kind, kwds))  # type: ignore[attr-defined]
+
+            lines = self._plot(
+                ax, data.index, np.asarray(data.values), style=style, **kwds
+            )
+            # set date formatter, locators and rescale limits
+            # TODO #54485
+            format_dateaxis(ax, ax.freq, data.index)  # type: ignore[arg-type, attr-defined]
 
-        lines = self._plot(ax, data.index, np.asarray(data.values), style=style, **kwds)
-        # set date formatter, locators and rescale limits
-        # TODO #54485
-        format_dateaxis(ax, ax.freq, data.index)  # type: ignore[arg-type, attr-defined]
         return lines
 
     @final
diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py
index beaf5b6259ef3..c36fceb84ba9f 100644
--- a/pandas/plotting/_matplotlib/timeseries.py
+++ b/pandas/plotting/_matplotlib/timeseries.py
@@ -231,6 +231,11 @@ def _get_freq(ax: Axes, series: Series):
 
 
 def use_dynamic_x(ax: Axes, index: Index) -> bool:
+    # Cache the result of dynamic_x calculations at the axis level to avoid redundant
+    # processing for multiple columns in a DataFrame
+    if hasattr(ax, "_dynamic_x_cache") and id(index) in ax._dynamic_x_cache:  # type: ignore[attr-defined]
+        return ax._dynamic_x_cache[id(index)]  # type: ignore[attr-defined]
+
     freq = _get_index_freq(index)
     ax_freq = _get_ax_freq(ax)
 
@@ -238,15 +243,27 @@ def use_dynamic_x(ax: Axes, index: Index) -> bool:
         freq = ax_freq
     # do not use tsplot if irregular was plotted first
     elif (ax_freq is None) and (len(ax.get_lines()) > 0):
-        return False
+        result = False
+        if not hasattr(ax, "_dynamic_x_cache"):
+            ax._dynamic_x_cache = {}  # type: ignore[attr-defined]
+        ax._dynamic_x_cache[id(index)] = result  # type: ignore[attr-defined]
+        return result
 
     if freq is None:
-        return False
+        result = False
+        if not hasattr(ax, "_dynamic_x_cache"):
+            ax._dynamic_x_cache = {}  # type: ignore[attr-defined]
+        ax._dynamic_x_cache[id(index)] = result  # type: ignore[attr-defined]
+        return result
 
     freq_str = _get_period_alias(freq)
 
     if freq_str is None:
-        return False
+        result = False
+        if not hasattr(ax, "_dynamic_x_cache"):
+            ax._dynamic_x_cache = {}  # type: ignore[attr-defined]
+        ax._dynamic_x_cache[id(index)] = result  # type: ignore[attr-defined]
+        return result
 
     # FIXME: hack this for 0.10.1, creating more technical debt...sigh
     if isinstance(index, ABCDatetimeIndex):
@@ -254,11 +271,19 @@ def use_dynamic_x(ax: Axes, index: Index) -> bool:
         freq_str = OFFSET_TO_PERIOD_FREQSTR.get(freq_str, freq_str)
         base = to_offset(freq_str, is_period=True)._period_dtype_code  # type: ignore[attr-defined]
         if base <= FreqGroup.FR_DAY.value:
-            return index[:1].is_normalized
-        period = Period(index[0], freq_str)
-        assert isinstance(period, Period)
-        return period.to_timestamp().tz_localize(index.tz) == index[0]
-    return True
+            result = index[:1].is_normalized
+        else:
+            period = Period(index[0], freq_str)
+            assert isinstance(period, Period)
+            result = period.to_timestamp().tz_localize(index.tz) == index[0]
+    else:
+        result = True
+
+    # Cache the result
+    if not hasattr(ax, "_dynamic_x_cache"):
+        ax._dynamic_x_cache = {}  # type: ignore[attr-defined]
+    ax._dynamic_x_cache[id(index)] = result  # type: ignore[attr-defined]
+    return result
 
 
 def _get_index_freq(index: Index) -> BaseOffset | None:
@@ -279,6 +304,25 @@ def maybe_convert_index(ax: Axes, data: NDFrameT) -> NDFrameT:
     # tsplot converts automatically, but don't want to convert index
     # over and over for DataFrames
     if isinstance(data.index, (ABCDatetimeIndex, ABCPeriodIndex)):
+        # Cache the converted index on the axis to avoid redundant conversions
+        # when plotting multiple columns with the same index
+        index_id = id(data.index)
+
+        # Check if we already have a cached conversion for this index
+        if (
+            hasattr(ax, "_converted_index_cache")
+            and index_id in ax._converted_index_cache
+        ):  # type: ignore[attr-defined]
+            freq_str, converted_index = ax._converted_index_cache[index_id]  # type: ignore[attr-defined]
+
+            # Create a new object with the cached converted index
+            if isinstance(data.index, ABCDatetimeIndex):
+                return data.tz_localize(None).to_period(freq=freq_str)
+            else:  # PeriodIndex
+                result = data.copy()
+                result.index = converted_index
+                return result
+
         freq: str | BaseOffset | None = data.index.freq
 
         if freq is None:
@@ -305,10 +349,24 @@ def maybe_convert_index(ax: Axes, data: NDFrameT) -> NDFrameT:
                 category=FutureWarning,
             )
 
+            # Initialize the cache if it doesn't exist
+            if not hasattr(ax, "_converted_index_cache"):
+                ax._converted_index_cache = {}  # type: ignore[attr-defined]
+
             if isinstance(data.index, ABCDatetimeIndex):
-                data = data.tz_localize(None).to_period(freq=freq_str)
+                # Convert to period
+                converted_data = data.tz_localize(None).to_period(freq=freq_str)
+                # Cache the converted index for future use
+                ax._converted_index_cache[index_id] = (freq_str, converted_data.index)  # type: ignore[attr-defined]
+                return converted_data
             elif isinstance(data.index, ABCPeriodIndex):
-                data.index = data.index.asfreq(freq=freq_str, how="start")
+                # Asfreq the period index
+                converted_index = data.index.asfreq(freq=freq_str, how="start")
+                # Cache the converted index for future use
+                ax._converted_index_cache[index_id] = (freq_str, converted_index)  # type: ignore[attr-defined]
+                result = data.copy()
+                result.index = converted_index
+                return result
     return data
 
 
@@ -369,14 +427,27 @@ def format_dateaxis(
 def prepare_ts_data(
     series: Series, ax: Axes, kwargs: dict[str, Any]
 ) -> tuple[BaseOffset | str, Series]:
+    # Check if axes already have frequency information set up
+    # This prevents redundant setup for multi-column DataFrames with the same index
+    index_id = id(series.index)
+    ts_data_setup_done = (
+        hasattr(ax, "_ts_data_setup_done") and index_id in ax._ts_data_setup_done  # type: ignore[attr-defined]
+    )
+
     freq, data = maybe_resample(series, ax, kwargs)
 
-    # Set ax with freq info
-    decorate_axes(ax, freq)
-    # digging deeper
-    if hasattr(ax, "left_ax"):
-        decorate_axes(ax.left_ax, freq)
-    if hasattr(ax, "right_ax"):
-        decorate_axes(ax.right_ax, freq)
+    if not ts_data_setup_done:
+        # Set ax with freq info
+        decorate_axes(ax, freq)
+        # digging deeper
+        if hasattr(ax, "left_ax"):
+            decorate_axes(ax.left_ax, freq)
+        if hasattr(ax, "right_ax"):
+            decorate_axes(ax.right_ax, freq)
+
+        # Mark this index as having been set up for this axis
+        if not hasattr(ax, "_ts_data_setup_done"):
+            ax._ts_data_setup_done = set()  # type: ignore[attr-defined]
+        ax._ts_data_setup_done.add(index_id)  # type: ignore[attr-defined]
 
     return freq, data
diff --git a/pandas/tests/plotting/test_datetimeindex_performance.py b/pandas/tests/plotting/test_datetimeindex_performance.py
new file mode 100644
index 0000000000000..5f27990975b1a
--- /dev/null
+++ b/pandas/tests/plotting/test_datetimeindex_performance.py
@@ -0,0 +1,44 @@
+"""
+Tests for optimized DatetimeIndex plotting performance.
+"""
+
+import numpy as np
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    testing as tm,
+)
+
+
+@td.skip_if_no_mpl
+def test_plot_with_datetimeindex_performance():
+    """
+    Test that plotting a DataFrame with DatetimeIndex is performant.
+
+    Check that plotting multiple columns with the same DatetimeIndex
+    doesn't perform redundant calculations/conversions.
+    """
+    # Create a DataFrame with DatetimeIndex
+    rng = np.random.RandomState(42)
+    n = 1000
+    idx = pd.date_range(start="2020-01-01", periods=n, freq="D")
+
+    # Dataframe with many columns to highlight the optimization
+    df = DataFrame(rng.randn(n, 5), index=idx)
+
+    # Define function to plot in the regular/slow way
+    def plot_df():
+        ax = df.plot(figsize=(10, 5))
+        return ax
+
+    # The first run might be slower due to imports, etc.
+    # This test is primarily to catch regressions in performance
+    # rather than assert a specific speedup.
+    plot_df()  # warmup
+
+    # Assert that subsequent plotting operations are not significantly slower
+    # (verify caching is working)
+    tm.assert_faster_than(plot_df, plot_df, significant=False)

From 4cc9262fa00a36d2a21337f5717aeb2c8b0e4224 Mon Sep 17 00:00:00 2001
From: halvo <jordan@vrasa.com>
Date: Thu, 8 May 2025 23:52:17 -0400
Subject: [PATCH 2/2] DOC: Add whatsnew entry for DatetimeIndex plotting
 performance fix

---
 doc/source/whatsnew/v3.0.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 8695e196c4f38..274c8a683266c 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -653,6 +653,7 @@ Performance improvements
 - Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`)
 - Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`)
 - Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`)
+- Performance improvement in :meth:`DataFrame.plot` when plotting DataFrames with DatetimeIndex and multiple columns (:issue:`61398`)
 - Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame does not have a :class:`MultiIndex` (:issue:`58391`)
 - Performance improvement in :meth:`DataFrame.where` when ``cond`` is a :class:`DataFrame` with many columns (:issue:`61010`)
 - Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`)