Add support for pandas 2 (#4216)

christopherbunn · web-flow · commit 5b80a8e53ed8 · 2023-07-27T17:43:49.000Z
* Squashed changes

* Ignored index

* Disabled column checking

* Reverted deleted code

* Updated pyproject.toml

* Replaced version check code
diff --git a/.github/meta.yaml b/.github/meta.yaml
@@ -25,7 +25,7 @@ outputs:
         - setuptools ==58.0.4
       run:
         - numpy >=1.21.0
-        - pandas >=1.5.0, <2.0.0
+        - pandas >=1.5.0
         - dask >=2022.2.0, !=2022.10.1
         - scipy >=1.5.0
         - scikit-learn >=1.3.0
diff --git a/core-requirements.txt b/core-requirements.txt
@@ -1,5 +1,5 @@
 numpy>=1.21.0
-pandas>=1.5.0, <2.0.0
+pandas>=1.5.0
 scipy>=1.5.0
 scikit-learn>=1.3.0
 scikit-optimize>=0.9.0
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -5,6 +5,7 @@ Release Notes
         * Updated regression metrics to handle multioutput dataframes as well as single output series :pr:`4233`
         * Added stacking and unstacking utility functions to work with multiseries data :pr:`4250`
     * Fixes
+        * Added support for pandas 2 :pr:`4216`
     * Changes
         * Unpinned sktime version :pr:`4214`
         * Bumped minimum lightgbm version to 4.0.0 for nullable type handling :pr:`4237`
diff --git a/docs/source/user_guide/timeseries.ipynb b/docs/source/user_guide/timeseries.ipynb
@@ -996,8 +996,8 @@
     "        ),\n",
     "        # Plot prediction intervals\n",
     "        go.Scatter(\n",
-    "            x=X_forecast_dates[\"Date\"].append(X_forecast_dates[\"Date\"][::-1]),\n",
-    "            y=y_upper.append(y_lower[::-1]),\n",
+    "            x=pd.concat([X_forecast_dates[\"Date\"], X_forecast_dates[\"Date\"][::-1]]),\n",
+    "            y=pd.concat([y_upper, y_lower[::-1]]),\n",
     "            fill=\"toself\",\n",
     "            fillcolor=\"rgba(255,0,0,0.2)\",\n",
     "            line=dict(color=\"rgba(255,0,0,0.2)\"),\n",
diff --git a/evalml/model_understanding/visualizations.py b/evalml/model_understanding/visualizations.py
@@ -472,8 +472,8 @@ def get_linear_coefficients(estimator, features=None):
     coef_.name = "Coefficients"
     coef_.index = features
     coef_ = coef_.sort_values()
-    coef_ = pd.Series(estimator._component_obj.intercept_, index=["Intercept"]).append(
-        coef_,
+    coef_ = pd.concat(
+        [pd.Series(estimator._component_obj.intercept_, index=["Intercept"]), coef_],
     )
 
     return coef_
diff --git a/evalml/pipelines/components/transformers/encoders/onehot_encoder.py b/evalml/pipelines/components/transformers/encoders/onehot_encoder.py
@@ -152,7 +152,7 @@ def fit(self, X, y=None):
                         random_state=self._initial_state,
                     )
                     value_counts = value_counts.sort_values(
-                        [col],
+                        value_counts.iloc[:, 0].name,
                         ascending=False,
                         kind="mergesort",
                     )
diff --git a/evalml/pipelines/components/transformers/preprocessing/decomposer.py b/evalml/pipelines/components/transformers/preprocessing/decomposer.py
@@ -151,7 +151,9 @@ def determine_periodicity(
                 period is detected, returns None.
 
         """
-        X, y = cls._handle_nullable_types(cls, X, y)
+        # Only need to handle nullable types on pandas < 2. Kept for backwards compatibility with pandas 1.x.
+        if int(pd.__version__.split(".")[0]) < 2:
+            X, y = cls._handle_nullable_types(cls, X, y)
 
         def _get_rel_max_from_acf(y):
             """Determines the relative maxima of the target's autocorrelation."""
diff --git a/evalml/pipelines/components/transformers/preprocessing/polynomial_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/polynomial_decomposer.py
@@ -267,7 +267,7 @@ def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]:
                     index=truncated_y_t.index,
                 ),
             )
-        y = y_in_sample.append(y_out_of_sample)
+        y = pd.concat([y_in_sample, y_out_of_sample])
         y.index = original_index
         return y
 
diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py
@@ -245,7 +245,7 @@ def transform(
                     index=truncated_y.index,
                 ),
             )
-        y_t = y_in_sample.append(y_out_of_sample)
+        y_t = pd.concat([y_in_sample, y_out_of_sample])
         y_t.index = original_index
         return X, y_t
 
@@ -317,7 +317,7 @@ def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]:
                     index=truncated_y_t.index,
                 ),
             )
-        y = y_in_sample.append(y_out_of_sample)
+        y = pd.concat([y_in_sample, y_out_of_sample])
         y.index = original_index
         return y
 
diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py
@@ -184,19 +184,17 @@ def target_distribution(targets):
 
     Examples:
         >>> y = pd.Series([1, 2, 4, 1, 3, 3, 1, 2])
-        >>> target_distribution(y)
+        >>> print(target_distribution(y).to_string())
         Targets
         1    37.50%
         2    25.00%
         3    25.00%
         4    12.50%
-        dtype: object
         >>> y = pd.Series([True, False, False, False, True])
-        >>> target_distribution(y)
+        >>> print(target_distribution(y).to_string())
         Targets
         False    60.00%
         True     40.00%
-        dtype: object
     """
     distribution = targets.value_counts() / len(targets)
     return distribution.mul(100).apply("{:.2f}%".format).rename_axis("Targets")
diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py
@@ -292,9 +292,8 @@ def test_decomposer_build_seasonal_signal(
     X, _, y = ts_data()
 
     # Change the date time index to start at the same time but have different frequency
-    y.set_axis(
+    y = y.set_axis(
         pd.date_range(start="2021-01-01", periods=len(y), freq=frequency),
-        inplace=True,
     )
 
     decomposer = decomposer_child_class(degree=2)
@@ -497,7 +496,12 @@ def test_decomposer_determine_periodicity(
         True,
         pytest.param(
             False,
-            marks=pytest.mark.xfail(strict=True, raises=AssertionError),
+            marks=pytest.mark.xfail(
+                condition=int(pd.__version__.split(".")[0]) < 2,
+                strict=True,
+                raises=AssertionError,
+                reason="pandas 1.x does not recognize np.Nan in Float64 subtracted_floats.",
+            ),
         ),
     ],
 )
@@ -749,12 +753,20 @@ def test_decomposer_inverse_transform(
                 output_inverse_y = decomposer.inverse_transform(y_t_new)
         else:
             output_inverse_y = decomposer.inverse_transform(y_t_new)
+            # Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 on Windows,
+            # we need to test the index equivalence separately.
             pd.testing.assert_series_equal(
                 y[y_t_new.index],
                 output_inverse_y,
                 check_exact=False,
+                check_index=False,
                 rtol=1.0e-1,
             )
+            pd.testing.assert_index_equal(
+                y[y_t_new.index].index,
+                output_inverse_y.index,
+                exact=False,
+            )
 
 
 @pytest.mark.parametrize(
diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py
@@ -181,13 +181,20 @@ def test_stl_decomposer_inverse_transform(
             ):
                 output_inverse_y = decomposer.inverse_transform(y_t_new)
         else:
+            # Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 on Windows,
+            # we need to test the index equivalence separately.
             output_inverse_y = decomposer.inverse_transform(y_t_new)
             pd.testing.assert_series_equal(
                 y[y_t_new.index],
                 output_inverse_y,
-                check_exact=False,
+                check_index=False,
                 rtol=1.0e-2,
             )
+            pd.testing.assert_index_equal(
+                y[y_t_new.index].index,
+                output_inverse_y.index,
+                exact=False,
+            )
 
 
 @pytest.mark.parametrize(
diff --git a/evalml/tests/component_tests/test_datetime_featurizer.py b/evalml/tests/component_tests/test_datetime_featurizer.py
@@ -77,10 +77,10 @@ def test_datetime_featurizer_encodes_as_ints():
     # Test that changing encode_as_categories to True only changes the dtypes but not the values
     dt_with_cats = DateTimeFeaturizer(encode_as_categories=True)
     X_transformed_df = dt_with_cats.fit_transform(X)
-    expected["date_month"] = pd.Categorical([3, 2, 6, 7, 0])
-    expected["date_day_of_week"] = pd.Categorical([0, 3, 2, 1, 5])
+    expected["date_month"] = pd.Categorical([3, 2, 6, 7, 0]).astype("category")
+    expected["date_day_of_week"] = pd.Categorical([0, 3, 2, 1, 5]).astype("category")
 
-    assert_frame_equal(expected, X_transformed_df)
+    assert_frame_equal(expected, X_transformed_df, check_categorical=False)
     assert dt_with_cats.get_feature_names() == feature_names
 
     # Test that sequential calls to the same DateTimeFeaturizer work as expected by using the first dt we defined
@@ -250,7 +250,10 @@ def test_datetime_featurizer_no_datetime_cols():
 
 def test_datetime_featurizer_numpy_array_input():
     datetime_transformer = DateTimeFeaturizer()
-    X = np.array([["2007-02-03"], ["2016-06-07"], ["2020-05-19"]], dtype="datetime64")
+    X = np.array(
+        [["2007-02-03"], ["2016-06-07"], ["2020-05-19"]],
+        dtype="datetime64[ns]",
+    )
     datetime_transformer.fit(X)
     assert list(datetime_transformer.transform(X).columns) == [
         "0_year",
diff --git a/evalml/tests/component_tests/test_imputer.py b/evalml/tests/component_tests/test_imputer.py
@@ -219,11 +219,21 @@ def test_drop_all_columns(imputer_test_data):
     imputer.fit(X, y)
     transformed = imputer.transform(X, y)
     expected = X.drop(["all nan cat", "all nan"], axis=1)
-    assert_frame_equal(transformed, expected, check_dtype=False)
+    assert_frame_equal(
+        transformed,
+        expected,
+        check_column_type=False,
+        check_index_type=False,
+    )
 
     imputer = Imputer()
     transformed = imputer.fit_transform(X, y)
-    assert_frame_equal(transformed, expected, check_dtype=False)
+    assert_frame_equal(
+        transformed,
+        expected,
+        check_column_type=False,
+        check_index_type=False,
+    )
 
 
 def test_typed_imputer_numpy_input():
@@ -271,11 +281,21 @@ def test_imputer_empty_data(data_type, make_data_type):
     imputer = Imputer()
     imputer.fit(X, y)
     transformed = imputer.transform(X, y)
-    assert_frame_equal(transformed, expected, check_dtype=False)
+    assert_frame_equal(
+        transformed,
+        expected,
+        check_column_type=False,
+        check_index_type=False,
+    )
 
     imputer = Imputer()
     transformed = imputer.fit_transform(X, y)
-    assert_frame_equal(transformed, expected, check_dtype=False)
+    assert_frame_equal(
+        transformed,
+        expected,
+        check_column_type=False,
+        check_index_type=False,
+    )
 
 
 def test_imputer_does_not_reset_index():
@@ -508,7 +528,9 @@ def test_imputer_with_none_separated(
         for col in set(columns_dict["categoricals_only"]).intersection(
             set(X_test.columns),
         ):
-            expected_df[col].cat.add_categories(categorical_fill_value, inplace=True)
+            expected_df[col] = expected_df[col].cat.add_categories(
+                categorical_fill_value,
+            )
             expected_df[col].iloc[-1:] = categorical_fill_value
     if boolean_impute_strategy == "constant":
         for col in set(columns_dict["booleans_only"]).intersection(set(X_test.columns)):
diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py
@@ -164,11 +164,25 @@ def test_correct_args(mock_predict, mock_predict_proba, X_y_binary):
 
     clf.predict(X)
     arg_X = mock_predict.call_args[0][0]
-    assert_frame_equal(X_expected, arg_X)
+    # Index type checking ignored so the test can pass on Windows
+    # X_expected is int32, arg_X is int64
+    assert_frame_equal(
+        X_expected,
+        arg_X,
+        check_index_type=False,
+        check_column_type=False,
+    )
 
     clf.predict_proba(X)
     arg_X = mock_predict_proba.call_args[0][0]
-    assert_frame_equal(X_expected, arg_X)
+    # Index type checking ignored so the test can pass on Windows
+    # X_expected is int32, arg_X is int64
+    assert_frame_equal(
+        X_expected,
+        arg_X,
+        check_index_type=False,
+        check_column_type=False,
+    )
 
 
 @patch("evalml.pipelines.components.estimators.estimator.Estimator.predict_proba")
diff --git a/evalml/tests/component_tests/test_lgbm_regressor.py b/evalml/tests/component_tests/test_lgbm_regressor.py
@@ -118,7 +118,14 @@ def test_correct_args(mock_predict, X_y_regression):
 
     clf.predict(X)
     arg_X = mock_predict.call_args[0][0]
-    assert_frame_equal(X_expected, arg_X)
+    # Index type checking ignored so the test can pass on Windows
+    # X_expected is int32, arg_X is int64
+    assert_frame_equal(
+        X_expected,
+        arg_X,
+        check_index_type=False,
+        check_column_type=False,
+    )
 
 
 @patch("evalml.pipelines.components.estimators.estimator.Estimator.predict")
diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py
@@ -420,7 +420,7 @@ def test_more_top_n_unique_values():
     col_1_counts = X["col_1"].value_counts(dropna=False).to_frame()
     col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed)
     col_1_counts = col_1_counts.sort_values(
-        ["col_1"],
+        col_1_counts.iloc[:, 0].name,
         ascending=False,
         kind="mergesort",
     )
@@ -429,7 +429,7 @@ def test_more_top_n_unique_values():
     col_2_counts = X["col_2"].value_counts(dropna=False).to_frame()
     col_2_counts = col_2_counts.sample(frac=1, random_state=random_seed)
     col_2_counts = col_2_counts.sort_values(
-        ["col_2"],
+        col_2_counts.iloc[:, 0].name,
         ascending=False,
         kind="mergesort",
     )
@@ -466,7 +466,7 @@ def test_more_top_n_unique_values_large():
     col_1_counts = X["col_1"].value_counts(dropna=False).to_frame()
     col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed)
     col_1_counts = col_1_counts.sort_values(
-        ["col_1"],
+        col_1_counts.iloc[:, 0].name,
         ascending=False,
         kind="mergesort",
     )
diff --git a/evalml/tests/component_tests/test_oversampler.py b/evalml/tests/component_tests/test_oversampler.py
@@ -109,6 +109,7 @@ def test_oversample_imbalanced_binary(data_type, oversampler_type, make_data_typ
         value_counts,
         pd.Series([850, 850]),
         check_dtype=False,
+        check_names=False,
     )
 
     oversampler = Oversampler(sampling_ratio=1)
diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py
@@ -557,14 +557,14 @@ def test_simple_imputer_ignores_natural_language(
 
     if df_composition == "full_df":
         if numeric_impute_strategy == "mean" and has_nan == "has_nan":
-            ans = X_df.mean()
+            ans = X_df.mean(numeric_only=True)
             ans["natural language col"] = pd.NA
             X_df = X_df.astype(
                 {"int col": float},
             )
             X_df.iloc[-1, :] = ans
         elif numeric_impute_strategy == "median" and has_nan == "has_nan":
-            ans = X_df.median()
+            ans = X_df.median(numeric_only=True)
             ans["natural language col"] = pd.NA
             X_df = X_df.astype(
                 {"int col": float},
diff --git a/evalml/tests/component_tests/test_target_encoder.py b/evalml/tests/component_tests/test_target_encoder.py
@@ -144,7 +144,7 @@ def test_cols():
             ),
         },
     )
-    assert_frame_equal(X_expected, X_t, check_less_precise=True)
+    assert_frame_equal(X_expected, X_t, rtol=1e-3)
 
     encoder = TargetEncoder(cols=["col_3"])
     encoder.fit(X, y)
diff --git a/evalml/tests/component_tests/test_time_series_imputer.py b/evalml/tests/component_tests/test_time_series_imputer.py
diff --git a/evalml/tests/component_tests/test_time_series_regularizer.py b/evalml/tests/component_tests/test_time_series_regularizer.py
diff --git a/evalml/tests/component_tests/test_undersampler.py b/evalml/tests/component_tests/test_undersampler.py
diff --git a/evalml/tests/data_checks_tests/test_class_imbalance_data_check.py b/evalml/tests/data_checks_tests/test_class_imbalance_data_check.py
diff --git a/evalml/tests/data_checks_tests/test_datetime_format_data_check.py b/evalml/tests/data_checks_tests/test_datetime_format_data_check.py
diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt
diff --git a/evalml/tests/model_understanding_tests/test_visualizations.py b/evalml/tests/model_understanding_tests/test_visualizations.py
diff --git a/pyproject.toml b/pyproject.toml

Original file line number	Diff line number	Diff line change
`@@ -152,7 +152,7 @@ def fit(self, X, y=None):`
`152`	`152`	`random_state=self._initial_state,`
`153`	`153`	`)`
`154`	`154`	`value_counts = value_counts.sort_values(`
`155`		`- [col],`
	`155`	`+ value_counts.iloc[:, 0].name,`
`156`	`156`	`ascending=False,`
`157`	`157`	`kind="mergesort",`
`158`	`158`	`)`
Original file line number	Diff line number	Diff line change
`@@ -267,7 +267,7 @@ def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]:`
`267`	`267`	`index=truncated_y_t.index,`
`268`	`268`	`),`
`269`	`269`	`)`
`270`		`- y = y_in_sample.append(y_out_of_sample)`
	`270`	`+ y = pd.concat([y_in_sample, y_out_of_sample])`
`271`	`271`	`y.index = original_index`
`272`	`272`	`return y`
`273`	`273`
Original file line number	Diff line number	Diff line change
`@@ -109,6 +109,7 @@ def test_oversample_imbalanced_binary(data_type, oversampler_type, make_data_typ`
`109`	`109`	`value_counts,`
`110`	`110`	`pd.Series([850, 850]),`
`111`	`111`	`check_dtype=False,`
	`112`	`+ check_names=False,`
`112`	`113`	`)`
`113`	`114`
`114`	`115`	`oversampler = Oversampler(sampling_ratio=1)`
Original file line number	Diff line number	Diff line change
`@@ -144,7 +144,7 @@ def test_cols():`
`144`	`144`	`),`
`145`	`145`	`},`
`146`	`146`	`)`
`147`		`- assert_frame_equal(X_expected, X_t, check_less_precise=True)`
	`147`	`+ assert_frame_equal(X_expected, X_t, rtol=1e-3)`
`148`	`148`
`149`	`149`	`encoder = TargetEncoder(cols=["col_3"])`
`150`	`150`	`encoder.fit(X, y)`