Skip to content

Commit 5b80a8e

Browse files
Add support for pandas 2 (#4216)
* Squashed changes * Ignored index * Disabled column checking * Reverted deleted code * Updated pyproject.toml * Replaced version check code
1 parent b398501 commit 5b80a8e

28 files changed

+147
-58
lines changed

.github/meta.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ outputs:
2525
- setuptools ==58.0.4
2626
run:
2727
- numpy >=1.21.0
28-
- pandas >=1.5.0, <2.0.0
28+
- pandas >=1.5.0
2929
- dask >=2022.2.0, !=2022.10.1
3030
- scipy >=1.5.0
3131
- scikit-learn >=1.3.0

core-requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
numpy>=1.21.0
2-
pandas>=1.5.0, <2.0.0
2+
pandas>=1.5.0
33
scipy>=1.5.0
44
scikit-learn>=1.3.0
55
scikit-optimize>=0.9.0

docs/source/release_notes.rst

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ Release Notes
55
* Updated regression metrics to handle multioutput dataframes as well as single output series :pr:`4233`
66
* Added stacking and unstacking utility functions to work with multiseries data :pr:`4250`
77
* Fixes
8+
* Added support for pandas 2 :pr:`4216`
89
* Changes
910
* Unpinned sktime version :pr:`4214`
1011
* Bumped minimum lightgbm version to 4.0.0 for nullable type handling :pr:`4237`

docs/source/user_guide/timeseries.ipynb

+2-2
Original file line numberDiff line numberDiff line change
@@ -996,8 +996,8 @@
996996
" ),\n",
997997
" # Plot prediction intervals\n",
998998
" go.Scatter(\n",
999-
" x=X_forecast_dates[\"Date\"].append(X_forecast_dates[\"Date\"][::-1]),\n",
1000-
" y=y_upper.append(y_lower[::-1]),\n",
999+
" x=pd.concat([X_forecast_dates[\"Date\"], X_forecast_dates[\"Date\"][::-1]]),\n",
1000+
" y=pd.concat([y_upper, y_lower[::-1]]),\n",
10011001
" fill=\"toself\",\n",
10021002
" fillcolor=\"rgba(255,0,0,0.2)\",\n",
10031003
" line=dict(color=\"rgba(255,0,0,0.2)\"),\n",

evalml/model_understanding/visualizations.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -472,8 +472,8 @@ def get_linear_coefficients(estimator, features=None):
472472
coef_.name = "Coefficients"
473473
coef_.index = features
474474
coef_ = coef_.sort_values()
475-
coef_ = pd.Series(estimator._component_obj.intercept_, index=["Intercept"]).append(
476-
coef_,
475+
coef_ = pd.concat(
476+
[pd.Series(estimator._component_obj.intercept_, index=["Intercept"]), coef_],
477477
)
478478

479479
return coef_

evalml/pipelines/components/transformers/encoders/onehot_encoder.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ def fit(self, X, y=None):
152152
random_state=self._initial_state,
153153
)
154154
value_counts = value_counts.sort_values(
155-
[col],
155+
value_counts.iloc[:, 0].name,
156156
ascending=False,
157157
kind="mergesort",
158158
)

evalml/pipelines/components/transformers/preprocessing/decomposer.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,9 @@ def determine_periodicity(
151151
period is detected, returns None.
152152
153153
"""
154-
X, y = cls._handle_nullable_types(cls, X, y)
154+
# Only need to handle nullable types on pandas < 2. Kept for backwards compatibility with pandas 1.x.
155+
if int(pd.__version__.split(".")[0]) < 2:
156+
X, y = cls._handle_nullable_types(cls, X, y)
155157

156158
def _get_rel_max_from_acf(y):
157159
"""Determines the relative maxima of the target's autocorrelation."""

evalml/pipelines/components/transformers/preprocessing/polynomial_decomposer.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]:
267267
index=truncated_y_t.index,
268268
),
269269
)
270-
y = y_in_sample.append(y_out_of_sample)
270+
y = pd.concat([y_in_sample, y_out_of_sample])
271271
y.index = original_index
272272
return y
273273

evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ def transform(
245245
index=truncated_y.index,
246246
),
247247
)
248-
y_t = y_in_sample.append(y_out_of_sample)
248+
y_t = pd.concat([y_in_sample, y_out_of_sample])
249249
y_t.index = original_index
250250
return X, y_t
251251

@@ -317,7 +317,7 @@ def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]:
317317
index=truncated_y_t.index,
318318
),
319319
)
320-
y = y_in_sample.append(y_out_of_sample)
320+
y = pd.concat([y_in_sample, y_out_of_sample])
321321
y.index = original_index
322322
return y
323323

evalml/preprocessing/utils.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -184,19 +184,17 @@ def target_distribution(targets):
184184
185185
Examples:
186186
>>> y = pd.Series([1, 2, 4, 1, 3, 3, 1, 2])
187-
>>> target_distribution(y)
187+
>>> print(target_distribution(y).to_string())
188188
Targets
189189
1 37.50%
190190
2 25.00%
191191
3 25.00%
192192
4 12.50%
193-
dtype: object
194193
>>> y = pd.Series([True, False, False, False, True])
195-
>>> target_distribution(y)
194+
>>> print(target_distribution(y).to_string())
196195
Targets
197196
False 60.00%
198197
True 40.00%
199-
dtype: object
200198
"""
201199
distribution = targets.value_counts() / len(targets)
202200
return distribution.mul(100).apply("{:.2f}%".format).rename_axis("Targets")

evalml/tests/component_tests/decomposer_tests/test_decomposer.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -292,9 +292,8 @@ def test_decomposer_build_seasonal_signal(
292292
X, _, y = ts_data()
293293

294294
# Change the date time index to start at the same time but have different frequency
295-
y.set_axis(
295+
y = y.set_axis(
296296
pd.date_range(start="2021-01-01", periods=len(y), freq=frequency),
297-
inplace=True,
298297
)
299298

300299
decomposer = decomposer_child_class(degree=2)
@@ -497,7 +496,12 @@ def test_decomposer_determine_periodicity(
497496
True,
498497
pytest.param(
499498
False,
500-
marks=pytest.mark.xfail(strict=True, raises=AssertionError),
499+
marks=pytest.mark.xfail(
500+
condition=int(pd.__version__.split(".")[0]) < 2,
501+
strict=True,
502+
raises=AssertionError,
503+
reason="pandas 1.x does not recognize np.Nan in Float64 subtracted_floats.",
504+
),
501505
),
502506
],
503507
)
@@ -749,12 +753,20 @@ def test_decomposer_inverse_transform(
749753
output_inverse_y = decomposer.inverse_transform(y_t_new)
750754
else:
751755
output_inverse_y = decomposer.inverse_transform(y_t_new)
756+
# Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 on Windows,
757+
# we need to test the indices equivalence separately.
752758
pd.testing.assert_series_equal(
753759
y[y_t_new.index],
754760
output_inverse_y,
755761
check_exact=False,
762+
check_index=False,
756763
rtol=1.0e-1,
757764
)
765+
pd.testing.assert_index_equal(
766+
y[y_t_new.index].index,
767+
output_inverse_y.index,
768+
exact=False,
769+
)
758770

759771

760772
@pytest.mark.parametrize(

evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -181,13 +181,20 @@ def test_stl_decomposer_inverse_transform(
181181
):
182182
output_inverse_y = decomposer.inverse_transform(y_t_new)
183183
else:
184+
# Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 on Windows,
185+
# we need to test the indices equivalence separately.
184186
output_inverse_y = decomposer.inverse_transform(y_t_new)
185187
pd.testing.assert_series_equal(
186188
y[y_t_new.index],
187189
output_inverse_y,
188-
check_exact=False,
190+
check_index=False,
189191
rtol=1.0e-2,
190192
)
193+
pd.testing.assert_index_equal(
194+
y[y_t_new.index].index,
195+
output_inverse_y.index,
196+
exact=False,
197+
)
191198

192199

193200
@pytest.mark.parametrize(

evalml/tests/component_tests/test_datetime_featurizer.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,10 @@ def test_datetime_featurizer_encodes_as_ints():
7777
# Test that changing encode_as_categories to True only changes the dtypes but not the values
7878
dt_with_cats = DateTimeFeaturizer(encode_as_categories=True)
7979
X_transformed_df = dt_with_cats.fit_transform(X)
80-
expected["date_month"] = pd.Categorical([3, 2, 6, 7, 0])
81-
expected["date_day_of_week"] = pd.Categorical([0, 3, 2, 1, 5])
80+
expected["date_month"] = pd.Categorical([3, 2, 6, 7, 0]).astype("category")
81+
expected["date_day_of_week"] = pd.Categorical([0, 3, 2, 1, 5]).astype("category")
8282

83-
assert_frame_equal(expected, X_transformed_df)
83+
assert_frame_equal(expected, X_transformed_df, check_categorical=False)
8484
assert dt_with_cats.get_feature_names() == feature_names
8585

8686
# Test that sequential calls to the same DateTimeFeaturizer work as expected by using the first dt we defined
@@ -250,7 +250,10 @@ def test_datetime_featurizer_no_datetime_cols():
250250

251251
def test_datetime_featurizer_numpy_array_input():
252252
datetime_transformer = DateTimeFeaturizer()
253-
X = np.array([["2007-02-03"], ["2016-06-07"], ["2020-05-19"]], dtype="datetime64")
253+
X = np.array(
254+
[["2007-02-03"], ["2016-06-07"], ["2020-05-19"]],
255+
dtype="datetime64[ns]",
256+
)
254257
datetime_transformer.fit(X)
255258
assert list(datetime_transformer.transform(X).columns) == [
256259
"0_year",

evalml/tests/component_tests/test_imputer.py

+27-5
Original file line numberDiff line numberDiff line change
@@ -219,11 +219,21 @@ def test_drop_all_columns(imputer_test_data):
219219
imputer.fit(X, y)
220220
transformed = imputer.transform(X, y)
221221
expected = X.drop(["all nan cat", "all nan"], axis=1)
222-
assert_frame_equal(transformed, expected, check_dtype=False)
222+
assert_frame_equal(
223+
transformed,
224+
expected,
225+
check_column_type=False,
226+
check_index_type=False,
227+
)
223228

224229
imputer = Imputer()
225230
transformed = imputer.fit_transform(X, y)
226-
assert_frame_equal(transformed, expected, check_dtype=False)
231+
assert_frame_equal(
232+
transformed,
233+
expected,
234+
check_column_type=False,
235+
check_index_type=False,
236+
)
227237

228238

229239
def test_typed_imputer_numpy_input():
@@ -271,11 +281,21 @@ def test_imputer_empty_data(data_type, make_data_type):
271281
imputer = Imputer()
272282
imputer.fit(X, y)
273283
transformed = imputer.transform(X, y)
274-
assert_frame_equal(transformed, expected, check_dtype=False)
284+
assert_frame_equal(
285+
transformed,
286+
expected,
287+
check_column_type=False,
288+
check_index_type=False,
289+
)
275290

276291
imputer = Imputer()
277292
transformed = imputer.fit_transform(X, y)
278-
assert_frame_equal(transformed, expected, check_dtype=False)
293+
assert_frame_equal(
294+
transformed,
295+
expected,
296+
check_column_type=False,
297+
check_index_type=False,
298+
)
279299

280300

281301
def test_imputer_does_not_reset_index():
@@ -508,7 +528,9 @@ def test_imputer_with_none_separated(
508528
for col in set(columns_dict["categoricals_only"]).intersection(
509529
set(X_test.columns),
510530
):
511-
expected_df[col].cat.add_categories(categorical_fill_value, inplace=True)
531+
expected_df[col] = expected_df[col].cat.add_categories(
532+
categorical_fill_value,
533+
)
512534
expected_df[col].iloc[-1:] = categorical_fill_value
513535
if boolean_impute_strategy == "constant":
514536
for col in set(columns_dict["booleans_only"]).intersection(set(X_test.columns)):

evalml/tests/component_tests/test_lgbm_classifier.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -164,11 +164,25 @@ def test_correct_args(mock_predict, mock_predict_proba, X_y_binary):
164164

165165
clf.predict(X)
166166
arg_X = mock_predict.call_args[0][0]
167-
assert_frame_equal(X_expected, arg_X)
167+
# Index type checking ignored so the test can pass on Windows
168+
# X_expected is int32, arg_X is int64
169+
assert_frame_equal(
170+
X_expected,
171+
arg_X,
172+
check_index_type=False,
173+
check_column_type=False,
174+
)
168175

169176
clf.predict_proba(X)
170177
arg_X = mock_predict_proba.call_args[0][0]
171-
assert_frame_equal(X_expected, arg_X)
178+
# Index type checking ignored so the test can pass on Windows
179+
# X_expected is int32, arg_X is int64
180+
assert_frame_equal(
181+
X_expected,
182+
arg_X,
183+
check_index_type=False,
184+
check_column_type=False,
185+
)
172186

173187

174188
@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict_proba")

evalml/tests/component_tests/test_lgbm_regressor.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,14 @@ def test_correct_args(mock_predict, X_y_regression):
118118

119119
clf.predict(X)
120120
arg_X = mock_predict.call_args[0][0]
121-
assert_frame_equal(X_expected, arg_X)
121+
# Index type checking ignored so the test can pass on Windows
122+
# X_expected is int32, arg_X is int64
123+
assert_frame_equal(
124+
X_expected,
125+
arg_X,
126+
check_index_type=False,
127+
check_column_type=False,
128+
)
122129

123130

124131
@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict")

evalml/tests/component_tests/test_one_hot_encoder.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,7 @@ def test_more_top_n_unique_values():
420420
col_1_counts = X["col_1"].value_counts(dropna=False).to_frame()
421421
col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed)
422422
col_1_counts = col_1_counts.sort_values(
423-
["col_1"],
423+
col_1_counts.iloc[:, 0].name,
424424
ascending=False,
425425
kind="mergesort",
426426
)
@@ -429,7 +429,7 @@ def test_more_top_n_unique_values():
429429
col_2_counts = X["col_2"].value_counts(dropna=False).to_frame()
430430
col_2_counts = col_2_counts.sample(frac=1, random_state=random_seed)
431431
col_2_counts = col_2_counts.sort_values(
432-
["col_2"],
432+
col_2_counts.iloc[:, 0].name,
433433
ascending=False,
434434
kind="mergesort",
435435
)
@@ -466,7 +466,7 @@ def test_more_top_n_unique_values_large():
466466
col_1_counts = X["col_1"].value_counts(dropna=False).to_frame()
467467
col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed)
468468
col_1_counts = col_1_counts.sort_values(
469-
["col_1"],
469+
col_1_counts.iloc[:, 0].name,
470470
ascending=False,
471471
kind="mergesort",
472472
)

evalml/tests/component_tests/test_oversampler.py

+1
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def test_oversample_imbalanced_binary(data_type, oversampler_type, make_data_typ
109109
value_counts,
110110
pd.Series([850, 850]),
111111
check_dtype=False,
112+
check_names=False,
112113
)
113114

114115
oversampler = Oversampler(sampling_ratio=1)

evalml/tests/component_tests/test_simple_imputer.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -557,14 +557,14 @@ def test_simple_imputer_ignores_natural_language(
557557

558558
if df_composition == "full_df":
559559
if numeric_impute_strategy == "mean" and has_nan == "has_nan":
560-
ans = X_df.mean()
560+
ans = X_df.mean(numeric_only=True)
561561
ans["natural language col"] = pd.NA
562562
X_df = X_df.astype(
563563
{"int col": float},
564564
)
565565
X_df.iloc[-1, :] = ans
566566
elif numeric_impute_strategy == "median" and has_nan == "has_nan":
567-
ans = X_df.median()
567+
ans = X_df.median(numeric_only=True)
568568
ans["natural language col"] = pd.NA
569569
X_df = X_df.astype(
570570
{"int col": float},

evalml/tests/component_tests/test_target_encoder.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def test_cols():
144144
),
145145
},
146146
)
147-
assert_frame_equal(X_expected, X_t, check_less_precise=True)
147+
assert_frame_equal(X_expected, X_t, rtol=1e-3)
148148

149149
encoder = TargetEncoder(cols=["col_3"])
150150
encoder.fit(X, y)

0 commit comments

Comments
 (0)