From 389cb7001e58c0d03693ccd7fc55737cc0b3db19 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 25 Jan 2024 14:35:29 +0100 Subject: [PATCH 1/3] Correctyl expand penalties when cat_missing_method=convert --- src/glum/_glm.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 33afb37f..ca26e4e6 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -2717,7 +2717,9 @@ def _set_up_and_check_fit_args( if any(X.dtypes == "category"): - def _expand_categorical_penalties(penalty, X, drop_first): + def _expand_categorical_penalties( + penalty, X, drop_first, has_missing_category + ): """ If P1 or P2 has the same shape as X before expanding the categoricals, we assume that the penalty at the location of @@ -2741,19 +2743,29 @@ def _expand_categorical_penalties(penalty, X, drop_first): chain.from_iterable( [ elmt - for _ in dtype.categories[int(drop_first) :] + for _ in range( + len(dtype.categories) + + has_missing_category[col] + - drop_first + ) ] if pd.api.types.is_categorical_dtype(dtype) else [elmt] - for elmt, dtype in zip(penalty, X.dtypes) + for elmt, (col, dtype) in zip( + penalty, X.dtypes.items() + ) ) ) ) else: return penalty - P1 = _expand_categorical_penalties(self.P1, X, self.drop_first) - P2 = _expand_categorical_penalties(self.P2, X, self.drop_first) + P1 = _expand_categorical_penalties( + self.P1, X, self.drop_first, self.has_missing_category_ + ) + P2 = _expand_categorical_penalties( + self.P2, X, self.drop_first, self.has_missing_category_ + ) X = tm.from_pandas( X, From 7ddc9c2571d6879273673f78fd3ff1a09b5ba5aa Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 25 Jan 2024 14:43:36 +0100 Subject: [PATCH 2/3] Add test --- tests/glm/test_glm.py | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index e9d2bb3a..013890d7 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -53,7 +53,7 @@ def get_small_x_y( - estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV] + estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV], ) -> tuple[np.ndarray, np.ndarray]: if isinstance(estimator, GeneralizedLinearRegressor): n_rows = 1 @@ -362,6 +362,43 @@ def test_P1_P2_expansion_with_categoricals(): np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) +def test_P1_P2_expansion_with_categoricals_missings(): + rng = np.random.default_rng(42) + X = pd.DataFrame( + data={ + "dense": np.linspace(0, 10, 60), + "cat": pd.Categorical(rng.integers(5, size=60)).remove_categories(0), + } + ) + y = rng.normal(size=60) + + mdl1 = GeneralizedLinearRegressor( + l1_ratio=0.01, + P1=[1, 2, 2, 2, 2, 2], + P2=[2, 1, 1, 1, 1, 1], + cat_missing_method="convert", + ) + mdl1.fit(X, y) + + mdl2 = GeneralizedLinearRegressor( + l1_ratio=0.01, + P1=[1, 2], + P2=[2, 1], + cat_missing_method="convert", + ) + mdl2.fit(X, y) + np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) + + mdl2 = GeneralizedLinearRegressor( + l1_ratio=0.01, + P1=[1, 2], + P2=sparse.diags([2, 1, 1, 1, 1, 1]), + cat_missing_method="convert", + ) + mdl2.fit(X, y) + np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) + + @pytest.mark.parametrize( "estimator", [GeneralizedLinearRegressor, GeneralizedLinearRegressorCV] ) From 745c5c46112b6df33d598893b5b8a7d922d91d55 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 29 Jan 2024 10:02:09 +0100 Subject: [PATCH 3/3] Improve variable names Co-authored-by: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> --- tests/glm/test_glm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 013890d7..469f464e 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -389,14 +389,14 @@ def test_P1_P2_expansion_with_categoricals_missings(): mdl2.fit(X, y) np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) - mdl2 = GeneralizedLinearRegressor( + mdl3 = GeneralizedLinearRegressor( l1_ratio=0.01, P1=[1, 2], P2=sparse.diags([2, 1, 1, 1, 1, 1]), cat_missing_method="convert", ) - mdl2.fit(X, y) - np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) + mdl3.fit(X, y) + np.testing.assert_allclose(mdl1.coef_, mdl3.coef_) @pytest.mark.parametrize(