diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index 33afb37f..ca26e4e6 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -2717,7 +2717,9 @@ def _set_up_and_check_fit_args(
 
         if any(X.dtypes == "category"):
 
-            def _expand_categorical_penalties(penalty, X, drop_first):
+            def _expand_categorical_penalties(
+                penalty, X, drop_first, has_missing_category
+            ):
                 """
                 If P1 or P2 has the same shape as X before expanding the
                 categoricals, we assume that the penalty at the location of
@@ -2741,19 +2743,29 @@ def _expand_categorical_penalties(penalty, X, drop_first):
                             chain.from_iterable(
                                 [
                                     elmt
-                                    for _ in dtype.categories[int(drop_first) :]
+                                    for _ in range(
+                                        len(dtype.categories)
+                                        + has_missing_category[col]
+                                        - drop_first
+                                    )
                                 ]
                                 if pd.api.types.is_categorical_dtype(dtype)
                                 else [elmt]
-                                for elmt, dtype in zip(penalty, X.dtypes)
+                                for elmt, (col, dtype) in zip(
+                                    penalty, X.dtypes.items()
+                                )
                             )
                         )
                     )
                 else:
                     return penalty
 
-            P1 = _expand_categorical_penalties(self.P1, X, self.drop_first)
-            P2 = _expand_categorical_penalties(self.P2, X, self.drop_first)
+            P1 = _expand_categorical_penalties(
+                self.P1, X, self.drop_first, self.has_missing_category_
+            )
+            P2 = _expand_categorical_penalties(
+                self.P2, X, self.drop_first, self.has_missing_category_
+            )
 
         X = tm.from_pandas(
             X,
diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index e9d2bb3a..469f464e 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -53,7 +53,7 @@ def get_small_x_y(
-    estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV]
+    estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV],
 ) -> tuple[np.ndarray, np.ndarray]:
     if isinstance(estimator, GeneralizedLinearRegressor):
         n_rows = 1
@@ -362,6 +362,43 @@ def test_P1_P2_expansion_with_categoricals():
     np.testing.assert_allclose(mdl1.coef_, mdl2.coef_)
 
 
+def test_P1_P2_expansion_with_categoricals_missings():
+    rng = np.random.default_rng(42)
+    X = pd.DataFrame(
+        data={
+            "dense": np.linspace(0, 10, 60),
+            "cat": pd.Categorical(rng.integers(5, size=60)).remove_categories(0),
+        }
+    )
+    y = rng.normal(size=60)
+
+    mdl1 = GeneralizedLinearRegressor(
+        l1_ratio=0.01,
+        P1=[1, 2, 2, 2, 2, 2],
+        P2=[2, 1, 1, 1, 1, 1],
+        cat_missing_method="convert",
+    )
+    mdl1.fit(X, y)
+
+    mdl2 = GeneralizedLinearRegressor(
+        l1_ratio=0.01,
+        P1=[1, 2],
+        P2=[2, 1],
+        cat_missing_method="convert",
+    )
+    mdl2.fit(X, y)
+    np.testing.assert_allclose(mdl1.coef_, mdl2.coef_)
+
+    mdl3 = GeneralizedLinearRegressor(
+        l1_ratio=0.01,
+        P1=[1, 2],
+        P2=sparse.diags([2, 1, 1, 1, 1, 1]),
+        cat_missing_method="convert",
+    )
+    mdl3.fit(X, y)
+    np.testing.assert_allclose(mdl1.coef_, mdl3.coef_)
+
+
 @pytest.mark.parametrize(
     "estimator", [GeneralizedLinearRegressor, GeneralizedLinearRegressorCV]
 )
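
Note (not part of the patch): the change above broadcasts a per-input-column penalty over the columns that each categorical produces after encoding, now also counting the extra level created for missing values (the patched code reads self.has_missing_category_, which is presumably populated elsewhere during fit when cat_missing_method="convert"). Below is a minimal standalone sketch of that bookkeeping, with illustrative names (expand_penalty, has_missing) that are not part of glum's API:

import numpy as np
import pandas as pd
from itertools import chain


def expand_penalty(penalty, X, drop_first, has_missing_category):
    """Repeat each categorical column's penalty once per expanded level."""
    return np.array(
        list(
            chain.from_iterable(
                # one penalty entry per category, plus one for the missing
                # level if present, minus one if the first level is dropped
                [elmt] * (len(dtype.categories) + has_missing_category[col] - drop_first)
                if isinstance(dtype, pd.CategoricalDtype)
                else [elmt]
                for elmt, (col, dtype) in zip(penalty, X.dtypes.items())
            )
        )
    )


rng = np.random.default_rng(42)
X = pd.DataFrame(
    {
        "dense": np.linspace(0, 10, 60),
        "cat": pd.Categorical(rng.integers(5, size=60)).remove_categories(0),
    }
)
# remove_categories(0) leaves NaNs behind; with cat_missing_method="convert"
# those NaNs become an extra "missing" level, hence the +1 for "cat".
has_missing = {col: int(X[col].isna().any()) for col in X}
print(expand_penalty([1, 2], X, drop_first=False, has_missing_category=has_missing))
# -> [1 2 2 2 2 2]: one entry for "dense", five for "cat" (4 categories + missing)

With drop_first=True each categorical gets one repetition fewer, which is what the "- drop_first" term accounts for; this is how the shorter P1=[1, 2] / P2=[2, 1] in the new test ends up matching the fully expanded P1/P2 of mdl1.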