From 389cb7001e58c0d03693ccd7fc55737cc0b3db19 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@gmail.com>
Date: Thu, 25 Jan 2024 14:35:29 +0100
Subject: [PATCH 1/3] Correctyl expand penalties when
 cat_missing_method=convert

---
 src/glum/_glm.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index 33afb37f..ca26e4e6 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -2717,7 +2717,9 @@ def _set_up_and_check_fit_args(
 
                 if any(X.dtypes == "category"):
 
-                    def _expand_categorical_penalties(penalty, X, drop_first):
+                    def _expand_categorical_penalties(
+                        penalty, X, drop_first, has_missing_category
+                    ):
                         """
                         If P1 or P2 has the same shape as X before expanding the
                         categoricals, we assume that the penalty at the location of
@@ -2741,19 +2743,29 @@ def _expand_categorical_penalties(penalty, X, drop_first):
                                     chain.from_iterable(
                                         [
                                             elmt
-                                            for _ in dtype.categories[int(drop_first) :]
+                                            for _ in range(
+                                                len(dtype.categories)
+                                                + has_missing_category[col]
+                                                - drop_first
+                                            )
                                         ]
                                         if pd.api.types.is_categorical_dtype(dtype)
                                         else [elmt]
-                                        for elmt, dtype in zip(penalty, X.dtypes)
+                                        for elmt, (col, dtype) in zip(
+                                            penalty, X.dtypes.items()
+                                        )
                                     )
                                 )
                             )
                         else:
                             return penalty
 
-                    P1 = _expand_categorical_penalties(self.P1, X, self.drop_first)
-                    P2 = _expand_categorical_penalties(self.P2, X, self.drop_first)
+                    P1 = _expand_categorical_penalties(
+                        self.P1, X, self.drop_first, self.has_missing_category_
+                    )
+                    P2 = _expand_categorical_penalties(
+                        self.P2, X, self.drop_first, self.has_missing_category_
+                    )
 
                 X = tm.from_pandas(
                     X,

From 7ddc9c2571d6879273673f78fd3ff1a09b5ba5aa Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@gmail.com>
Date: Thu, 25 Jan 2024 14:43:36 +0100
Subject: [PATCH 2/3] Add test

---
 tests/glm/test_glm.py | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index e9d2bb3a..013890d7 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -53,7 +53,7 @@
 
 
 def get_small_x_y(
-    estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV]
+    estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV],
 ) -> tuple[np.ndarray, np.ndarray]:
     if isinstance(estimator, GeneralizedLinearRegressor):
         n_rows = 1
@@ -362,6 +362,43 @@ def test_P1_P2_expansion_with_categoricals():
     np.testing.assert_allclose(mdl1.coef_, mdl2.coef_)
 
 
+def test_P1_P2_expansion_with_categoricals_missings():
+    rng = np.random.default_rng(42)
+    X = pd.DataFrame(
+        data={
+            "dense": np.linspace(0, 10, 60),
+            "cat": pd.Categorical(rng.integers(5, size=60)).remove_categories(0),
+        }
+    )
+    y = rng.normal(size=60)
+
+    mdl1 = GeneralizedLinearRegressor(
+        l1_ratio=0.01,
+        P1=[1, 2, 2, 2, 2, 2],
+        P2=[2, 1, 1, 1, 1, 1],
+        cat_missing_method="convert",
+    )
+    mdl1.fit(X, y)
+
+    mdl2 = GeneralizedLinearRegressor(
+        l1_ratio=0.01,
+        P1=[1, 2],
+        P2=[2, 1],
+        cat_missing_method="convert",
+    )
+    mdl2.fit(X, y)
+    np.testing.assert_allclose(mdl1.coef_, mdl2.coef_)
+
+    mdl2 = GeneralizedLinearRegressor(
+        l1_ratio=0.01,
+        P1=[1, 2],
+        P2=sparse.diags([2, 1, 1, 1, 1, 1]),
+        cat_missing_method="convert",
+    )
+    mdl2.fit(X, y)
+    np.testing.assert_allclose(mdl1.coef_, mdl2.coef_)
+
+
 @pytest.mark.parametrize(
     "estimator", [GeneralizedLinearRegressor, GeneralizedLinearRegressorCV]
 )

From 745c5c46112b6df33d598893b5b8a7d922d91d55 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@gmail.com>
Date: Mon, 29 Jan 2024 10:02:09 +0100
Subject: [PATCH 3/3] Improve variable names

Co-authored-by: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com>
---
 tests/glm/test_glm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index 013890d7..469f464e 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -389,14 +389,14 @@ def test_P1_P2_expansion_with_categoricals_missings():
     mdl2.fit(X, y)
     np.testing.assert_allclose(mdl1.coef_, mdl2.coef_)
 
-    mdl2 = GeneralizedLinearRegressor(
+    mdl3 = GeneralizedLinearRegressor(
         l1_ratio=0.01,
         P1=[1, 2],
         P2=sparse.diags([2, 1, 1, 1, 1, 1]),
         cat_missing_method="convert",
     )
-    mdl2.fit(X, y)
-    np.testing.assert_allclose(mdl1.coef_, mdl2.coef_)
+    mdl3.fit(X, y)
+    np.testing.assert_allclose(mdl1.coef_, mdl3.coef_)
 
 
 @pytest.mark.parametrize(