From 4d574349ffcf93eacba66ac349b025f5355a5dba Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Wed, 10 Jan 2024 16:06:15 +0100
Subject: [PATCH 01/23] drop missings not seen in training

---
 src/glum/_glm.py      |  7 +++++--
 tests/glm/test_glm.py | 15 ++++++++++-----
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index 5b2f9b0d2..7d3c80d63 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -880,10 +880,11 @@ def _convert_from_pandas(
         self, df: pd.DataFrame, context: Optional[Mapping[str, Any]] = None
     ) -> tm.MatrixBase:
         """Convert a pandas data frame to a tabmat matrix."""
-
         if hasattr(self, "X_model_spec_"):
             return self.X_model_spec_.get_model_matrix(df, context=context)
 
+        cat_missing_method = self.cat_missing_method
+
         if hasattr(self, "feature_dtypes_"):
             df = _align_df_categories(df, self.feature_dtypes_)
             if self.cat_missing_method == "convert":
@@ -894,12 +895,14 @@ def _convert_from_pandas(
                     cat_missing_name=self.cat_missing_name,
                     categorical_format=self.categorical_format,
                 )
+                # drop categories that were not seen in training
+                cat_missing_method = "drop"
 
         X = tm.from_pandas(
             df,
             drop_first=self.drop_first,
             categorical_format=self.categorical_format,
-            cat_missing_method=self.cat_missing_method,
+            cat_missing_method=cat_missing_method,
         )
 
         return X
diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index 590ca6780..a8d998b30 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -2967,7 +2967,9 @@ def get_mixed_data():
         pytest.param("y ~ c1 + 1", id="categorical_intercept"),
         pytest.param("y ~ x1 * c1 * c2", id="interaction"),
         pytest.param("y ~ x1 + x2 + c1 + c2", id="numeric_and_categorical"),
-        pytest.param("y ~ x1 + x2 + c1 + c2 + 1", id="numeric_and_categorical_intercept"),
+        pytest.param(
+            "y ~ x1 + x2 + c1 + c2 + 1", id="numeric_and_categorical_intercept"
+        ),
     ],
 )
 @pytest.mark.parametrize(
@@ -3158,20 +3160,23 @@ def test_formula_predict(get_mixed_data, formula):
 
 
 @pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"])
-def test_cat_missing(cat_missing_method):
+@pytest.mark.parametrize("unseen_missing", [False, True])
+def test_cat_missing(cat_missing_method, unseen_missing):
     X = pd.DataFrame(
         {
             "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]),
             "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]),
         }
     )
+    if unseen_missing:
+        X = X.dropna()
     X_unseen = pd.DataFrame(
         {
             "cat_1": pd.Categorical([1, pd.NA]),
             "cat_2": pd.Categorical([1, 2]),
         }
     )
-    y = np.array([1, 2, 3, 4, 5])
+    y = np.array(X.index)
 
     model = GeneralizedLinearRegressor(
         family="normal",
@@ -3180,14 +3185,14 @@ def test_cat_missing(cat_missing_method):
         fit_intercept=False,
     )
 
-    if cat_missing_method == "fail":
+    if cat_missing_method == "fail" and not unseen_missing:
         with pytest.raises(ValueError):
             model.fit(X, y)
     else:
         model.fit(X, y)
         feature_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"]
 
-        if cat_missing_method == "convert":
+        if cat_missing_method == "convert" and not unseen_missing:
             feature_names.insert(2, "cat_1[(MISSING)]")
             feature_names.append("cat_2[(MISSING)]")
 

From 34ad571f71a63ada9ab83de93dcc78213fc6d817 Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Wed, 10 Jan 2024 16:13:55 +0100
Subject: [PATCH 02/23] use "zero" rather than "drop" as cat_missing_method

---
 src/glum/_glm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index 7d3c80d63..c4123abb3 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -896,7 +896,7 @@ def _convert_from_pandas(
                     categorical_format=self.categorical_format,
                 )
                 # drop categories that were not seen in training
-                cat_missing_method = "drop"
+                cat_missing_method = "zero"
 
         X = tm.from_pandas(
             df,

From fedd9b1d3b969a3bc952c54d47e05889cb46d05e Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Wed, 10 Jan 2024 16:16:18 +0100
Subject: [PATCH 03/23] better (?) name [skip ci]

---
 src/glum/_glm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index c4123abb3..5a325bd81 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -883,7 +883,7 @@ def _convert_from_pandas(
         if hasattr(self, "X_model_spec_"):
             return self.X_model_spec_.get_model_matrix(df, context=context)
 
-        cat_missing_method = self.cat_missing_method
+        cat_missing_method_after_cat_check = self.cat_missing_method
 
         if hasattr(self, "feature_dtypes_"):
             df = _align_df_categories(df, self.feature_dtypes_)
@@ -896,13 +896,13 @@ def _convert_from_pandas(
                     categorical_format=self.categorical_format,
                 )
                 # drop categories that were not seen in training
-                cat_missing_method = "zero"
+                cat_missing_method_after_cat_check = "zero"
 
         X = tm.from_pandas(
             df,
             drop_first=self.drop_first,
             categorical_format=self.categorical_format,
-            cat_missing_method=cat_missing_method,
+            cat_missing_method=cat_missing_method_after_cat_check,
         )
 
         return X

From 6d2b4310b9169dbd91411238ac96eafa8b8aeb1c Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Wed, 10 Jan 2024 16:16:18 +0100
Subject: [PATCH 04/23] catch case of unseen missings and fail method

---
 tests/glm/test_glm.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index a8d998b30..0bc3c68e2 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -3186,7 +3186,7 @@ def test_cat_missing(cat_missing_method, unseen_missing):
     )
 
     if cat_missing_method == "fail" and not unseen_missing:
-        with pytest.raises(ValueError):
+        with pytest.raises(ValueError, match="Categorical data can't have missing values"):
             model.fit(X, y)
     else:
         model.fit(X, y)
@@ -3199,4 +3199,8 @@ def test_cat_missing(cat_missing_method, unseen_missing):
         np.testing.assert_array_equal(model.feature_names_, feature_names)
         assert len(model.coef_) == len(feature_names)
 
-        model.predict(X_unseen)
+        if cat_missing_method == "fail" and unseen_missing:
+            with pytest.raises(ValueError, match="Categorical data can't have missing values"):
+                model.predict(X_unseen)
+        else:
+            model.predict(X_unseen)

From 0aaf521b095a27be52058f225e6780dbf08b487b Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Wed, 10 Jan 2024 16:31:30 +0100
Subject: [PATCH 05/23] fix formatting of long pytest.raises lines

---
 tests/glm/test_glm.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index 0bc3c68e2..de4ff1390 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -3186,7 +3186,9 @@ def test_cat_missing(cat_missing_method, unseen_missing):
     )
 
     if cat_missing_method == "fail" and not unseen_missing:
-        with pytest.raises(ValueError, match="Categorical data can't have missing values"):
+        with pytest.raises(
+            ValueError, match="Categorical data can't have missing values"
+        ):
             model.fit(X, y)
     else:
         model.fit(X, y)
@@ -3200,7 +3202,9 @@ def test_cat_missing(cat_missing_method, unseen_missing):
         assert len(model.coef_) == len(feature_names)
 
         if cat_missing_method == "fail" and unseen_missing:
-            with pytest.raises(ValueError, match="Categorical data can't have missing values"):
+            with pytest.raises(
+                ValueError, match="Categorical data can't have missing values"
+            ):
                 model.predict(X_unseen)
         else:
             model.predict(X_unseen)

From dcdb326e014e28e1abc9fbe057a14b203096e355 Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Thu, 11 Jan 2024 09:30:44 +0100
Subject: [PATCH 06/23] respect categorical missing method with formula; test
 different categorical missing methods also with formula

---
 src/glum/_glm.py      |  1 +
 tests/glm/test_glm.py | 55 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index 5a325bd81..474b2c945 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -2669,6 +2669,7 @@ def _set_up_and_check_fit_args(
                     include_intercept=False,
                     ensure_full_rank=self.drop_first,
                     categorical_format=self.categorical_format,
+                    cat_missing_method=self.cat_missing_method,
                     interaction_separator=self.interaction_separator,
                     add_column_for_intercept=False,
                     context=context,
diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index de4ff1390..46004cd3f 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -3166,6 +3166,7 @@ def test_cat_missing(cat_missing_method, unseen_missing):
         {
             "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]),
             "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]),
+            "x1": [1, 2, 3, 4, 5],
         }
     )
     if unseen_missing:
@@ -3174,6 +3175,7 @@ def test_cat_missing(cat_missing_method, unseen_missing):
         {
             "cat_1": pd.Categorical([1, pd.NA]),
             "cat_2": pd.Categorical([1, 2]),
+            "x1": [1, 2],
         }
     )
     y = np.array(X.index)
@@ -3208,3 +3210,56 @@ def test_cat_missing(cat_missing_method, unseen_missing):
                 model.predict(X_unseen)
         else:
             model.predict(X_unseen)
+
+
+@pytest.mark.parametrize("cat_missing_method", ["zero", "convert"])
+@pytest.mark.parametrize("unseen_missing", [False, True])
+@pytest.mark.parametrize("formula", [None, "cat_1 + cat_2"])
+def test_cat_missing_formula(cat_missing_method, unseen_missing, formula):
+    X = pd.DataFrame(
+        {
+            "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]),
+            "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]),
+        }
+    )
+    if unseen_missing:
+        X = X.dropna()
+    X_unseen = pd.DataFrame(
+        {
+            "cat_1": pd.Categorical([1, pd.NA]),
+            "cat_2": pd.Categorical([1, 2]),
+        }
+    )
+    y = np.array(X.index)
+
+    model = GeneralizedLinearRegressor(
+        family="normal",
+        cat_missing_method=cat_missing_method,
+        drop_first=False,
+        formula=formula,
+        fit_intercept=False,
+    )
+
+    if cat_missing_method == "fail" and not unseen_missing:
+        with pytest.raises(
+            ValueError, match="Categorical data can't have missing values"
+        ):
+            model.fit(X, y)
+    else:
+        model.fit(X, y)
+        feature_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"]
+
+        if cat_missing_method == "convert" and not unseen_missing:
+            feature_names.insert(2, "cat_1[(MISSING)]")
+            feature_names.append("cat_2[(MISSING)]")
+
+        np.testing.assert_array_equal(model.feature_names_, feature_names)
+        assert len(model.coef_) == len(feature_names)
+
+        if cat_missing_method == "fail" and unseen_missing:
+            with pytest.raises(
+                ValueError, match="Categorical data can't have missing values"
+            ):
+                model.predict(X_unseen)
+        else:
+            model.predict(X_unseen)

From 24bfb37e332d76692d028a8e474803552b581a6c Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Thu, 11 Jan 2024 09:32:45 +0100
Subject: [PATCH 07/23] shorten the tests

---
 tests/glm/test_glm.py | 55 +------------------------------------------
 1 file changed, 1 insertion(+), 54 deletions(-)

diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index 46004cd3f..ffc475f23 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -3161,61 +3161,8 @@ def test_formula_predict(get_mixed_data, formula):
 
 @pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"])
 @pytest.mark.parametrize("unseen_missing", [False, True])
-def test_cat_missing(cat_missing_method, unseen_missing):
-    X = pd.DataFrame(
-        {
-            "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]),
-            "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]),
-            "x1": [1, 2, 3, 4, 5],
-        }
-    )
-    if unseen_missing:
-        X = X.dropna()
-    X_unseen = pd.DataFrame(
-        {
-            "cat_1": pd.Categorical([1, pd.NA]),
-            "cat_2": pd.Categorical([1, 2]),
-            "x1": [1, 2],
-        }
-    )
-    y = np.array(X.index)
-
-    model = GeneralizedLinearRegressor(
-        family="normal",
-        cat_missing_method=cat_missing_method,
-        drop_first=False,
-        fit_intercept=False,
-    )
-
-    if cat_missing_method == "fail" and not unseen_missing:
-        with pytest.raises(
-            ValueError, match="Categorical data can't have missing values"
-        ):
-            model.fit(X, y)
-    else:
-        model.fit(X, y)
-        feature_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"]
-
-        if cat_missing_method == "convert" and not unseen_missing:
-            feature_names.insert(2, "cat_1[(MISSING)]")
-            feature_names.append("cat_2[(MISSING)]")
-
-        np.testing.assert_array_equal(model.feature_names_, feature_names)
-        assert len(model.coef_) == len(feature_names)
-
-        if cat_missing_method == "fail" and unseen_missing:
-            with pytest.raises(
-                ValueError, match="Categorical data can't have missing values"
-            ):
-                model.predict(X_unseen)
-        else:
-            model.predict(X_unseen)
-
-
-@pytest.mark.parametrize("cat_missing_method", ["zero", "convert"])
-@pytest.mark.parametrize("unseen_missing", [False, True])
 @pytest.mark.parametrize("formula", [None, "cat_1 + cat_2"])
-def test_cat_missing_formula(cat_missing_method, unseen_missing, formula):
+def test_cat_missing(cat_missing_method, unseen_missing, formula):
     X = pd.DataFrame(
         {
             "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]),

From ca19b688210a6fbc4f6e30725602853ae9912354 Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Thu, 11 Jan 2024 10:03:15 +0100
Subject: [PATCH 08/23] don't allow fitting with
 cat_missing_method="convert" when a formula is given

---
 src/glum/_glm.py      | 5 +++++
 tests/glm/test_glm.py | 7 ++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index 474b2c945..e5cf3d838 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -2642,6 +2642,11 @@ def _set_up_and_check_fit_args(
 
         if isinstance(X, pd.DataFrame):
             if hasattr(self, "formula") and self.formula is not None:
+                if self.cat_missing_method == "convert":
+                    raise NotImplementedError(
+                        "Conversion of missing categoricals with a formula is currently not supported."
+                    )
+
                 lhs, rhs = _parse_formula(
                     self.formula, include_intercept=self.fit_intercept
                 )
diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index ffc475f23..e4f9d103b 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -3186,7 +3186,12 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula):
         formula=formula,
         fit_intercept=False,
     )
-
+    if cat_missing_method == "convert" and formula:
+        with pytest.raises(
+            NotImplementedError,
+            match="Conversion of missing categoricals with a formula",
+        ):
+            model.fit(X, y)
     if cat_missing_method == "fail" and not unseen_missing:
         with pytest.raises(
             ValueError, match="Categorical data can't have missing values"

From 74a5329846e84972003c4adb5c6b78f6d993617d Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Thu, 11 Jan 2024 10:12:39 +0100
Subject: [PATCH 09/23] clearer error msg

---
 src/glum/_glm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index e5cf3d838..fe838dafb 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -2644,7 +2644,8 @@ def _set_up_and_check_fit_args(
             if hasattr(self, "formula") and self.formula is not None:
                 if self.cat_missing_method == "convert":
                     raise NotImplementedError(
-                        "Conversion of missing categoricals with a formula is currently not supported."
+                        "cat_missing_method == 'convert' with a formula "
+                        "is not allowed."
                     )
 
                 lhs, rhs = _parse_formula(

From e2786049d72977a1ddc5262dacbadee2f3732513 Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Thu, 11 Jan 2024 10:16:44 +0100
Subject: [PATCH 10/23] also change the error msg in the regex (facepalm)

---
 tests/glm/test_glm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index e4f9d103b..22d763be4 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -3189,7 +3189,7 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula):
     if cat_missing_method == "convert" and formula:
         with pytest.raises(
             NotImplementedError,
-            match="Conversion of missing categoricals with a formula",
+            match="cat_missing_method == 'convert' with a formula is not allowed",
         ):
             model.fit(X, y)
     if cat_missing_method == "fail" and not unseen_missing:

From ab5526c26efac09f5a206b83920415926c96dc5a Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Thu, 11 Jan 2024 10:28:20 +0100
Subject: [PATCH 11/23] remove matches

---
 tests/glm/test_glm.py | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index 22d763be4..4c9df4847 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -3187,15 +3187,10 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula):
         fit_intercept=False,
     )
     if cat_missing_method == "convert" and formula:
-        with pytest.raises(
-            NotImplementedError,
-            match="cat_missing_method == 'convert' with a formula is not allowed",
-        ):
+        with pytest.raises(NotImplementedError):
             model.fit(X, y)
     if cat_missing_method == "fail" and not unseen_missing:
-        with pytest.raises(
-            ValueError, match="Categorical data can't have missing values"
-        ):
+        with pytest.raises(ValueError):
             model.fit(X, y)
     else:
         model.fit(X, y)
@@ -3209,9 +3204,7 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula):
         assert len(model.coef_) == len(feature_names)
 
         if cat_missing_method == "fail" and unseen_missing:
-            with pytest.raises(
-                ValueError, match="Categorical data can't have missing values"
-            ):
+            with pytest.raises(ValueError):
                 model.predict(X_unseen)
         else:
             model.predict(X_unseen)

From ca93be8a5efce577d44403046590790a5c5c379c Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Thu, 11 Jan 2024 10:38:03 +0100
Subject: [PATCH 12/23] fix test branching: use elif after formula check

---
 tests/glm/test_glm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index 4c9df4847..993802493 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -3189,7 +3189,7 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula):
     if cat_missing_method == "convert" and formula:
         with pytest.raises(NotImplementedError):
             model.fit(X, y)
-    if cat_missing_method == "fail" and not unseen_missing:
+    elif cat_missing_method == "fail" and not unseen_missing:
         with pytest.raises(ValueError):
             model.fit(X, y)
     else:

From 5e75f784f07124723bebfa783e20f63a6001a74f Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Thu, 11 Jan 2024 11:40:03 +0100
Subject: [PATCH 13/23] better name

---
 src/glum/_glm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index fe838dafb..8f47bf740 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -883,7 +883,7 @@ def _convert_from_pandas(
         if hasattr(self, "X_model_spec_"):
             return self.X_model_spec_.get_model_matrix(df, context=context)
 
-        cat_missing_method_after_cat_check = self.cat_missing_method
+        cat_missing_method_after_alignment = self.cat_missing_method
 
         if hasattr(self, "feature_dtypes_"):
             df = _align_df_categories(df, self.feature_dtypes_)
@@ -896,13 +896,13 @@ def _convert_from_pandas(
                     categorical_format=self.categorical_format,
                 )
                 # drop categories that were not seen in training
-                cat_missing_method_after_cat_check = "zero"
+                cat_missing_method_after_alignment = "zero"
 
         X = tm.from_pandas(
             df,
             drop_first=self.drop_first,
             categorical_format=self.categorical_format,
-            cat_missing_method=cat_missing_method_after_cat_check,
+            cat_missing_method=cat_missing_method_after_alignment,
         )
 
         return X

From c2d88b23bbace810ce333f45ba0d3e1647da3cfd Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Mon, 15 Jan 2024 09:51:48 +0100
Subject: [PATCH 14/23] describe more restrictive behavior in tutorial

---
 .../formula_interface/formula_interface.ipynb | 137 +-----------------
 1 file changed, 2 insertions(+), 135 deletions(-)

diff --git a/docs/tutorials/formula_interface/formula_interface.ipynb b/docs/tutorials/formula_interface/formula_interface.ipynb
index acdf50ea2..d396adc09 100644
--- a/docs/tutorials/formula_interface/formula_interface.ipynb
+++ b/docs/tutorials/formula_interface/formula_interface.ipynb
@@ -1430,140 +1430,7 @@
    "source": [
     "### Missing Values in Categorical Columns\n",
     "\n",
-    "By default, `glum` raises a `ValueError` when it encounters a missing value in a categorical variable (`\"raise\"` option). However, there are two other options for handling these cases. They can also be treated as if they represented all-zeros indicators (`\"zero\"` option, which is also the way `pandas.get_dummies` works) or missing values can be treated as their own separate category (`\"convert\"` option).\n",
-    "\n",
-    "Similarly to the non-formula-based interface, `glum`'s behavior can be set globally using the `cat_missing_method` parameter during model initialization. However, formulas provide some additional flexibility: the `C` function has a `missing_method` parameter, with which users can select an option on a column-by-column basis. Here is an example of doing that (although our dataset does not have any missing values, so these options have no actual effect in this case):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>intercept</th>\n",
-       "      <th>C(DrivAge, missing_method='zero')[0]</th>\n",
-       "      <th>C(DrivAge, missing_method='zero')[1]</th>\n",
-       "      <th>C(DrivAge, missing_method='zero')[2]</th>\n",
-       "      <th>C(DrivAge, missing_method='zero')[3]</th>\n",
-       "      <th>C(DrivAge, missing_method='zero')[4]</th>\n",
-       "      <th>C(DrivAge, missing_method='zero')[5]</th>\n",
-       "      <th>C(DrivAge, missing_method='zero')[6]</th>\n",
-       "      <th>C(VehPower, missing_method='convert')[4]</th>\n",
-       "      <th>C(VehPower, missing_method='convert')[5]</th>\n",
-       "      <th>C(VehPower, missing_method='convert')[6]</th>\n",
-       "      <th>C(VehPower, missing_method='convert')[7]</th>\n",
-       "      <th>C(VehPower, missing_method='convert')[8]</th>\n",
-       "      <th>C(VehPower, missing_method='convert')[9]</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>coefficient</th>\n",
-       "      <td>0.0</td>\n",
-       "      <td>1.786703</td>\n",
-       "      <td>0.742765</td>\n",
-       "      <td>0.239528</td>\n",
-       "      <td>0.096531</td>\n",
-       "      <td>0.071118</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.201078</td>\n",
-       "      <td>4.637267</td>\n",
-       "      <td>4.679391</td>\n",
-       "      <td>4.863387</td>\n",
-       "      <td>4.77263</td>\n",
-       "      <td>4.749673</td>\n",
-       "      <td>4.970188</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "             intercept  C(DrivAge, missing_method='zero')[0]  \\\n",
-       "coefficient        0.0                              1.786703   \n",
-       "\n",
-       "             C(DrivAge, missing_method='zero')[1]  \\\n",
-       "coefficient                              0.742765   \n",
-       "\n",
-       "             C(DrivAge, missing_method='zero')[2]  \\\n",
-       "coefficient                              0.239528   \n",
-       "\n",
-       "             C(DrivAge, missing_method='zero')[3]  \\\n",
-       "coefficient                              0.096531   \n",
-       "\n",
-       "             C(DrivAge, missing_method='zero')[4]  \\\n",
-       "coefficient                              0.071118   \n",
-       "\n",
-       "             C(DrivAge, missing_method='zero')[5]  \\\n",
-       "coefficient                                   0.0   \n",
-       "\n",
-       "             C(DrivAge, missing_method='zero')[6]  \\\n",
-       "coefficient                              0.201078   \n",
-       "\n",
-       "             C(VehPower, missing_method='convert')[4]  \\\n",
-       "coefficient                                  4.637267   \n",
-       "\n",
-       "             C(VehPower, missing_method='convert')[5]  \\\n",
-       "coefficient                                  4.679391   \n",
-       "\n",
-       "             C(VehPower, missing_method='convert')[6]  \\\n",
-       "coefficient                                  4.863387   \n",
-       "\n",
-       "             C(VehPower, missing_method='convert')[7]  \\\n",
-       "coefficient                                   4.77263   \n",
-       "\n",
-       "             C(VehPower, missing_method='convert')[8]  \\\n",
-       "coefficient                                  4.749673   \n",
-       "\n",
-       "             C(VehPower, missing_method='convert')[9]  \n",
-       "coefficient                                  4.970188  "
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "formula_missing = \"C(DrivAge, missing_method='zero') + C(VehPower, missing_method='convert')\"\n",
-    "\n",
-    "t_glm8 = GeneralizedLinearRegressor(\n",
-    "    family=TweedieDist,\n",
-    "    alpha_search=True,\n",
-    "    l1_ratio=1,\n",
-    "    fit_intercept=False,\n",
-    "    formula=formula_missing,\n",
-    "\n",
-    ")\n",
-    "t_glm8.fit(\n",
-    "    X=df_train, y=df_train[\"PurePremium\"], sample_weight=df[\"Exposure\"].values[train]\n",
-    ")\n",
-    "\n",
-    "pd.DataFrame(\n",
-    "    {\"coefficient\": np.concatenate(([t_glm8.intercept_], t_glm8.coef_))},\n",
-    "    index=[\"intercept\"] + t_glm8.feature_names_,\n",
-    ").T"
+    "By default, `glum` raises a `ValueError` when it encounters a missing value in a categorical variable (`\"raise\"` option). However, there are two other options for handling these cases. They can also be treated as if they represented all-zeros indicators (`\"zero\"` option, which is also the way `pandas.get_dummies` works) or missing values can be treated as their own separate category (`\"convert\"` option). The treatment of missings should be set globally by the `cat_missing_method` parameter during model initialization. The `\"convert\"` option is only valid for the interface without a formula."
    ]
   }
  ],
@@ -1583,7 +1450,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.12.1"
   },
   "orig_nbformat": 4
  },

From fb59cfc02270ccd4aa3c29be5db000141cfe8ce3 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@gmail.com>
Date: Mon, 22 Jan 2024 23:34:04 +0100
Subject: [PATCH 15/23] Raise error on unseen levels when predicting

---
 src/glum/_util.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/glum/_util.py b/src/glum/_util.py
index 24b08f40d..ce734540c 100644
--- a/src/glum/_util.py
+++ b/src/glum/_util.py
@@ -47,6 +47,14 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame:
             changed_dtypes[column] = df[column].cat.set_categories(
                 dtypes[column].categories
             )
+        else:
+            continue
+
+        unseen_categories = set(df[column].unique()) - set(dtypes[column].categories)
+        if unseen_categories:
+            raise ValueError(
+                f"Column {column} contains unseen categories: {unseen_categories}."
+            )
 
     if changed_dtypes:
         df = df.assign(**changed_dtypes)

From 1618707af41de02103ecb937c6ee13bb3aebdde1 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@gmail.com>
Date: Mon, 22 Jan 2024 23:45:49 +0100
Subject: [PATCH 16/23] Allow cat_missing_method='convert' again

---
 .../formula_interface/formula_interface.ipynb | 137 +++++++++++++++++-
 src/glum/_glm.py                              |   9 +-
 2 files changed, 137 insertions(+), 9 deletions(-)

diff --git a/docs/tutorials/formula_interface/formula_interface.ipynb b/docs/tutorials/formula_interface/formula_interface.ipynb
index d396adc09..acdf50ea2 100644
--- a/docs/tutorials/formula_interface/formula_interface.ipynb
+++ b/docs/tutorials/formula_interface/formula_interface.ipynb
@@ -1430,7 +1430,140 @@
    "source": [
     "### Missing Values in Categorical Columns\n",
     "\n",
-    "By default, `glum` raises a `ValueError` when it encounters a missing value in a categorical variable (`\"raise\"` option). However, there are two other options for handling these cases. They can also be treated as if they represented all-zeros indicators (`\"zero\"` option, which is also the way `pandas.get_dummies` works) or missing values can be treated as their own separate category (`\"convert\"` option). The treatment of missings should be set globally by the `cat_missing_method` parameter during model initialization. The `\"convert\"` option is only valid for the interface without a formula."
+    "By default, `glum` raises a `ValueError` when it encounters a missing value in a categorical variable (`\"raise\"` option). However, there are two other options for handling these cases. They can also be treated as if they represented all-zeros indicators (`\"zero\"` option, which is also the way `pandas.get_dummies` works) or missing values can be treated as their own separate category (`\"convert\"` option).\n",
+    "\n",
+    "Similarly to the non-formula-based interface, `glum`'s behavior can be set globally using the `cat_missing_method` parameter during model initialization. However, formulas provide some additional flexibility: the `C` function has a `missing_method` parameter, with which users can select an option on a column-by-column basis. Here is an example of doing that (although our dataset does not have any missing values, so these options have no actual effect in this case):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>intercept</th>\n",
+       "      <th>C(DrivAge, missing_method='zero')[0]</th>\n",
+       "      <th>C(DrivAge, missing_method='zero')[1]</th>\n",
+       "      <th>C(DrivAge, missing_method='zero')[2]</th>\n",
+       "      <th>C(DrivAge, missing_method='zero')[3]</th>\n",
+       "      <th>C(DrivAge, missing_method='zero')[4]</th>\n",
+       "      <th>C(DrivAge, missing_method='zero')[5]</th>\n",
+       "      <th>C(DrivAge, missing_method='zero')[6]</th>\n",
+       "      <th>C(VehPower, missing_method='convert')[4]</th>\n",
+       "      <th>C(VehPower, missing_method='convert')[5]</th>\n",
+       "      <th>C(VehPower, missing_method='convert')[6]</th>\n",
+       "      <th>C(VehPower, missing_method='convert')[7]</th>\n",
+       "      <th>C(VehPower, missing_method='convert')[8]</th>\n",
+       "      <th>C(VehPower, missing_method='convert')[9]</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>coefficient</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.786703</td>\n",
+       "      <td>0.742765</td>\n",
+       "      <td>0.239528</td>\n",
+       "      <td>0.096531</td>\n",
+       "      <td>0.071118</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.201078</td>\n",
+       "      <td>4.637267</td>\n",
+       "      <td>4.679391</td>\n",
+       "      <td>4.863387</td>\n",
+       "      <td>4.77263</td>\n",
+       "      <td>4.749673</td>\n",
+       "      <td>4.970188</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             intercept  C(DrivAge, missing_method='zero')[0]  \\\n",
+       "coefficient        0.0                              1.786703   \n",
+       "\n",
+       "             C(DrivAge, missing_method='zero')[1]  \\\n",
+       "coefficient                              0.742765   \n",
+       "\n",
+       "             C(DrivAge, missing_method='zero')[2]  \\\n",
+       "coefficient                              0.239528   \n",
+       "\n",
+       "             C(DrivAge, missing_method='zero')[3]  \\\n",
+       "coefficient                              0.096531   \n",
+       "\n",
+       "             C(DrivAge, missing_method='zero')[4]  \\\n",
+       "coefficient                              0.071118   \n",
+       "\n",
+       "             C(DrivAge, missing_method='zero')[5]  \\\n",
+       "coefficient                                   0.0   \n",
+       "\n",
+       "             C(DrivAge, missing_method='zero')[6]  \\\n",
+       "coefficient                              0.201078   \n",
+       "\n",
+       "             C(VehPower, missing_method='convert')[4]  \\\n",
+       "coefficient                                  4.637267   \n",
+       "\n",
+       "             C(VehPower, missing_method='convert')[5]  \\\n",
+       "coefficient                                  4.679391   \n",
+       "\n",
+       "             C(VehPower, missing_method='convert')[6]  \\\n",
+       "coefficient                                  4.863387   \n",
+       "\n",
+       "             C(VehPower, missing_method='convert')[7]  \\\n",
+       "coefficient                                   4.77263   \n",
+       "\n",
+       "             C(VehPower, missing_method='convert')[8]  \\\n",
+       "coefficient                                  4.749673   \n",
+       "\n",
+       "             C(VehPower, missing_method='convert')[9]  \n",
+       "coefficient                                  4.970188  "
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "formula_missing = \"C(DrivAge, missing_method='zero') + C(VehPower, missing_method='convert')\"\n",
+    "\n",
+    "t_glm8 = GeneralizedLinearRegressor(\n",
+    "    family=TweedieDist,\n",
+    "    alpha_search=True,\n",
+    "    l1_ratio=1,\n",
+    "    fit_intercept=False,\n",
+    "    formula=formula_missing,\n",
+    "\n",
+    ")\n",
+    "t_glm8.fit(\n",
+    "    X=df_train, y=df_train[\"PurePremium\"], sample_weight=df[\"Exposure\"].values[train]\n",
+    ")\n",
+    "\n",
+    "pd.DataFrame(\n",
+    "    {\"coefficient\": np.concatenate(([t_glm8.intercept_], t_glm8.coef_))},\n",
+    "    index=[\"intercept\"] + t_glm8.feature_names_,\n",
+    ").T"
    ]
   }
  ],
@@ -1450,7 +1583,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.1"
+   "version": "3.11.4"
   },
   "orig_nbformat": 4
  },
diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index 53172bb50..62afc68d6 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -894,8 +894,8 @@ def _convert_from_pandas(
                     cat_missing_name=self.cat_missing_name,
                     categorical_format=self.categorical_format,
                 )
-                # drop categories that were not seen in training
-                cat_missing_method_after_alignment = "zero"
+                # there should be no missing categories after this
+                cat_missing_method_after_alignment = "fail"
 
         X = tm.from_pandas(
             df,
@@ -2650,11 +2650,6 @@ def _set_up_and_check_fit_args(
 
         if isinstance(X, pd.DataFrame):
             if hasattr(self, "formula") and self.formula is not None:
-                if self.cat_missing_method == "convert":
-                    raise NotImplementedError(
-                        "cat_missing_method == 'convert' with a formula "
-                        "is not allowed."
-                    )
 
                 lhs, rhs = _parse_formula(
                     self.formula, include_intercept=self.fit_intercept

From c448f3dc6b3b7ce814224225ee746caf9e27c234 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@gmail.com>
Date: Tue, 23 Jan 2024 02:52:11 +0100
Subject: [PATCH 17/23] Update test

---
 tests/glm/test_glm.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index b19eff744..0829722ed 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -3209,11 +3209,8 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula):
         formula=formula,
         fit_intercept=False,
     )
-    if cat_missing_method == "convert" and formula:
-        with pytest.raises(NotImplementedError):
-            model.fit(X, y)
-    elif cat_missing_method == "fail" and not unseen_missing:
-        with pytest.raises(ValueError):
+    if cat_missing_method == "fail" and not unseen_missing:
+        with pytest.raises(ValueError, match="Categorical data can't have missing values"):
             model.fit(X, y)
     else:
         model.fit(X, y)
@@ -3227,7 +3224,10 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula):
         assert len(model.coef_) == len(feature_names)
 
         if cat_missing_method == "fail" and unseen_missing:
-            with pytest.raises(ValueError):
+            with pytest.raises(ValueError, match="Categorical data can't have missing values"):
+                model.predict(X_unseen)
+        elif cat_missing_method == "convert" and unseen_missing:
+            with pytest.raises(ValueError, match="contains unseen categories"):
                 model.predict(X_unseen)
         else:
             model.predict(X_unseen)

From 046d9ff9ad5558409fa02021269731cec1e6f8c4 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@gmail.com>
Date: Tue, 23 Jan 2024 03:00:13 +0100
Subject: [PATCH 18/23] Check for unseen categories

---
 src/glum/_glm.py  | 12 ++++++++++--
 src/glum/_util.py | 16 ++++++++++++++--
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index 62afc68d6..43f6516ef 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -885,7 +885,12 @@ def _convert_from_pandas(
         cat_missing_method_after_alignment = self.cat_missing_method
 
         if hasattr(self, "feature_dtypes_"):
-            df = _align_df_categories(df, self.feature_dtypes_)
+            df = _align_df_categories(
+                df,
+                self.feature_dtypes_,
+                self.has_missing_category_,
+                self.cat_missing_method,
+            )
             if self.cat_missing_method == "convert":
                 df = _add_missing_categories(
                     df=df,
@@ -2650,7 +2655,6 @@ def _set_up_and_check_fit_args(
 
         if isinstance(X, pd.DataFrame):
             if hasattr(self, "formula") and self.formula is not None:
-
                 lhs, rhs = _parse_formula(
                     self.formula, include_intercept=self.fit_intercept
                 )
@@ -2705,6 +2709,10 @@ def _set_up_and_check_fit_args(
                 # Maybe TODO: expand categorical penalties with formulas
 
                 self.feature_dtypes_ = X.dtypes.to_dict()
+                self.has_missing_category_ = {
+                    col: (self.cat_missing_method == "convert") and X[col].isna().any()
+                    for col in self.feature_dtypes_.keys()
+                }
 
                 if any(X.dtypes == "category"):
 
diff --git a/src/glum/_util.py b/src/glum/_util.py
index ce734540c..f5c463ff3 100644
--- a/src/glum/_util.py
+++ b/src/glum/_util.py
@@ -15,7 +15,9 @@ def _asanyarray(x, **kwargs):
     return x if pd.api.types.is_scalar(x) else np.asanyarray(x, **kwargs)
 
 
-def _align_df_categories(df, dtypes) -> pd.DataFrame:
+def _align_df_categories(
+    df, dtypes, has_missing_category, cat_missing_method
+) -> pd.DataFrame:
     """Align data types for prediction.
 
     This function checks that categorical columns have same categories in the
@@ -26,6 +28,8 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame:
     ----------
     df : pandas.DataFrame
     dtypes : Dict[str, Union[str, type, pandas.core.dtypes.base.ExtensionDtype]]
+    has_missing_category : Dict[str, bool]
+    cat_missing_method : str
     """
     if not isinstance(df, pd.DataFrame):
         raise TypeError(f"Expected `pandas.DataFrame'; got {type(df)}.")
@@ -50,7 +54,15 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame:
         else:
             continue
 
-        unseen_categories = set(df[column].unique()) - set(dtypes[column].categories)
+        if cat_missing_method == "convert" and not has_missing_category[column]:
+            unseen_categories = set(df[column].unique()) - set(
+                dtypes[column].categories
+            )
+        else:
+            unseen_categories = set(df[column].dropna().unique()) - set(
+                dtypes[column].categories
+            )
+
         if unseen_categories:
             raise ValueError(
                 f"Column {column} contains unseen categories: {unseen_categories}."

From 39ce302ed982f504c6ba9d68b06448850a07bcf8 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@gmail.com>
Date: Tue, 23 Jan 2024 03:32:30 +0100
Subject: [PATCH 19/23] Adapt align_df_categories tests to changes

---
 tests/glm/test_utils.py | 88 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 79 insertions(+), 9 deletions(-)

diff --git a/tests/glm/test_utils.py b/tests/glm/test_utils.py
index 36cf988a3..614717502 100644
--- a/tests/glm/test_utils.py
+++ b/tests/glm/test_utils.py
@@ -16,12 +16,15 @@ def df():
             "x5": ["a", "b"],
             "x6": pd.Categorical(["a", "b"]),
             "x7": pd.Categorical(["a", "b"], categories=["b", "a"]),
+            "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
         }
     )
 
 
 def test_align_df_categories_numeric(df):
     dtypes = {column: np.float64 for column in df}
+    has_missing_category = {column: False for column in df}
+    missing_method = "fail"
 
     expected = pd.DataFrame(
         {
@@ -32,33 +35,41 @@ def test_align_df_categories_numeric(df):
             "x5": ["a", "b"],
             "x6": pd.Categorical(["a", "b"]),
             "x7": pd.Categorical(["a", "b"], categories=["b", "a"]),
+            "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
         }
     )
 
-    pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected)
+    pd.testing.assert_frame_equal(
+        _align_df_categories(df, dtypes, has_missing_category, missing_method), expected
+    )
 
 
 def test_align_df_categories_categorical(df):
+    df = df[["x5", "x6", "x7", "x8"]]
     dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df}
+    has_missing_category = {column: False for column in df}
+    missing_method = "fail"
 
     expected = pd.DataFrame(
         {
-            "x1": [np.nan, np.nan],
-            "x2": [np.nan, np.nan],
-            "x3": [np.nan, np.nan],
-            "x4": [np.nan, np.nan],
             "x5": pd.Categorical(["a", "b"]),
             "x6": pd.Categorical(["a", "b"]),
             "x7": pd.Categorical(["a", "b"]),
+            "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
         },
         dtype=pd.CategoricalDtype(["a", "b"]),
     )
 
-    pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected)
+    pd.testing.assert_frame_equal(
+        _align_df_categories(df, dtypes, has_missing_category, missing_method),
+        expected,
+    )
 
 
 def test_align_df_categories_excess_columns(df):
     dtypes = {"x1": np.float64}
+    has_missing_category = {column: False for column in df}
+    missing_method = "fail"
 
     expected = pd.DataFrame(
         {
@@ -69,14 +80,19 @@ def test_align_df_categories_excess_columns(df):
             "x5": ["a", "b"],
             "x6": pd.Categorical(["a", "b"]),
             "x7": pd.Categorical(["a", "b"], categories=["b", "a"]),
+            "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
         }
     )
 
-    pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected)
+    pd.testing.assert_frame_equal(
+        _align_df_categories(df, dtypes, has_missing_category, missing_method), expected
+    )
 
 
 def test_align_df_categories_missing_columns(df):
     dtypes = {"x0": np.float64}
+    has_missing_category = {column: False for column in df}
+    missing_method = "fail"
 
     expected = pd.DataFrame(
         {
@@ -87,15 +103,69 @@ def test_align_df_categories_missing_columns(df):
             "x5": ["a", "b"],
             "x6": pd.Categorical(["a", "b"]),
             "x7": pd.Categorical(["a", "b"], categories=["b", "a"]),
+            "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
         }
     )
 
-    pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected)
+    pd.testing.assert_frame_equal(
+        _align_df_categories(df, dtypes, has_missing_category, missing_method), expected
+    )
+
+
+@pytest.mark.parametrize("has_missings", [False, True])
+def test_align_df_categories_convert(df, has_missings):
+    df = df[["x5", "x6", "x7", "x8"]]
+    dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df}
+    has_missing_category = {column: has_missings for column in df}
+    missing_method = "convert"
+
+    expected = pd.DataFrame(
+        {
+            "x5": pd.Categorical(["a", "b"]),
+            "x6": pd.Categorical(["a", "b"]),
+            "x7": pd.Categorical(["a", "b"]),
+            "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
+        },
+        dtype=pd.CategoricalDtype(["a", "b"]),
+    )
+
+    if has_missings:
+        pd.testing.assert_frame_equal(
+            _align_df_categories(
+                df[["x5", "x6", "x7", "x8"]],
+                dtypes,
+                has_missing_category,
+                missing_method,
+            ),
+            expected,
+        )
+    else:
+        with pytest.raises(ValueError, match="contains unseen categories"):
+            _align_df_categories(
+                df[["x5", "x6", "x7", "x8"]],
+                dtypes,
+                has_missing_category,
+                missing_method,
+            )
+
+
+def test_align_df_categories_raise_on_unseen(df):
+    dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df}
+    has_missing_category = {column: False for column in df}
+    missing_method = "fail"
+
+    with pytest.raises(ValueError, match="contains unseen categories"):
+        _align_df_categories(
+            df,
+            dtypes,
+            has_missing_category,
+            missing_method,
+        )
 
 
 def test_align_df_categories_not_df():
     with pytest.raises(TypeError):
-        _align_df_categories(np.array([[0], [1]]), {"x0": np.float64})
+        _align_df_categories(np.array([[0], [1]]), {"x0": np.float64}, {}, "fail")
 
 
 @pytest.fixture()

From 099f362fd3feaf9d208cc62073dfcd18f3f5f058 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@gmail.com>
Date: Tue, 23 Jan 2024 04:11:38 +0100
Subject: [PATCH 20/23] Make pre-commit happy

---
 tests/glm/test_glm.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index 0829722ed..e9d2bb3a7 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -3210,7 +3210,9 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula):
         fit_intercept=False,
     )
     if cat_missing_method == "fail" and not unseen_missing:
-        with pytest.raises(ValueError, match="Categorical data can't have missing values"):
+        with pytest.raises(
+            ValueError, match="Categorical data can't have missing values"
+        ):
             model.fit(X, y)
     else:
         model.fit(X, y)
@@ -3224,7 +3226,9 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula):
         assert len(model.coef_) == len(feature_names)
 
         if cat_missing_method == "fail" and unseen_missing:
-            with pytest.raises(ValueError, match="Categorical data can't have missing values"):
+            with pytest.raises(
+                ValueError, match="Categorical data can't have missing values"
+            ):
                 model.predict(X_unseen)
         elif cat_missing_method == "convert" and unseen_missing:
             with pytest.raises(ValueError, match="contains unseen categories"):

From 056bf6851f64427735c9e1e1d698d3c772149184 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@gmail.com>
Date: Thu, 25 Jan 2024 14:27:18 +0100
Subject: [PATCH 21/23] Avoid unnecessary work

---
 src/glum/_glm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index 43f6516ef..33afb37ff 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -2711,7 +2711,8 @@ def _set_up_and_check_fit_args(
                 self.feature_dtypes_ = X.dtypes.to_dict()
                 self.has_missing_category_ = {
                     col: (self.cat_missing_method == "convert") and X[col].isna().any()
-                    for col in self.feature_dtypes_.keys()
+                    for col, dtype in self.feature_dtypes_.items()
+                    if isinstance(dtype, pd.CategoricalDtype)
                 }
 
                 if any(X.dtypes == "category"):

From 0b666ee330d3716c641223528366878881018df0 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@gmail.com>
Date: Mon, 29 Jan 2024 14:35:44 +0100
Subject: [PATCH 22/23] Correctly expand penalties with categoricals and
 `cat_missing_method="convert"` (#753)

* Correctly expand penalties when cat_missing_method=convert

* Add test

* Improve variable names

Co-authored-by: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com>

---------

Co-authored-by: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com>
---
 src/glum/_glm.py      | 22 +++++++++++++++++-----
 tests/glm/test_glm.py | 39 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index 33afb37ff..ca26e4e6a 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -2717,7 +2717,9 @@ def _set_up_and_check_fit_args(
 
                 if any(X.dtypes == "category"):
 
-                    def _expand_categorical_penalties(penalty, X, drop_first):
+                    def _expand_categorical_penalties(
+                        penalty, X, drop_first, has_missing_category
+                    ):
                         """
                         If P1 or P2 has the same shape as X before expanding the
                         categoricals, we assume that the penalty at the location of
@@ -2741,19 +2743,29 @@ def _expand_categorical_penalties(penalty, X, drop_first):
                                     chain.from_iterable(
                                         [
                                             elmt
-                                            for _ in dtype.categories[int(drop_first) :]
+                                            for _ in range(
+                                                len(dtype.categories)
+                                                + has_missing_category[col]
+                                                - drop_first
+                                            )
                                         ]
                                         if pd.api.types.is_categorical_dtype(dtype)
                                         else [elmt]
-                                        for elmt, dtype in zip(penalty, X.dtypes)
+                                        for elmt, (col, dtype) in zip(
+                                            penalty, X.dtypes.items()
+                                        )
                                     )
                                 )
                             )
                         else:
                             return penalty
 
-                    P1 = _expand_categorical_penalties(self.P1, X, self.drop_first)
-                    P2 = _expand_categorical_penalties(self.P2, X, self.drop_first)
+                    P1 = _expand_categorical_penalties(
+                        self.P1, X, self.drop_first, self.has_missing_category_
+                    )
+                    P2 = _expand_categorical_penalties(
+                        self.P2, X, self.drop_first, self.has_missing_category_
+                    )
 
                 X = tm.from_pandas(
                     X,
diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index e9d2bb3a7..469f464e9 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -53,7 +53,7 @@
 
 
 def get_small_x_y(
-    estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV]
+    estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV],
 ) -> tuple[np.ndarray, np.ndarray]:
     if isinstance(estimator, GeneralizedLinearRegressor):
         n_rows = 1
@@ -362,6 +362,43 @@ def test_P1_P2_expansion_with_categoricals():
     np.testing.assert_allclose(mdl1.coef_, mdl2.coef_)
 
 
+def test_P1_P2_expansion_with_categoricals_missings():
+    rng = np.random.default_rng(42)
+    X = pd.DataFrame(
+        data={
+            "dense": np.linspace(0, 10, 60),
+            "cat": pd.Categorical(rng.integers(5, size=60)).remove_categories(0),
+        }
+    )
+    y = rng.normal(size=60)
+
+    mdl1 = GeneralizedLinearRegressor(
+        l1_ratio=0.01,
+        P1=[1, 2, 2, 2, 2, 2],
+        P2=[2, 1, 1, 1, 1, 1],
+        cat_missing_method="convert",
+    )
+    mdl1.fit(X, y)
+
+    mdl2 = GeneralizedLinearRegressor(
+        l1_ratio=0.01,
+        P1=[1, 2],
+        P2=[2, 1],
+        cat_missing_method="convert",
+    )
+    mdl2.fit(X, y)
+    np.testing.assert_allclose(mdl1.coef_, mdl2.coef_)
+
+    mdl3 = GeneralizedLinearRegressor(
+        l1_ratio=0.01,
+        P1=[1, 2],
+        P2=sparse.diags([2, 1, 1, 1, 1, 1]),
+        cat_missing_method="convert",
+    )
+    mdl3.fit(X, y)
+    np.testing.assert_allclose(mdl1.coef_, mdl3.coef_)
+
+
 @pytest.mark.parametrize(
     "estimator", [GeneralizedLinearRegressor, GeneralizedLinearRegressorCV]
 )

From 2fcbc9b53403fc9c669b4f14d69aeb54af9ea01d Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Mon, 29 Jan 2024 14:42:19 +0100
Subject: [PATCH 23/23] bump tabmat pre-release version

---
 conda.recipe/meta.yaml | 2 +-
 environment.yml        | 2 +-
 setup.py               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml
index 938db6d96..35218f7c4 100644
--- a/conda.recipe/meta.yaml
+++ b/conda.recipe/meta.yaml
@@ -36,7 +36,7 @@ requirements:
     - scikit-learn >=0.23
     - scipy
     - formulaic >=0.6
-    - tabmat >=4.0.0a
+    - tabmat >=4.0.0a3
 
 test:
   requires:
diff --git a/environment.yml b/environment.yml
index f621d424c..d0d7d1725 100644
--- a/environment.yml
+++ b/environment.yml
@@ -9,7 +9,7 @@ dependencies:
   - libblas>=0=*mkl  # comment this line out for macOS arm64
   - numexpr
   - pandas>=0.21
-  - tabmat>=4.0.0a
+  - tabmat>=4.0.0a3
   - scikit-learn>=0.23
   - scipy
   - tqdm
diff --git a/setup.py b/setup.py
index cf21ad773..515c68c26 100644
--- a/setup.py
+++ b/setup.py
@@ -87,7 +87,7 @@
         "scikit-learn>=0.23",
         "scipy",
         "formulaic>=0.6",
-        "tabmat>=4.0.0a",
+        "tabmat>=4.0.0a3",
     ],
     entry_points=None
     if os.environ.get("CONDA_BUILD")