Quantco · MatthiasSchmidtblaicherQC · Jan 29, 2024 · Jan 10, 2024 · Jan 10, 2024 · Jan 10, 2024
diff --git a/src/glum/_glm.py b/src/glum/_glm.py
@@ -879,12 +879,18 @@ def _convert_from_pandas(
         self, df: pd.DataFrame, context: Optional[Mapping[str, Any]] = None
     ) -> tm.MatrixBase:
         """Convert a pandas data frame to a tabmat matrix."""
-
         if hasattr(self, "X_model_spec_"):
             return self.X_model_spec_.get_model_matrix(df, context=context)
 
+        cat_missing_method_after_alignment = self.cat_missing_method
+
         if hasattr(self, "feature_dtypes_"):
-            df = _align_df_categories(df, self.feature_dtypes_)
+            df = _align_df_categories(
+                df,
+                self.feature_dtypes_,
+                self.has_missing_category_,
+                self.cat_missing_method,
+            )
             if self.cat_missing_method == "convert":
                 df = _add_missing_categories(
                     df=df,
@@ -893,12 +899,14 @@ def _convert_from_pandas(
                     cat_missing_name=self.cat_missing_name,
                     categorical_format=self.categorical_format,
                 )
+                # there should be no missing categories after this
+                cat_missing_method_after_alignment = "fail"
 
         X = tm.from_pandas(
             df,
             drop_first=self.drop_first,
             categorical_format=self.categorical_format,
-            cat_missing_method=self.cat_missing_method,
+            cat_missing_method=cat_missing_method_after_alignment,
         )
 
         return X
@@ -2674,6 +2682,7 @@ def _set_up_and_check_fit_args(
                     include_intercept=False,
                     ensure_full_rank=self.drop_first,
                     categorical_format=self.categorical_format,
+                    cat_missing_method=self.cat_missing_method,
                     interaction_separator=self.interaction_separator,
                     add_column_for_intercept=False,
                     context=context,
@@ -2700,6 +2709,10 @@ def _set_up_and_check_fit_args(
                 # Maybe TODO: expand categorical penalties with formulas
 
                 self.feature_dtypes_ = X.dtypes.to_dict()
+                self.has_missing_category_ = {
+                    col: (self.cat_missing_method == "convert") and X[col].isna().any()
+                    for col in self.feature_dtypes_.keys()
+                }
 
                 if any(X.dtypes == "category"):
 

diff --git a/src/glum/_util.py b/src/glum/_util.py
@@ -15,7 +15,9 @@ def _asanyarray(x, **kwargs):
     return x if pd.api.types.is_scalar(x) else np.asanyarray(x, **kwargs)
 
 
-def _align_df_categories(df, dtypes) -> pd.DataFrame:
+def _align_df_categories(
+    df, dtypes, has_missing_category, cat_missing_method
+) -> pd.DataFrame:
     """Align data types for prediction.
 
     This function checks that categorical columns have same categories in the
@@ -26,6 +28,8 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame:
     ----------
     df : pandas.DataFrame
     dtypes : Dict[str, Union[str, type, pandas.core.dtypes.base.ExtensionDtype]]
+    has_missing_category : Dict[str, bool]
+    cat_missing_method : str
     """
     if not isinstance(df, pd.DataFrame):
         raise TypeError(f"Expected `pandas.DataFrame'; got {type(df)}.")
@@ -47,6 +51,22 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame:
             changed_dtypes[column] = df[column].cat.set_categories(
                 dtypes[column].categories
             )
+        else:
+            continue
+
+        if cat_missing_method == "convert" and not has_missing_category[column]:
+            unseen_categories = set(df[column].unique()) - set(
+                dtypes[column].categories
+            )
+        else:
+            unseen_categories = set(df[column].dropna().unique()) - set(
+                dtypes[column].categories
+            )
+
+        if unseen_categories:
+            raise ValueError(
+                f"Column {column} contains unseen categories: {unseen_categories}."
+            )
 
     if changed_dtypes:
         df = df.assign(**changed_dtypes)

diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
@@ -3183,40 +3183,55 @@ def test_formula_predict(get_mixed_data, formula, fit_intercept):
 
 
 @pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"])
-def test_cat_missing(cat_missing_method):
+@pytest.mark.parametrize("unseen_missing", [False, True])
+@pytest.mark.parametrize("formula", [None, "cat_1 + cat_2"])
+def test_cat_missing(cat_missing_method, unseen_missing, formula):
     X = pd.DataFrame(
         {
             "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]),
             "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]),
         }
     )
+    if unseen_missing:
+        X = X.dropna()
     X_unseen = pd.DataFrame(
         {
             "cat_1": pd.Categorical([1, pd.NA]),
             "cat_2": pd.Categorical([1, 2]),
         }
     )
-    y = np.array([1, 2, 3, 4, 5])
+    y = np.array(X.index)
 
     model = GeneralizedLinearRegressor(
         family="normal",
         cat_missing_method=cat_missing_method,
         drop_first=False,
+        formula=formula,
         fit_intercept=False,
     )
-
-    if cat_missing_method == "fail":
-        with pytest.raises(ValueError):
+    if cat_missing_method == "fail" and not unseen_missing:
+        with pytest.raises(
+            ValueError, match="Categorical data can't have missing values"
+        ):
             model.fit(X, y)
     else:
         model.fit(X, y)
         feature_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"]
 
-        if cat_missing_method == "convert":
+        if cat_missing_method == "convert" and not unseen_missing:
             feature_names.insert(2, "cat_1[(MISSING)]")
             feature_names.append("cat_2[(MISSING)]")
 
         np.testing.assert_array_equal(model.feature_names_, feature_names)
         assert len(model.coef_) == len(feature_names)
 
-        model.predict(X_unseen)
+        if cat_missing_method == "fail" and unseen_missing:
+            with pytest.raises(
+                ValueError, match="Categorical data can't have missing values"
+            ):
+                model.predict(X_unseen)
+        elif cat_missing_method == "convert" and unseen_missing:
+            with pytest.raises(ValueError, match="contains unseen categories"):
+                model.predict(X_unseen)
+        else:
+            model.predict(X_unseen)
diff --git a/tests/glm/test_utils.py b/tests/glm/test_utils.py
@@ -16,12 +16,15 @@ def df():
             "x5": ["a", "b"],
             "x6": pd.Categorical(["a", "b"]),
             "x7": pd.Categorical(["a", "b"], categories=["b", "a"]),
+            "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
         }
     )
 
 
 def test_align_df_categories_numeric(df):
     dtypes = {column: np.float64 for column in df}
+    has_missing_category = {column: False for column in df}
+    missing_method = "fail"
 
     expected = pd.DataFrame(
         {
@@ -32,33 +35,41 @@ def test_align_df_categories_numeric(df):
             "x5": ["a", "b"],
             "x6": pd.Categorical(["a", "b"]),
             "x7": pd.Categorical(["a", "b"], categories=["b", "a"]),
+            "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
         }
     )
 
-    pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected)
+    pd.testing.assert_frame_equal(
+        _align_df_categories(df, dtypes, has_missing_category, missing_method), expected
+    )
 
 
 def test_align_df_categories_categorical(df):
+    df = df[["x5", "x6", "x7", "x8"]]
     dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df}
+    has_missing_category = {column: False for column in df}
+    missing_method = "fail"
 
     expected = pd.DataFrame(
         {
-            "x1": [np.nan, np.nan],
-            "x2": [np.nan, np.nan],
-            "x3": [np.nan, np.nan],
-            "x4": [np.nan, np.nan],
             "x5": pd.Categorical(["a", "b"]),
             "x6": pd.Categorical(["a", "b"]),
             "x7": pd.Categorical(["a", "b"]),
+            "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
         },
         dtype=pd.CategoricalDtype(["a", "b"]),
     )
 
-    pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected)
+    pd.testing.assert_frame_equal(
+        _align_df_categories(df, dtypes, has_missing_category, missing_method),
+        expected,
+    )
 
 
 def test_align_df_categories_excess_columns(df):
     dtypes = {"x1": np.float64}
+    has_missing_category = {column: False for column in df}
+    missing_method = "fail"
 
     expected = pd.DataFrame(
         {
@@ -69,14 +80,19 @@ def test_align_df_categories_excess_columns(df):
             "x5": ["a", "b"],
             "x6": pd.Categorical(["a", "b"]),
             "x7": pd.Categorical(["a", "b"], categories=["b", "a"]),
+            "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
         }
     )
 
-    pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected)
+    pd.testing.assert_frame_equal(
+        _align_df_categories(df, dtypes, has_missing_category, missing_method), expected
+    )
 
 
 def test_align_df_categories_missing_columns(df):
     dtypes = {"x0": np.float64}
+    has_missing_category = {column: False for column in df}
+    missing_method = "fail"
 
     expected = pd.DataFrame(
         {
@@ -87,15 +103,69 @@ def test_align_df_categories_missing_columns(df):
             "x5": ["a", "b"],
             "x6": pd.Categorical(["a", "b"]),
             "x7": pd.Categorical(["a", "b"], categories=["b", "a"]),
+            "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
         }
     )
 
-    pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected)
+    pd.testing.assert_frame_equal(
+        _align_df_categories(df, dtypes, has_missing_category, missing_method), expected
+    )
+
+
+@pytest.mark.parametrize("has_missings", [False, True])
+def test_align_df_categories_convert(df, has_missings):
+    df = df[["x5", "x6", "x7", "x8"]]
+    dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df}
+    has_missing_category = {column: has_missings for column in df}
+    missing_method = "convert"
+
+    expected = pd.DataFrame(
+        {
+            "x5": pd.Categorical(["a", "b"]),
+            "x6": pd.Categorical(["a", "b"]),
+            "x7": pd.Categorical(["a", "b"]),
+            "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
+        },
+        dtype=pd.CategoricalDtype(["a", "b"]),
+    )
+
+    if has_missings:
+        pd.testing.assert_frame_equal(
+            _align_df_categories(
+                df[["x5", "x6", "x7", "x8"]],
+                dtypes,
+                has_missing_category,
+                missing_method,
+            ),
+            expected,
+        )
+    else:
+        with pytest.raises(ValueError, match="contains unseen categories"):
+            _align_df_categories(
+                df[["x5", "x6", "x7", "x8"]],
+                dtypes,
+                has_missing_category,
+                missing_method,
+            )
+
+
+def test_align_df_categories_raise_on_unseen(df):
+    dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df}
+    has_missing_category = {column: False for column in df}
+    missing_method = "fail"
+
+    with pytest.raises(ValueError, match="contains unseen categories"):
+        _align_df_categories(
+            df,
+            dtypes,
+            has_missing_category,
+            missing_method,
+        )
 
 
 def test_align_df_categories_not_df():
     with pytest.raises(TypeError):
-        _align_df_categories(np.array([[0], [1]]), {"x0": np.float64})
+        _align_df_categories(np.array([[0], [1]]), {"x0": np.float64}, {}, "fail")
 
 
 @pytest.fixture()