Informative error when encountering categories that were not seen in …

…training (#748) * drop missings not seen in training * zero not drop * better (?) name [skip ci] * catch case of unseen missings and fail method * fix * respect categorical missing method with formula; test different categorical missing methods also with formula * shorten the tests * don't allow fitting in case of conversion of categoricals and presence of formula * clearer error msg * also change the error msg in the regex (facepalm) * remove matches * fix * better name * describe more restrictive behavior in tutorial * Raise error on unseen levels when predicting * Allow cat_missing_method='convert' again * Update test * Check for unseen categories * Adapt align_df_categories tests to changes * Make pre-commit happy * Avoid unnecessary work * Correctly expand penalties with categoricals and `cat_missing_method="convert"` (#753) * Correctly expand penalties when cat_missing_method=convert * Add test * Improve variable names Co-authored-by: Matthias Schmidtblaicher <[email protected]> --------- Co-authored-by: Matthias Schmidtblaicher <[email protected]> * bump tabmat pre-release version --------- Co-authored-by: Martin Stancsics <[email protected]>
Quantco · Jan 29, 2024 · 1ad8be2 · 1ad8be2
1 parent 6b2b844
commit 1ad8be2
Show file tree

Hide file tree

Showing 7 changed files with 197 additions and 29 deletions.
diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml
@@ -36,7 +36,7 @@ requirements:
     - scikit-learn >=0.23
     - scipy
     - formulaic >=0.6
-    - tabmat >=4.0.0a
+    - tabmat >=4.0.0a3
 
 test:
   requires:

diff --git a/environment.yml b/environment.yml
@@ -9,7 +9,7 @@ dependencies:
   - libblas>=0=*mkl  # comment this line out for macOS arm64
   - numexpr
   - pandas>=0.21
-  - tabmat>=4.0.0a
+  - tabmat>=4.0.0a3
   - scikit-learn>=0.23
   - scipy
   - tqdm

diff --git a/setup.py b/setup.py
@@ -87,7 +87,7 @@
         "scikit-learn>=0.23",
         "scipy",
         "formulaic>=0.6",
-        "tabmat>=4.0.0a",
+        "tabmat>=4.0.0a3",
     ],
     entry_points=None
     if os.environ.get("CONDA_BUILD")

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
@@ -879,12 +879,18 @@ def _convert_from_pandas(
         self, df: pd.DataFrame, context: Optional[Mapping[str, Any]] = None
     ) -> tm.MatrixBase:
         """Convert a pandas data frame to a tabmat matrix."""
-
         if hasattr(self, "X_model_spec_"):
             return self.X_model_spec_.get_model_matrix(df, context=context)
 
+        cat_missing_method_after_alignment = self.cat_missing_method
+
         if hasattr(self, "feature_dtypes_"):
-            df = _align_df_categories(df, self.feature_dtypes_)
+            df = _align_df_categories(
+                df,
+                self.feature_dtypes_,
+                self.has_missing_category_,
+                self.cat_missing_method,
+            )
             if self.cat_missing_method == "convert":
                 df = _add_missing_categories(
                     df=df,
@@ -893,12 +899,14 @@ def _convert_from_pandas(
                     cat_missing_name=self.cat_missing_name,
                     categorical_format=self.categorical_format,
                 )
+                # there should be no missing categories after this
+                cat_missing_method_after_alignment = "fail"
 
         X = tm.from_pandas(
             df,
             drop_first=self.drop_first,
             categorical_format=self.categorical_format,
-            cat_missing_method=self.cat_missing_method,
+            cat_missing_method=cat_missing_method_after_alignment,
         )
 
         return X
@@ -2674,6 +2682,7 @@ def _set_up_and_check_fit_args(
                     include_intercept=False,
                     ensure_full_rank=self.drop_first,
                     categorical_format=self.categorical_format,
+                    cat_missing_method=self.cat_missing_method,
                     interaction_separator=self.interaction_separator,
                     add_column_for_intercept=False,
                     context=context,
@@ -2700,10 +2709,17 @@ def _set_up_and_check_fit_args(
                 # Maybe TODO: expand categorical penalties with formulas
 
                 self.feature_dtypes_ = X.dtypes.to_dict()
+                self.has_missing_category_ = {
+                    col: (self.cat_missing_method == "convert") and X[col].isna().any()
+                    for col, dtype in self.feature_dtypes_.items()
+                    if isinstance(dtype, pd.CategoricalDtype)
+                }
 
                 if any(X.dtypes == "category"):
 
-                    def _expand_categorical_penalties(penalty, X, drop_first):
+                    def _expand_categorical_penalties(
+                        penalty, X, drop_first, has_missing_category
+                    ):
                         """
                         If P1 or P2 has the same shape as X before expanding the
                         categoricals, we assume that the penalty at the location of
@@ -2727,19 +2743,29 @@ def _expand_categorical_penalties(penalty, X, drop_first):
                                     chain.from_iterable(
                                         [
                                             elmt
-                                            for _ in dtype.categories[int(drop_first) :]
+                                            for _ in range(
+                                                len(dtype.categories)
+                                                + has_missing_category[col]
+                                                - drop_first
+                                            )
                                         ]
                                         if pd.api.types.is_categorical_dtype(dtype)
                                         else [elmt]
-                                        for elmt, dtype in zip(penalty, X.dtypes)
+                                        for elmt, (col, dtype) in zip(
+                                            penalty, X.dtypes.items()
+                                        )
                                     )
                                 )
                             )
                         else:
                             return penalty
 
-                    P1 = _expand_categorical_penalties(self.P1, X, self.drop_first)
-                    P2 = _expand_categorical_penalties(self.P2, X, self.drop_first)
+                    P1 = _expand_categorical_penalties(
+                        self.P1, X, self.drop_first, self.has_missing_category_
+                    )
+                    P2 = _expand_categorical_penalties(
+                        self.P2, X, self.drop_first, self.has_missing_category_
+                    )
 
                 X = tm.from_pandas(
                     X,

diff --git a/src/glum/_util.py b/src/glum/_util.py
@@ -15,7 +15,9 @@ def _asanyarray(x, **kwargs):
     return x if pd.api.types.is_scalar(x) else np.asanyarray(x, **kwargs)
 
 
-def _align_df_categories(df, dtypes) -> pd.DataFrame:
+def _align_df_categories(
+    df, dtypes, has_missing_category, cat_missing_method
+) -> pd.DataFrame:
     """Align data types for prediction.
 
     This function checks that categorical columns have same categories in the
@@ -26,6 +28,8 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame:
     ----------
     df : pandas.DataFrame
     dtypes : Dict[str, Union[str, type, pandas.core.dtypes.base.ExtensionDtype]]
+    has_missing_category : Dict[str, bool]
+    cat_missing_method : str
     """
     if not isinstance(df, pd.DataFrame):
         raise TypeError(f"Expected `pandas.DataFrame'; got {type(df)}.")
@@ -47,6 +51,22 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame:
             changed_dtypes[column] = df[column].cat.set_categories(
                 dtypes[column].categories
             )
+        else:
+            continue
+
+        if cat_missing_method == "convert" and not has_missing_category[column]:
+            unseen_categories = set(df[column].unique()) - set(
+                dtypes[column].categories
+            )
+        else:
+            unseen_categories = set(df[column].dropna().unique()) - set(
+                dtypes[column].categories
+            )
+
+        if unseen_categories:
+            raise ValueError(
+                f"Column {column} contains unseen categories: {unseen_categories}."
+            )
 
     if changed_dtypes:
         df = df.assign(**changed_dtypes)

diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
@@ -53,7 +53,7 @@
 
 
 def get_small_x_y(
-    estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV]
+    estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV],
 ) -> tuple[np.ndarray, np.ndarray]:
     if isinstance(estimator, GeneralizedLinearRegressor):
         n_rows = 1
@@ -362,6 +362,43 @@ def test_P1_P2_expansion_with_categoricals():
     np.testing.assert_allclose(mdl1.coef_, mdl2.coef_)
 
 
+def test_P1_P2_expansion_with_categoricals_missings():
+    rng = np.random.default_rng(42)
+    X = pd.DataFrame(
+        data={
+            "dense": np.linspace(0, 10, 60),
+            "cat": pd.Categorical(rng.integers(5, size=60)).remove_categories(0),
+        }
+    )
+    y = rng.normal(size=60)
+
+    mdl1 = GeneralizedLinearRegressor(
+        l1_ratio=0.01,
+        P1=[1, 2, 2, 2, 2, 2],
+        P2=[2, 1, 1, 1, 1, 1],
+        cat_missing_method="convert",
+    )
+    mdl1.fit(X, y)
+
+    mdl2 = GeneralizedLinearRegressor(
+        l1_ratio=0.01,
+        P1=[1, 2],
+        P2=[2, 1],
+        cat_missing_method="convert",
+    )
+    mdl2.fit(X, y)
+    np.testing.assert_allclose(mdl1.coef_, mdl2.coef_)
+
+    mdl3 = GeneralizedLinearRegressor(
+        l1_ratio=0.01,
+        P1=[1, 2],
+        P2=sparse.diags([2, 1, 1, 1, 1, 1]),
+        cat_missing_method="convert",
+    )
+    mdl3.fit(X, y)
+    np.testing.assert_allclose(mdl1.coef_, mdl3.coef_)
+
+
 @pytest.mark.parametrize(
     "estimator", [GeneralizedLinearRegressor, GeneralizedLinearRegressorCV]
 )
@@ -3183,40 +3220,55 @@ def test_formula_predict(get_mixed_data, formula, fit_intercept):
 
 
 @pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"])
-def test_cat_missing(cat_missing_method):
+@pytest.mark.parametrize("unseen_missing", [False, True])
+@pytest.mark.parametrize("formula", [None, "cat_1 + cat_2"])
+def test_cat_missing(cat_missing_method, unseen_missing, formula):
     X = pd.DataFrame(
         {
             "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]),
             "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]),
         }
     )
+    if unseen_missing:
+        X = X.dropna()
     X_unseen = pd.DataFrame(
         {
             "cat_1": pd.Categorical([1, pd.NA]),
             "cat_2": pd.Categorical([1, 2]),
         }
     )
-    y = np.array([1, 2, 3, 4, 5])
+    y = np.array(X.index)
 
     model = GeneralizedLinearRegressor(
         family="normal",
         cat_missing_method=cat_missing_method,
         drop_first=False,
+        formula=formula,
         fit_intercept=False,
     )
-
-    if cat_missing_method == "fail":
-        with pytest.raises(ValueError):
+    if cat_missing_method == "fail" and not unseen_missing:
+        with pytest.raises(
+            ValueError, match="Categorical data can't have missing values"
+        ):
             model.fit(X, y)
     else:
         model.fit(X, y)
         feature_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"]
 
-        if cat_missing_method == "convert":
+        if cat_missing_method == "convert" and not unseen_missing:
             feature_names.insert(2, "cat_1[(MISSING)]")
             feature_names.append("cat_2[(MISSING)]")
 
         np.testing.assert_array_equal(model.feature_names_, feature_names)
         assert len(model.coef_) == len(feature_names)
 
-        model.predict(X_unseen)
+        if cat_missing_method == "fail" and unseen_missing:
+            with pytest.raises(
+                ValueError, match="Categorical data can't have missing values"
+            ):
+                model.predict(X_unseen)
+        elif cat_missing_method == "convert" and unseen_missing:
+            with pytest.raises(ValueError, match="contains unseen categories"):
+                model.predict(X_unseen)
+        else:
+            model.predict(X_unseen)