diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 938db6d9..35218f7c 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -36,7 +36,7 @@ requirements: - scikit-learn >=0.23 - scipy - formulaic >=0.6 - - tabmat >=4.0.0a + - tabmat >=4.0.0a3 test: requires: diff --git a/environment.yml b/environment.yml index f621d424..d0d7d172 100644 --- a/environment.yml +++ b/environment.yml @@ -9,7 +9,7 @@ dependencies: - libblas>=0=*mkl # comment this line out for macOS arm64 - numexpr - pandas>=0.21 - - tabmat>=4.0.0a + - tabmat>=4.0.0a3 - scikit-learn>=0.23 - scipy - tqdm diff --git a/setup.py b/setup.py index cf21ad77..515c68c2 100644 --- a/setup.py +++ b/setup.py @@ -87,7 +87,7 @@ "scikit-learn>=0.23", "scipy", "formulaic>=0.6", - "tabmat>=4.0.0a", + "tabmat>=4.0.0a3", ], entry_points=None if os.environ.get("CONDA_BUILD") diff --git a/src/glum/_glm.py b/src/glum/_glm.py index d5134d8b..ca26e4e6 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -879,12 +879,18 @@ def _convert_from_pandas( self, df: pd.DataFrame, context: Optional[Mapping[str, Any]] = None ) -> tm.MatrixBase: """Convert a pandas data frame to a tabmat matrix.""" - if hasattr(self, "X_model_spec_"): return self.X_model_spec_.get_model_matrix(df, context=context) + cat_missing_method_after_alignment = self.cat_missing_method + if hasattr(self, "feature_dtypes_"): - df = _align_df_categories(df, self.feature_dtypes_) + df = _align_df_categories( + df, + self.feature_dtypes_, + self.has_missing_category_, + self.cat_missing_method, + ) if self.cat_missing_method == "convert": df = _add_missing_categories( df=df, @@ -893,12 +899,14 @@ def _convert_from_pandas( cat_missing_name=self.cat_missing_name, categorical_format=self.categorical_format, ) + # there should be no missing categories after this + cat_missing_method_after_alignment = "fail" X = tm.from_pandas( df, drop_first=self.drop_first, categorical_format=self.categorical_format, - 
cat_missing_method=self.cat_missing_method, + cat_missing_method=cat_missing_method_after_alignment, ) return X @@ -2674,6 +2682,7 @@ def _set_up_and_check_fit_args( include_intercept=False, ensure_full_rank=self.drop_first, categorical_format=self.categorical_format, + cat_missing_method=self.cat_missing_method, interaction_separator=self.interaction_separator, add_column_for_intercept=False, context=context, @@ -2700,10 +2709,17 @@ def _set_up_and_check_fit_args( # Maybe TODO: expand categorical penalties with formulas self.feature_dtypes_ = X.dtypes.to_dict() + self.has_missing_category_ = { + col: (self.cat_missing_method == "convert") and X[col].isna().any() + for col, dtype in self.feature_dtypes_.items() + if isinstance(dtype, pd.CategoricalDtype) + } if any(X.dtypes == "category"): - def _expand_categorical_penalties(penalty, X, drop_first): + def _expand_categorical_penalties( + penalty, X, drop_first, has_missing_category + ): """ If P1 or P2 has the same shape as X before expanding the categoricals, we assume that the penalty at the location of @@ -2727,19 +2743,29 @@ def _expand_categorical_penalties(penalty, X, drop_first): chain.from_iterable( [ elmt - for _ in dtype.categories[int(drop_first) :] + for _ in range( + len(dtype.categories) + + has_missing_category[col] + - drop_first + ) ] if pd.api.types.is_categorical_dtype(dtype) else [elmt] - for elmt, dtype in zip(penalty, X.dtypes) + for elmt, (col, dtype) in zip( + penalty, X.dtypes.items() + ) ) ) ) else: return penalty - P1 = _expand_categorical_penalties(self.P1, X, self.drop_first) - P2 = _expand_categorical_penalties(self.P2, X, self.drop_first) + P1 = _expand_categorical_penalties( + self.P1, X, self.drop_first, self.has_missing_category_ + ) + P2 = _expand_categorical_penalties( + self.P2, X, self.drop_first, self.has_missing_category_ + ) X = tm.from_pandas( X, diff --git a/src/glum/_util.py b/src/glum/_util.py index 24b08f40..f5c463ff 100644 --- a/src/glum/_util.py +++ 
b/src/glum/_util.py @@ -15,7 +15,9 @@ def _asanyarray(x, **kwargs): return x if pd.api.types.is_scalar(x) else np.asanyarray(x, **kwargs) -def _align_df_categories(df, dtypes) -> pd.DataFrame: +def _align_df_categories( + df, dtypes, has_missing_category, cat_missing_method +) -> pd.DataFrame: """Align data types for prediction. This function checks that categorical columns have same categories in the @@ -26,6 +28,8 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame: ---------- df : pandas.DataFrame dtypes : Dict[str, Union[str, type, pandas.core.dtypes.base.ExtensionDtype]] + has_missing_category : Dict[str, bool] + cat_missing_method : str """ if not isinstance(df, pd.DataFrame): raise TypeError(f"Expected `pandas.DataFrame'; got {type(df)}.") @@ -47,6 +51,22 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame: changed_dtypes[column] = df[column].cat.set_categories( dtypes[column].categories ) + else: + continue + + if cat_missing_method == "convert" and not has_missing_category[column]: + unseen_categories = set(df[column].unique()) - set( + dtypes[column].categories + ) + else: + unseen_categories = set(df[column].dropna().unique()) - set( + dtypes[column].categories + ) + + if unseen_categories: + raise ValueError( + f"Column {column} contains unseen categories: {unseen_categories}." 
+ ) if changed_dtypes: df = df.assign(**changed_dtypes) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 4f83ffe7..469f464e 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -53,7 +53,7 @@ def get_small_x_y( - estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV] + estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV], ) -> tuple[np.ndarray, np.ndarray]: if isinstance(estimator, GeneralizedLinearRegressor): n_rows = 1 @@ -362,6 +362,43 @@ def test_P1_P2_expansion_with_categoricals(): np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) +def test_P1_P2_expansion_with_categoricals_missings(): + rng = np.random.default_rng(42) + X = pd.DataFrame( + data={ + "dense": np.linspace(0, 10, 60), + "cat": pd.Categorical(rng.integers(5, size=60)).remove_categories(0), + } + ) + y = rng.normal(size=60) + + mdl1 = GeneralizedLinearRegressor( + l1_ratio=0.01, + P1=[1, 2, 2, 2, 2, 2], + P2=[2, 1, 1, 1, 1, 1], + cat_missing_method="convert", + ) + mdl1.fit(X, y) + + mdl2 = GeneralizedLinearRegressor( + l1_ratio=0.01, + P1=[1, 2], + P2=[2, 1], + cat_missing_method="convert", + ) + mdl2.fit(X, y) + np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) + + mdl3 = GeneralizedLinearRegressor( + l1_ratio=0.01, + P1=[1, 2], + P2=sparse.diags([2, 1, 1, 1, 1, 1]), + cat_missing_method="convert", + ) + mdl3.fit(X, y) + np.testing.assert_allclose(mdl1.coef_, mdl3.coef_) + + @pytest.mark.parametrize( "estimator", [GeneralizedLinearRegressor, GeneralizedLinearRegressorCV] ) @@ -3183,40 +3220,55 @@ def test_formula_predict(get_mixed_data, formula, fit_intercept): @pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) -def test_cat_missing(cat_missing_method): +@pytest.mark.parametrize("unseen_missing", [False, True]) +@pytest.mark.parametrize("formula", [None, "cat_1 + cat_2"]) +def test_cat_missing(cat_missing_method, unseen_missing, formula): X = pd.DataFrame( { "cat_1": pd.Categorical([1, 2, pd.NA, 2, 
1]), "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]), } ) + if unseen_missing: + X = X.dropna() X_unseen = pd.DataFrame( { "cat_1": pd.Categorical([1, pd.NA]), "cat_2": pd.Categorical([1, 2]), } ) - y = np.array([1, 2, 3, 4, 5]) + y = np.array(X.index) model = GeneralizedLinearRegressor( family="normal", cat_missing_method=cat_missing_method, drop_first=False, + formula=formula, fit_intercept=False, ) - - if cat_missing_method == "fail": - with pytest.raises(ValueError): + if cat_missing_method == "fail" and not unseen_missing: + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): model.fit(X, y) else: model.fit(X, y) feature_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"] - if cat_missing_method == "convert": + if cat_missing_method == "convert" and not unseen_missing: feature_names.insert(2, "cat_1[(MISSING)]") feature_names.append("cat_2[(MISSING)]") np.testing.assert_array_equal(model.feature_names_, feature_names) assert len(model.coef_) == len(feature_names) - model.predict(X_unseen) + if cat_missing_method == "fail" and unseen_missing: + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + model.predict(X_unseen) + elif cat_missing_method == "convert" and unseen_missing: + with pytest.raises(ValueError, match="contains unseen categories"): + model.predict(X_unseen) + else: + model.predict(X_unseen) diff --git a/tests/glm/test_utils.py b/tests/glm/test_utils.py index 36cf988a..61471750 100644 --- a/tests/glm/test_utils.py +++ b/tests/glm/test_utils.py @@ -16,12 +16,15 @@ def df(): "x5": ["a", "b"], "x6": pd.Categorical(["a", "b"]), "x7": pd.Categorical(["a", "b"], categories=["b", "a"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), } ) def test_align_df_categories_numeric(df): dtypes = {column: np.float64 for column in df} + has_missing_category = {column: False for column in df} + missing_method = "fail" expected = pd.DataFrame( { @@ -32,33 +35,41 @@ def 
test_align_df_categories_numeric(df): "x5": ["a", "b"], "x6": pd.Categorical(["a", "b"]), "x7": pd.Categorical(["a", "b"], categories=["b", "a"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), } ) - pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected) + pd.testing.assert_frame_equal( + _align_df_categories(df, dtypes, has_missing_category, missing_method), expected + ) def test_align_df_categories_categorical(df): + df = df[["x5", "x6", "x7", "x8"]] dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df} + has_missing_category = {column: False for column in df} + missing_method = "fail" expected = pd.DataFrame( { - "x1": [np.nan, np.nan], - "x2": [np.nan, np.nan], - "x3": [np.nan, np.nan], - "x4": [np.nan, np.nan], "x5": pd.Categorical(["a", "b"]), "x6": pd.Categorical(["a", "b"]), "x7": pd.Categorical(["a", "b"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), }, dtype=pd.CategoricalDtype(["a", "b"]), ) - pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected) + pd.testing.assert_frame_equal( + _align_df_categories(df, dtypes, has_missing_category, missing_method), + expected, + ) def test_align_df_categories_excess_columns(df): dtypes = {"x1": np.float64} + has_missing_category = {column: False for column in df} + missing_method = "fail" expected = pd.DataFrame( { @@ -69,14 +80,19 @@ def test_align_df_categories_excess_columns(df): "x5": ["a", "b"], "x6": pd.Categorical(["a", "b"]), "x7": pd.Categorical(["a", "b"], categories=["b", "a"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), } ) - pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected) + pd.testing.assert_frame_equal( + _align_df_categories(df, dtypes, has_missing_category, missing_method), expected + ) def test_align_df_categories_missing_columns(df): dtypes = {"x0": np.float64} + has_missing_category = {column: False for column in df} + missing_method = "fail" expected = pd.DataFrame( { @@ 
-87,15 +103,69 @@ def test_align_df_categories_missing_columns(df): "x5": ["a", "b"], "x6": pd.Categorical(["a", "b"]), "x7": pd.Categorical(["a", "b"], categories=["b", "a"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), } ) - pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected) + pd.testing.assert_frame_equal( + _align_df_categories(df, dtypes, has_missing_category, missing_method), expected + ) + + +@pytest.mark.parametrize("has_missings", [False, True]) +def test_align_df_categories_convert(df, has_missings): + df = df[["x5", "x6", "x7", "x8"]] + dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df} + has_missing_category = {column: has_missings for column in df} + missing_method = "convert" + + expected = pd.DataFrame( + { + "x5": pd.Categorical(["a", "b"]), + "x6": pd.Categorical(["a", "b"]), + "x7": pd.Categorical(["a", "b"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), + }, + dtype=pd.CategoricalDtype(["a", "b"]), + ) + + if has_missings: + pd.testing.assert_frame_equal( + _align_df_categories( + df[["x5", "x6", "x7", "x8"]], + dtypes, + has_missing_category, + missing_method, + ), + expected, + ) + else: + with pytest.raises(ValueError, match="contains unseen categories"): + _align_df_categories( + df[["x5", "x6", "x7", "x8"]], + dtypes, + has_missing_category, + missing_method, + ) + + +def test_align_df_categories_raise_on_unseen(df): + dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df} + has_missing_category = {column: False for column in df} + missing_method = "fail" + + with pytest.raises(ValueError, match="contains unseen categories"): + _align_df_categories( + df, + dtypes, + has_missing_category, + missing_method, + ) def test_align_df_categories_not_df(): with pytest.raises(TypeError): - _align_df_categories(np.array([[0], [1]]), {"x0": np.float64}) + _align_df_categories(np.array([[0], [1]]), {"x0": np.float64}, {}, "fail") @pytest.fixture()