From 4d574349ffcf93eacba66ac349b025f5355a5dba Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 10 Jan 2024 16:06:15 +0100 Subject: [PATCH 01/23] drop missings not seen in training --- src/glum/_glm.py | 7 +++++-- tests/glm/test_glm.py | 15 ++++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 5b2f9b0d..7d3c80d6 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -880,10 +880,11 @@ def _convert_from_pandas( self, df: pd.DataFrame, context: Optional[Mapping[str, Any]] = None ) -> tm.MatrixBase: """Convert a pandas data frame to a tabmat matrix.""" - if hasattr(self, "X_model_spec_"): return self.X_model_spec_.get_model_matrix(df, context=context) + cat_missing_method = self.cat_missing_method + if hasattr(self, "feature_dtypes_"): df = _align_df_categories(df, self.feature_dtypes_) if self.cat_missing_method == "convert": @@ -894,12 +895,14 @@ def _convert_from_pandas( cat_missing_name=self.cat_missing_name, categorical_format=self.categorical_format, ) + # drop categories that were not seen in training + cat_missing_method = "drop" X = tm.from_pandas( df, drop_first=self.drop_first, categorical_format=self.categorical_format, - cat_missing_method=self.cat_missing_method, + cat_missing_method=cat_missing_method, ) return X diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 590ca678..a8d998b3 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -2967,7 +2967,9 @@ def get_mixed_data(): pytest.param("y ~ c1 + 1", id="categorical_intercept"), pytest.param("y ~ x1 * c1 * c2", id="interaction"), pytest.param("y ~ x1 + x2 + c1 + c2", id="numeric_and_categorical"), - pytest.param("y ~ x1 + x2 + c1 + c2 + 1", id="numeric_and_categorical_intercept"), + pytest.param( + "y ~ x1 + x2 + c1 + c2 + 1", id="numeric_and_categorical_intercept" + ), ], ) @pytest.mark.parametrize( @@ -3158,20 +3160,23 @@ def test_formula_predict(get_mixed_data, formula): 
@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) -def test_cat_missing(cat_missing_method): +@pytest.mark.parametrize("unseen_missing", [False, True]) +def test_cat_missing(cat_missing_method, unseen_missing): X = pd.DataFrame( { "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]), "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]), } ) + if unseen_missing: + X = X.dropna() X_unseen = pd.DataFrame( { "cat_1": pd.Categorical([1, pd.NA]), "cat_2": pd.Categorical([1, 2]), } ) - y = np.array([1, 2, 3, 4, 5]) + y = np.array(X.index) model = GeneralizedLinearRegressor( family="normal", @@ -3180,14 +3185,14 @@ def test_cat_missing(cat_missing_method): fit_intercept=False, ) - if cat_missing_method == "fail": + if cat_missing_method == "fail" and not unseen_missing: with pytest.raises(ValueError): model.fit(X, y) else: model.fit(X, y) feature_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"] - if cat_missing_method == "convert": + if cat_missing_method == "convert" and not unseen_missing: feature_names.insert(2, "cat_1[(MISSING)]") feature_names.append("cat_2[(MISSING)]") From 34ad571f71a63ada9ab83de93dcc78213fc6d817 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 10 Jan 2024 16:13:55 +0100 Subject: [PATCH 02/23] zero not drop --- src/glum/_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 7d3c80d6..c4123abb 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -896,7 +896,7 @@ def _convert_from_pandas( categorical_format=self.categorical_format, ) # drop categories that were not seen in training - cat_missing_method = "drop" + cat_missing_method = "zero" X = tm.from_pandas( df, From fedd9b1d3b969a3bc952c54d47e05889cb46d05e Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 10 Jan 2024 16:16:18 +0100 Subject: [PATCH 03/23] better (?) 
name [skip ci] --- src/glum/_glm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index c4123abb..5a325bd8 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -883,7 +883,7 @@ def _convert_from_pandas( if hasattr(self, "X_model_spec_"): return self.X_model_spec_.get_model_matrix(df, context=context) - cat_missing_method = self.cat_missing_method + cat_missing_method_after_cat_check = self.cat_missing_method if hasattr(self, "feature_dtypes_"): df = _align_df_categories(df, self.feature_dtypes_) @@ -896,13 +896,13 @@ def _convert_from_pandas( categorical_format=self.categorical_format, ) # drop categories that were not seen in training - cat_missing_method = "zero" + cat_missing_method_after_cat_check = "zero" X = tm.from_pandas( df, drop_first=self.drop_first, categorical_format=self.categorical_format, - cat_missing_method=cat_missing_method, + cat_missing_method=cat_missing_method_after_cat_check, ) return X From 6d2b4310b9169dbd91411238ac96eafa8b8aeb1c Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 10 Jan 2024 16:16:18 +0100 Subject: [PATCH 04/23] catch case of unseen missings and fail method --- tests/glm/test_glm.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index a8d998b3..0bc3c68e 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -3186,7 +3186,7 @@ def test_cat_missing(cat_missing_method, unseen_missing): ) if cat_missing_method == "fail" and not unseen_missing: - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Categorical data can't have missing values"): model.fit(X, y) else: model.fit(X, y) @@ -3199,4 +3199,8 @@ def test_cat_missing(cat_missing_method, unseen_missing): np.testing.assert_array_equal(model.feature_names_, feature_names) assert len(model.coef_) == len(feature_names) - model.predict(X_unseen) + if cat_missing_method == "fail" and 
unseen_missing: + with pytest.raises(ValueError, match="Categorical data can't have missing values"): + model.predict(X_unseen) + else: + model.predict(X_unseen) From 0aaf521b095a27be52058f225e6780dbf08b487b Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 10 Jan 2024 16:31:30 +0100 Subject: [PATCH 05/23] fix --- tests/glm/test_glm.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 0bc3c68e..de4ff139 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -3186,7 +3186,9 @@ def test_cat_missing(cat_missing_method, unseen_missing): ) if cat_missing_method == "fail" and not unseen_missing: - with pytest.raises(ValueError, match="Categorical data can't have missing values"): + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): model.fit(X, y) else: model.fit(X, y) @@ -3200,7 +3202,9 @@ def test_cat_missing(cat_missing_method, unseen_missing): assert len(model.coef_) == len(feature_names) if cat_missing_method == "fail" and unseen_missing: - with pytest.raises(ValueError, match="Categorical data can't have missing values"): + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): model.predict(X_unseen) else: model.predict(X_unseen) From dcdb326e014e28e1abc9fbe057a14b203096e355 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Thu, 11 Jan 2024 09:30:44 +0100 Subject: [PATCH 06/23] respect categorical missing method with formula; test different categorical missing methods also with formula --- src/glum/_glm.py | 1 + tests/glm/test_glm.py | 55 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 5a325bd8..474b2c94 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -2669,6 +2669,7 @@ def _set_up_and_check_fit_args( include_intercept=False, ensure_full_rank=self.drop_first, 
categorical_format=self.categorical_format, + cat_missing_method=self.cat_missing_method, interaction_separator=self.interaction_separator, add_column_for_intercept=False, context=context, diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index de4ff139..46004cd3 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -3166,6 +3166,7 @@ def test_cat_missing(cat_missing_method, unseen_missing): { "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]), "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]), + "x1": [1, 2, 3, 4, 5], } ) if unseen_missing: @@ -3174,6 +3175,7 @@ def test_cat_missing(cat_missing_method, unseen_missing): { "cat_1": pd.Categorical([1, pd.NA]), "cat_2": pd.Categorical([1, 2]), + "x1": [1, 2], } ) y = np.array(X.index) @@ -3208,3 +3210,56 @@ def test_cat_missing(cat_missing_method, unseen_missing): model.predict(X_unseen) else: model.predict(X_unseen) + + +@pytest.mark.parametrize("cat_missing_method", ["zero", "convert"]) +@pytest.mark.parametrize("unseen_missing", [False, True]) +@pytest.mark.parametrize("formula", [None, "cat_1 + cat_2"]) +def test_cat_missing_formula(cat_missing_method, unseen_missing, formula): + X = pd.DataFrame( + { + "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]), + "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]), + } + ) + if unseen_missing: + X = X.dropna() + X_unseen = pd.DataFrame( + { + "cat_1": pd.Categorical([1, pd.NA]), + "cat_2": pd.Categorical([1, 2]), + } + ) + y = np.array(X.index) + + model = GeneralizedLinearRegressor( + family="normal", + cat_missing_method=cat_missing_method, + drop_first=False, + formula=formula, + fit_intercept=False, + ) + + if cat_missing_method == "fail" and not unseen_missing: + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + model.fit(X, y) + else: + model.fit(X, y) + feature_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"] + + if cat_missing_method == "convert" and not unseen_missing: + feature_names.insert(2, "cat_1[(MISSING)]") + 
feature_names.append("cat_2[(MISSING)]") + + np.testing.assert_array_equal(model.feature_names_, feature_names) + assert len(model.coef_) == len(feature_names) + + if cat_missing_method == "fail" and unseen_missing: + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + model.predict(X_unseen) + else: + model.predict(X_unseen) From 24bfb37e332d76692d028a8e474803552b581a6c Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Thu, 11 Jan 2024 09:32:45 +0100 Subject: [PATCH 07/23] shorten the tests --- tests/glm/test_glm.py | 55 +------------------------------------------ 1 file changed, 1 insertion(+), 54 deletions(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 46004cd3..ffc475f2 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -3161,61 +3161,8 @@ def test_formula_predict(get_mixed_data, formula): @pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) @pytest.mark.parametrize("unseen_missing", [False, True]) -def test_cat_missing(cat_missing_method, unseen_missing): - X = pd.DataFrame( - { - "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]), - "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]), - "x1": [1, 2, 3, 4, 5], - } - ) - if unseen_missing: - X = X.dropna() - X_unseen = pd.DataFrame( - { - "cat_1": pd.Categorical([1, pd.NA]), - "cat_2": pd.Categorical([1, 2]), - "x1": [1, 2], - } - ) - y = np.array(X.index) - - model = GeneralizedLinearRegressor( - family="normal", - cat_missing_method=cat_missing_method, - drop_first=False, - fit_intercept=False, - ) - - if cat_missing_method == "fail" and not unseen_missing: - with pytest.raises( - ValueError, match="Categorical data can't have missing values" - ): - model.fit(X, y) - else: - model.fit(X, y) - feature_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"] - - if cat_missing_method == "convert" and not unseen_missing: - feature_names.insert(2, "cat_1[(MISSING)]") - feature_names.append("cat_2[(MISSING)]") - - 
np.testing.assert_array_equal(model.feature_names_, feature_names) - assert len(model.coef_) == len(feature_names) - - if cat_missing_method == "fail" and unseen_missing: - with pytest.raises( - ValueError, match="Categorical data can't have missing values" - ): - model.predict(X_unseen) - else: - model.predict(X_unseen) - - -@pytest.mark.parametrize("cat_missing_method", ["zero", "convert"]) -@pytest.mark.parametrize("unseen_missing", [False, True]) @pytest.mark.parametrize("formula", [None, "cat_1 + cat_2"]) -def test_cat_missing_formula(cat_missing_method, unseen_missing, formula): +def test_cat_missing(cat_missing_method, unseen_missing, formula): X = pd.DataFrame( { "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]), From ca19b688210a6fbc4f6e30725602853ae9912354 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Thu, 11 Jan 2024 10:03:15 +0100 Subject: [PATCH 08/23] dont allow fitting in case of conversion of categoricals and presence of formula --- src/glum/_glm.py | 5 +++++ tests/glm/test_glm.py | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 474b2c94..e5cf3d83 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -2642,6 +2642,11 @@ def _set_up_and_check_fit_args( if isinstance(X, pd.DataFrame): if hasattr(self, "formula") and self.formula is not None: + if self.cat_missing_method == "convert": + raise NotImplementedError( + "Conversion of missing categoricals with a formula is currently not supported." 
+ ) + lhs, rhs = _parse_formula( self.formula, include_intercept=self.fit_intercept ) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index ffc475f2..e4f9d103 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -3186,7 +3186,12 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula): formula=formula, fit_intercept=False, ) - + if cat_missing_method == "convert" and formula: + with pytest.raises( + NotImplementedError, + match="Conversion of missing categoricals with a formula", + ): + model.fit(X, y) if cat_missing_method == "fail" and not unseen_missing: with pytest.raises( ValueError, match="Categorical data can't have missing values" From 74a5329846e84972003c4adb5c6b78f6d993617d Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Thu, 11 Jan 2024 10:12:39 +0100 Subject: [PATCH 09/23] clearer error msg --- src/glum/_glm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index e5cf3d83..fe838daf 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -2644,7 +2644,8 @@ def _set_up_and_check_fit_args( if hasattr(self, "formula") and self.formula is not None: if self.cat_missing_method == "convert": raise NotImplementedError( - "Conversion of missing categoricals with a formula is currently not supported." + "cat_missing_method == 'convert' with a formula " + "is not allowed." 
) lhs, rhs = _parse_formula( From e2786049d72977a1ddc5262dacbadee2f3732513 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Thu, 11 Jan 2024 10:16:44 +0100 Subject: [PATCH 10/23] also change the error msg in the regex (facepalm) --- tests/glm/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index e4f9d103..22d763be 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -3189,7 +3189,7 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula): if cat_missing_method == "convert" and formula: with pytest.raises( NotImplementedError, - match="Conversion of missing categoricals with a formula", + match="cat_missing_method == 'convert' with a formula is not allowed", ): model.fit(X, y) if cat_missing_method == "fail" and not unseen_missing: From ab5526c26efac09f5a206b83920415926c96dc5a Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Thu, 11 Jan 2024 10:28:20 +0100 Subject: [PATCH 11/23] remove matches --- tests/glm/test_glm.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 22d763be..4c9df484 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -3187,15 +3187,10 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula): fit_intercept=False, ) if cat_missing_method == "convert" and formula: - with pytest.raises( - NotImplementedError, - match="cat_missing_method == 'convert' with a formula is not allowed", - ): + with pytest.raises(NotImplementedError): model.fit(X, y) if cat_missing_method == "fail" and not unseen_missing: - with pytest.raises( - ValueError, match="Categorical data can't have missing values" - ): + with pytest.raises(ValueError): model.fit(X, y) else: model.fit(X, y) @@ -3209,9 +3204,7 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula): assert len(model.coef_) == len(feature_names) if cat_missing_method == 
"fail" and unseen_missing: - with pytest.raises( - ValueError, match="Categorical data can't have missing values" - ): + with pytest.raises(ValueError): model.predict(X_unseen) else: model.predict(X_unseen) From ca93be8a5efce577d44403046590790a5c5c379c Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Thu, 11 Jan 2024 10:38:03 +0100 Subject: [PATCH 12/23] fix --- tests/glm/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 4c9df484..99380249 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -3189,7 +3189,7 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula): if cat_missing_method == "convert" and formula: with pytest.raises(NotImplementedError): model.fit(X, y) - if cat_missing_method == "fail" and not unseen_missing: + elif cat_missing_method == "fail" and not unseen_missing: with pytest.raises(ValueError): model.fit(X, y) else: From 5e75f784f07124723bebfa783e20f63a6001a74f Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Thu, 11 Jan 2024 11:40:03 +0100 Subject: [PATCH 13/23] better name --- src/glum/_glm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index fe838daf..8f47bf74 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -883,7 +883,7 @@ def _convert_from_pandas( if hasattr(self, "X_model_spec_"): return self.X_model_spec_.get_model_matrix(df, context=context) - cat_missing_method_after_cat_check = self.cat_missing_method + cat_missing_method_after_alignment = self.cat_missing_method if hasattr(self, "feature_dtypes_"): df = _align_df_categories(df, self.feature_dtypes_) @@ -896,13 +896,13 @@ def _convert_from_pandas( categorical_format=self.categorical_format, ) # drop categories that were not seen in training - cat_missing_method_after_cat_check = "zero" + cat_missing_method_after_alignment = "zero" X = tm.from_pandas( df, drop_first=self.drop_first, 
categorical_format=self.categorical_format, - cat_missing_method=cat_missing_method_after_cat_check, + cat_missing_method=cat_missing_method_after_alignment, ) return X From c2d88b23bbace810ce333f45ba0d3e1647da3cfd Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Mon, 15 Jan 2024 09:51:48 +0100 Subject: [PATCH 14/23] describe more restrictive behavior in tutorial --- .../formula_interface/formula_interface.ipynb | 137 +----------------- 1 file changed, 2 insertions(+), 135 deletions(-) diff --git a/docs/tutorials/formula_interface/formula_interface.ipynb b/docs/tutorials/formula_interface/formula_interface.ipynb index acdf50ea..d396adc0 100644 --- a/docs/tutorials/formula_interface/formula_interface.ipynb +++ b/docs/tutorials/formula_interface/formula_interface.ipynb @@ -1430,140 +1430,7 @@ "source": [ "### Missing Values in Categorical Columns\n", "\n", - "By default, `glum` raises a `ValueError` when it encounters a missing value in a categorical variable (`\"raise\"` option). However, there are two other options for handling these cases. They can also be treated as if they represented all-zeros indicators (`\"zero\"` option, which is also the way `pandas.get_dummies` works) or missing values can be treated as their own separate category (`\"convert\"` option).\n", - "\n", - "Similarly to the non-formula-based interface, `glum`'s behavior can be set globally using the `cat_missing_method` parameter during model initialization. However, formulas provide some additional flexibility: the `C` function has a `missing_method` parameter, with which users can select an option on a column-by-column basis. Here is an example of doing that (although our dataset does not have any missing values, so these options have no actual effect in this case):" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
interceptC(DrivAge, missing_method='zero')[0]C(DrivAge, missing_method='zero')[1]C(DrivAge, missing_method='zero')[2]C(DrivAge, missing_method='zero')[3]C(DrivAge, missing_method='zero')[4]C(DrivAge, missing_method='zero')[5]C(DrivAge, missing_method='zero')[6]C(VehPower, missing_method='convert')[4]C(VehPower, missing_method='convert')[5]C(VehPower, missing_method='convert')[6]C(VehPower, missing_method='convert')[7]C(VehPower, missing_method='convert')[8]C(VehPower, missing_method='convert')[9]
coefficient0.01.7867030.7427650.2395280.0965310.0711180.00.2010784.6372674.6793914.8633874.772634.7496734.970188
\n", - "
" - ], - "text/plain": [ - " intercept C(DrivAge, missing_method='zero')[0] \\\n", - "coefficient 0.0 1.786703 \n", - "\n", - " C(DrivAge, missing_method='zero')[1] \\\n", - "coefficient 0.742765 \n", - "\n", - " C(DrivAge, missing_method='zero')[2] \\\n", - "coefficient 0.239528 \n", - "\n", - " C(DrivAge, missing_method='zero')[3] \\\n", - "coefficient 0.096531 \n", - "\n", - " C(DrivAge, missing_method='zero')[4] \\\n", - "coefficient 0.071118 \n", - "\n", - " C(DrivAge, missing_method='zero')[5] \\\n", - "coefficient 0.0 \n", - "\n", - " C(DrivAge, missing_method='zero')[6] \\\n", - "coefficient 0.201078 \n", - "\n", - " C(VehPower, missing_method='convert')[4] \\\n", - "coefficient 4.637267 \n", - "\n", - " C(VehPower, missing_method='convert')[5] \\\n", - "coefficient 4.679391 \n", - "\n", - " C(VehPower, missing_method='convert')[6] \\\n", - "coefficient 4.863387 \n", - "\n", - " C(VehPower, missing_method='convert')[7] \\\n", - "coefficient 4.77263 \n", - "\n", - " C(VehPower, missing_method='convert')[8] \\\n", - "coefficient 4.749673 \n", - "\n", - " C(VehPower, missing_method='convert')[9] \n", - "coefficient 4.970188 " - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "formula_missing = \"C(DrivAge, missing_method='zero') + C(VehPower, missing_method='convert')\"\n", - "\n", - "t_glm8 = GeneralizedLinearRegressor(\n", - " family=TweedieDist,\n", - " alpha_search=True,\n", - " l1_ratio=1,\n", - " fit_intercept=False,\n", - " formula=formula_missing,\n", - "\n", - ")\n", - "t_glm8.fit(\n", - " X=df_train, y=df_train[\"PurePremium\"], sample_weight=df[\"Exposure\"].values[train]\n", - ")\n", - "\n", - "pd.DataFrame(\n", - " {\"coefficient\": np.concatenate(([t_glm8.intercept_], t_glm8.coef_))},\n", - " index=[\"intercept\"] + t_glm8.feature_names_,\n", - ").T" + "By default, `glum` raises a `ValueError` when it encounters a missing value in a categorical variable (`\"raise\"` option). 
However, there are two other options for handling these cases. They can also be treated as if they represented all-zeros indicators (`\"zero\"` option, which is also the way `pandas.get_dummies` works) or missing values can be treated as their own separate category (`\"convert\"` option). The treatment of missings should be set globally by the `cat_missing_method` parameter during model initialization. The `\"convert\"` option is only valid for the interface without a formula." ] } ], @@ -1583,7 +1450,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.1" }, "orig_nbformat": 4 }, From fb59cfc02270ccd4aa3c29be5db000141cfe8ce3 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 22 Jan 2024 23:34:04 +0100 Subject: [PATCH 15/23] Raise error on unseen levels when predicting --- src/glum/_util.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/glum/_util.py b/src/glum/_util.py index 24b08f40..ce734540 100644 --- a/src/glum/_util.py +++ b/src/glum/_util.py @@ -47,6 +47,14 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame: changed_dtypes[column] = df[column].cat.set_categories( dtypes[column].categories ) + else: + continue + + unseen_categories = set(df[column].unique()) - set(dtypes[column].categories) + if unseen_categories: + raise ValueError( + f"Column {column} contains unseen categories: {unseen_categories}." 
+ ) if changed_dtypes: df = df.assign(**changed_dtypes) From 1618707af41de02103ecb937c6ee13bb3aebdde1 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 22 Jan 2024 23:45:49 +0100 Subject: [PATCH 16/23] Allow cat_missing_method='convert' again --- .../formula_interface/formula_interface.ipynb | 137 +++++++++++++++++- src/glum/_glm.py | 9 +- 2 files changed, 137 insertions(+), 9 deletions(-) diff --git a/docs/tutorials/formula_interface/formula_interface.ipynb b/docs/tutorials/formula_interface/formula_interface.ipynb index d396adc0..acdf50ea 100644 --- a/docs/tutorials/formula_interface/formula_interface.ipynb +++ b/docs/tutorials/formula_interface/formula_interface.ipynb @@ -1430,7 +1430,140 @@ "source": [ "### Missing Values in Categorical Columns\n", "\n", - "By default, `glum` raises a `ValueError` when it encounters a missing value in a categorical variable (`\"raise\"` option). However, there are two other options for handling these cases. They can also be treated as if they represented all-zeros indicators (`\"zero\"` option, which is also the way `pandas.get_dummies` works) or missing values can be treated as their own separate category (`\"convert\"` option). The treatment of missings should be set globally by the `cat_missing_method` parameter during model initialization. The `\"convert\"` option is only valid for the interface without a formula." + "By default, `glum` raises a `ValueError` when it encounters a missing value in a categorical variable (`\"fail\"` option). However, there are two other options for handling these cases. They can also be treated as if they represented all-zeros indicators (`\"zero\"` option, which is also the way `pandas.get_dummies` works) or missing values can be treated as their own separate category (`\"convert\"` option).\n", + "\n", + "Similarly to the non-formula-based interface, `glum`'s behavior can be set globally using the `cat_missing_method` parameter during model initialization.
However, formulas provide some additional flexibility: the `C` function has a `missing_method` parameter, with which users can select an option on a column-by-column basis. Here is an example of doing that (although our dataset does not have any missing values, so these options have no actual effect in this case):" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interceptC(DrivAge, missing_method='zero')[0]C(DrivAge, missing_method='zero')[1]C(DrivAge, missing_method='zero')[2]C(DrivAge, missing_method='zero')[3]C(DrivAge, missing_method='zero')[4]C(DrivAge, missing_method='zero')[5]C(DrivAge, missing_method='zero')[6]C(VehPower, missing_method='convert')[4]C(VehPower, missing_method='convert')[5]C(VehPower, missing_method='convert')[6]C(VehPower, missing_method='convert')[7]C(VehPower, missing_method='convert')[8]C(VehPower, missing_method='convert')[9]
coefficient0.01.7867030.7427650.2395280.0965310.0711180.00.2010784.6372674.6793914.8633874.772634.7496734.970188
\n", + "
" + ], + "text/plain": [ + " intercept C(DrivAge, missing_method='zero')[0] \\\n", + "coefficient 0.0 1.786703 \n", + "\n", + " C(DrivAge, missing_method='zero')[1] \\\n", + "coefficient 0.742765 \n", + "\n", + " C(DrivAge, missing_method='zero')[2] \\\n", + "coefficient 0.239528 \n", + "\n", + " C(DrivAge, missing_method='zero')[3] \\\n", + "coefficient 0.096531 \n", + "\n", + " C(DrivAge, missing_method='zero')[4] \\\n", + "coefficient 0.071118 \n", + "\n", + " C(DrivAge, missing_method='zero')[5] \\\n", + "coefficient 0.0 \n", + "\n", + " C(DrivAge, missing_method='zero')[6] \\\n", + "coefficient 0.201078 \n", + "\n", + " C(VehPower, missing_method='convert')[4] \\\n", + "coefficient 4.637267 \n", + "\n", + " C(VehPower, missing_method='convert')[5] \\\n", + "coefficient 4.679391 \n", + "\n", + " C(VehPower, missing_method='convert')[6] \\\n", + "coefficient 4.863387 \n", + "\n", + " C(VehPower, missing_method='convert')[7] \\\n", + "coefficient 4.77263 \n", + "\n", + " C(VehPower, missing_method='convert')[8] \\\n", + "coefficient 4.749673 \n", + "\n", + " C(VehPower, missing_method='convert')[9] \n", + "coefficient 4.970188 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula_missing = \"C(DrivAge, missing_method='zero') + C(VehPower, missing_method='convert')\"\n", + "\n", + "t_glm8 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=False,\n", + " formula=formula_missing,\n", + "\n", + ")\n", + "t_glm8.fit(\n", + " X=df_train, y=df_train[\"PurePremium\"], sample_weight=df[\"Exposure\"].values[train]\n", + ")\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm8.intercept_], t_glm8.coef_))},\n", + " index=[\"intercept\"] + t_glm8.feature_names_,\n", + ").T" ] } ], @@ -1450,7 +1583,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + 
"version": "3.11.4" }, "orig_nbformat": 4 }, diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 53172bb5..62afc68d 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -894,8 +894,8 @@ def _convert_from_pandas( cat_missing_name=self.cat_missing_name, categorical_format=self.categorical_format, ) - # drop categories that were not seen in training - cat_missing_method_after_alignment = "zero" + # there should be no missing categories after this + cat_missing_method_after_alignment = "fail" X = tm.from_pandas( df, @@ -2650,11 +2650,6 @@ def _set_up_and_check_fit_args( if isinstance(X, pd.DataFrame): if hasattr(self, "formula") and self.formula is not None: - if self.cat_missing_method == "convert": - raise NotImplementedError( - "cat_missing_method == 'convert' with a formula " - "is not allowed." - ) lhs, rhs = _parse_formula( self.formula, include_intercept=self.fit_intercept From c448f3dc6b3b7ce814224225ee746caf9e27c234 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 23 Jan 2024 02:52:11 +0100 Subject: [PATCH 17/23] Update test --- tests/glm/test_glm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index b19eff74..0829722e 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -3209,11 +3209,8 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula): formula=formula, fit_intercept=False, ) - if cat_missing_method == "convert" and formula: - with pytest.raises(NotImplementedError): - model.fit(X, y) - elif cat_missing_method == "fail" and not unseen_missing: - with pytest.raises(ValueError): + if cat_missing_method == "fail" and not unseen_missing: + with pytest.raises(ValueError, match="Categorical data can't have missing values"): model.fit(X, y) else: model.fit(X, y) @@ -3227,7 +3224,10 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula): assert len(model.coef_) == len(feature_names) if cat_missing_method == "fail" and 
unseen_missing: - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Categorical data can't have missing values"): + model.predict(X_unseen) + elif cat_missing_method == "convert" and unseen_missing: + with pytest.raises(ValueError, match="contains unseen categories"): model.predict(X_unseen) else: model.predict(X_unseen) From 046d9ff9ad5558409fa02021269731cec1e6f8c4 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 23 Jan 2024 03:00:13 +0100 Subject: [PATCH 18/23] Check for unseen categories --- src/glum/_glm.py | 12 ++++++++++-- src/glum/_util.py | 16 ++++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 62afc68d..43f6516e 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -885,7 +885,12 @@ def _convert_from_pandas( cat_missing_method_after_alignment = self.cat_missing_method if hasattr(self, "feature_dtypes_"): - df = _align_df_categories(df, self.feature_dtypes_) + df = _align_df_categories( + df, + self.feature_dtypes_, + self.has_missing_category_, + self.cat_missing_method, + ) if self.cat_missing_method == "convert": df = _add_missing_categories( df=df, @@ -2650,7 +2655,6 @@ def _set_up_and_check_fit_args( if isinstance(X, pd.DataFrame): if hasattr(self, "formula") and self.formula is not None: - lhs, rhs = _parse_formula( self.formula, include_intercept=self.fit_intercept ) @@ -2705,6 +2709,10 @@ def _set_up_and_check_fit_args( # Maybe TODO: expand categorical penalties with formulas self.feature_dtypes_ = X.dtypes.to_dict() + self.has_missing_category_ = { + col: (self.cat_missing_method == "convert") and X[col].isna().any() + for col in self.feature_dtypes_.keys() + } if any(X.dtypes == "category"): diff --git a/src/glum/_util.py b/src/glum/_util.py index ce734540..f5c463ff 100644 --- a/src/glum/_util.py +++ b/src/glum/_util.py @@ -15,7 +15,9 @@ def _asanyarray(x, **kwargs): return x if pd.api.types.is_scalar(x) else np.asanyarray(x, **kwargs) -def 
_align_df_categories(df, dtypes) -> pd.DataFrame: +def _align_df_categories( + df, dtypes, has_missing_category, cat_missing_method +) -> pd.DataFrame: """Align data types for prediction. This function checks that categorical columns have same categories in the @@ -26,6 +28,8 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame: ---------- df : pandas.DataFrame dtypes : Dict[str, Union[str, type, pandas.core.dtypes.base.ExtensionDtype]] + has_missing_category : Dict[str, bool] + cat_missing_method : str """ if not isinstance(df, pd.DataFrame): raise TypeError(f"Expected `pandas.DataFrame'; got {type(df)}.") @@ -50,7 +54,15 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame: else: continue - unseen_categories = set(df[column].unique()) - set(dtypes[column].categories) + if cat_missing_method == "convert" and not has_missing_category[column]: + unseen_categories = set(df[column].unique()) - set( + dtypes[column].categories + ) + else: + unseen_categories = set(df[column].dropna().unique()) - set( + dtypes[column].categories + ) + if unseen_categories: raise ValueError( f"Column {column} contains unseen categories: {unseen_categories}." 
From 39ce302ed982f504c6ba9d68b06448850a07bcf8 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 23 Jan 2024 03:32:30 +0100 Subject: [PATCH 19/23] Adapt align_df_categories tests to changes --- tests/glm/test_utils.py | 88 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 79 insertions(+), 9 deletions(-) diff --git a/tests/glm/test_utils.py b/tests/glm/test_utils.py index 36cf988a..61471750 100644 --- a/tests/glm/test_utils.py +++ b/tests/glm/test_utils.py @@ -16,12 +16,15 @@ def df(): "x5": ["a", "b"], "x6": pd.Categorical(["a", "b"]), "x7": pd.Categorical(["a", "b"], categories=["b", "a"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), } ) def test_align_df_categories_numeric(df): dtypes = {column: np.float64 for column in df} + has_missing_category = {column: False for column in df} + missing_method = "fail" expected = pd.DataFrame( { @@ -32,33 +35,41 @@ def test_align_df_categories_numeric(df): "x5": ["a", "b"], "x6": pd.Categorical(["a", "b"]), "x7": pd.Categorical(["a", "b"], categories=["b", "a"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), } ) - pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected) + pd.testing.assert_frame_equal( + _align_df_categories(df, dtypes, has_missing_category, missing_method), expected + ) def test_align_df_categories_categorical(df): + df = df[["x5", "x6", "x7", "x8"]] dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df} + has_missing_category = {column: False for column in df} + missing_method = "fail" expected = pd.DataFrame( { - "x1": [np.nan, np.nan], - "x2": [np.nan, np.nan], - "x3": [np.nan, np.nan], - "x4": [np.nan, np.nan], "x5": pd.Categorical(["a", "b"]), "x6": pd.Categorical(["a", "b"]), "x7": pd.Categorical(["a", "b"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), }, dtype=pd.CategoricalDtype(["a", "b"]), ) - pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected) + pd.testing.assert_frame_equal( + 
_align_df_categories(df, dtypes, has_missing_category, missing_method), + expected, + ) def test_align_df_categories_excess_columns(df): dtypes = {"x1": np.float64} + has_missing_category = {column: False for column in df} + missing_method = "fail" expected = pd.DataFrame( { @@ -69,14 +80,19 @@ def test_align_df_categories_excess_columns(df): "x5": ["a", "b"], "x6": pd.Categorical(["a", "b"]), "x7": pd.Categorical(["a", "b"], categories=["b", "a"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), } ) - pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected) + pd.testing.assert_frame_equal( + _align_df_categories(df, dtypes, has_missing_category, missing_method), expected + ) def test_align_df_categories_missing_columns(df): dtypes = {"x0": np.float64} + has_missing_category = {column: False for column in df} + missing_method = "fail" expected = pd.DataFrame( { @@ -87,15 +103,69 @@ def test_align_df_categories_missing_columns(df): "x5": ["a", "b"], "x6": pd.Categorical(["a", "b"]), "x7": pd.Categorical(["a", "b"], categories=["b", "a"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), } ) - pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected) + pd.testing.assert_frame_equal( + _align_df_categories(df, dtypes, has_missing_category, missing_method), expected + ) + + +@pytest.mark.parametrize("has_missings", [False, True]) +def test_align_df_categories_convert(df, has_missings): + df = df[["x5", "x6", "x7", "x8"]] + dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df} + has_missing_category = {column: has_missings for column in df} + missing_method = "convert" + + expected = pd.DataFrame( + { + "x5": pd.Categorical(["a", "b"]), + "x6": pd.Categorical(["a", "b"]), + "x7": pd.Categorical(["a", "b"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), + }, + dtype=pd.CategoricalDtype(["a", "b"]), + ) + + if has_missings: + pd.testing.assert_frame_equal( + _align_df_categories( + 
df[["x5", "x6", "x7", "x8"]], + dtypes, + has_missing_category, + missing_method, + ), + expected, + ) + else: + with pytest.raises(ValueError, match="contains unseen categories"): + _align_df_categories( + df[["x5", "x6", "x7", "x8"]], + dtypes, + has_missing_category, + missing_method, + ) + + +def test_align_df_categories_raise_on_unseen(df): + dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df} + has_missing_category = {column: False for column in df} + missing_method = "fail" + + with pytest.raises(ValueError, match="contains unseen categories"): + _align_df_categories( + df, + dtypes, + has_missing_category, + missing_method, + ) def test_align_df_categories_not_df(): with pytest.raises(TypeError): - _align_df_categories(np.array([[0], [1]]), {"x0": np.float64}) + _align_df_categories(np.array([[0], [1]]), {"x0": np.float64}, {}, "fail") @pytest.fixture() From 099f362fd3feaf9d208cc62073dfcd18f3f5f058 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 23 Jan 2024 04:11:38 +0100 Subject: [PATCH 20/23] Make pre-commit happy --- tests/glm/test_glm.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 0829722e..e9d2bb3a 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -3210,7 +3210,9 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula): fit_intercept=False, ) if cat_missing_method == "fail" and not unseen_missing: - with pytest.raises(ValueError, match="Categorical data can't have missing values"): + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): model.fit(X, y) else: model.fit(X, y) @@ -3224,7 +3226,9 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula): assert len(model.coef_) == len(feature_names) if cat_missing_method == "fail" and unseen_missing: - with pytest.raises(ValueError, match="Categorical data can't have missing values"): + with pytest.raises( + ValueError, 
match="Categorical data can't have missing values" + ): model.predict(X_unseen) elif cat_missing_method == "convert" and unseen_missing: with pytest.raises(ValueError, match="contains unseen categories"): From 056bf6851f64427735c9e1e1d698d3c772149184 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 25 Jan 2024 14:27:18 +0100 Subject: [PATCH 21/23] Avoid unnecessary work --- src/glum/_glm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 43f6516e..33afb37f 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -2711,7 +2711,8 @@ def _set_up_and_check_fit_args( self.feature_dtypes_ = X.dtypes.to_dict() self.has_missing_category_ = { col: (self.cat_missing_method == "convert") and X[col].isna().any() - for col in self.feature_dtypes_.keys() + for col, dtype in self.feature_dtypes_.items() + if isinstance(dtype, pd.CategoricalDtype) } if any(X.dtypes == "category"): From 0b666ee330d3716c641223528366878881018df0 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 29 Jan 2024 14:35:44 +0100 Subject: [PATCH 22/23] Correctly expand penalties with categoricals and `cat_missing_method="convert"` (#753) * Correctyl expand penalties when cat_missing_method=convert * Add test * Improve variable names Co-authored-by: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> --------- Co-authored-by: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> --- src/glum/_glm.py | 22 +++++++++++++++++----- tests/glm/test_glm.py | 39 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 33afb37f..ca26e4e6 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -2717,7 +2717,9 @@ def _set_up_and_check_fit_args( if any(X.dtypes == "category"): - def _expand_categorical_penalties(penalty, X, drop_first): + def _expand_categorical_penalties( + penalty, X, 
drop_first, has_missing_category + ): """ If P1 or P2 has the same shape as X before expanding the categoricals, we assume that the penalty at the location of @@ -2741,19 +2743,29 @@ def _expand_categorical_penalties(penalty, X, drop_first): chain.from_iterable( [ elmt - for _ in dtype.categories[int(drop_first) :] + for _ in range( + len(dtype.categories) + + has_missing_category[col] + - drop_first + ) ] if pd.api.types.is_categorical_dtype(dtype) else [elmt] - for elmt, dtype in zip(penalty, X.dtypes) + for elmt, (col, dtype) in zip( + penalty, X.dtypes.items() + ) ) ) ) else: return penalty - P1 = _expand_categorical_penalties(self.P1, X, self.drop_first) - P2 = _expand_categorical_penalties(self.P2, X, self.drop_first) + P1 = _expand_categorical_penalties( + self.P1, X, self.drop_first, self.has_missing_category_ + ) + P2 = _expand_categorical_penalties( + self.P2, X, self.drop_first, self.has_missing_category_ + ) X = tm.from_pandas( X, diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index e9d2bb3a..469f464e 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -53,7 +53,7 @@ def get_small_x_y( - estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV] + estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV], ) -> tuple[np.ndarray, np.ndarray]: if isinstance(estimator, GeneralizedLinearRegressor): n_rows = 1 @@ -362,6 +362,43 @@ def test_P1_P2_expansion_with_categoricals(): np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) +def test_P1_P2_expansion_with_categoricals_missings(): + rng = np.random.default_rng(42) + X = pd.DataFrame( + data={ + "dense": np.linspace(0, 10, 60), + "cat": pd.Categorical(rng.integers(5, size=60)).remove_categories(0), + } + ) + y = rng.normal(size=60) + + mdl1 = GeneralizedLinearRegressor( + l1_ratio=0.01, + P1=[1, 2, 2, 2, 2, 2], + P2=[2, 1, 1, 1, 1, 1], + cat_missing_method="convert", + ) + mdl1.fit(X, y) + + mdl2 = GeneralizedLinearRegressor( + l1_ratio=0.01, + P1=[1, 
2], + P2=[2, 1], + cat_missing_method="convert", + ) + mdl2.fit(X, y) + np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) + + mdl3 = GeneralizedLinearRegressor( + l1_ratio=0.01, + P1=[1, 2], + P2=sparse.diags([2, 1, 1, 1, 1, 1]), + cat_missing_method="convert", + ) + mdl3.fit(X, y) + np.testing.assert_allclose(mdl1.coef_, mdl3.coef_) + + @pytest.mark.parametrize( "estimator", [GeneralizedLinearRegressor, GeneralizedLinearRegressorCV] ) From 2fcbc9b53403fc9c669b4f14d69aeb54af9ea01d Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Mon, 29 Jan 2024 14:42:19 +0100 Subject: [PATCH 23/23] bump tabmat pre-release version --- conda.recipe/meta.yaml | 2 +- environment.yml | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 938db6d9..35218f7c 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -36,7 +36,7 @@ requirements: - scikit-learn >=0.23 - scipy - formulaic >=0.6 - - tabmat >=4.0.0a + - tabmat >=4.0.0a3 test: requires: diff --git a/environment.yml b/environment.yml index f621d424..d0d7d172 100644 --- a/environment.yml +++ b/environment.yml @@ -9,7 +9,7 @@ dependencies: - libblas>=0=*mkl # comment this line out for macOS arm64 - numexpr - pandas>=0.21 - - tabmat>=4.0.0a + - tabmat>=4.0.0a3 - scikit-learn>=0.23 - scipy - tqdm diff --git a/setup.py b/setup.py index cf21ad77..515c68c2 100644 --- a/setup.py +++ b/setup.py @@ -87,7 +87,7 @@ "scikit-learn>=0.23", "scipy", "formulaic>=0.6", - "tabmat>=4.0.0a", + "tabmat>=4.0.0a3", ], entry_points=None if os.environ.get("CONDA_BUILD")