Skip to content

Commit

Permalink
Correctly expand penalties with categoricals and `cat_missing_method=…
Browse files Browse the repository at this point in the history
…"convert"` (#753)

* Correctly expand penalties when cat_missing_method=convert

* Add test

* Improve variable names

Co-authored-by: Matthias Schmidtblaicher <[email protected]>

---------

Co-authored-by: Matthias Schmidtblaicher <[email protected]>
  • Loading branch information
stanmart and MatthiasSchmidtblaicherQC authored Jan 29, 2024
1 parent 056bf68 commit 0b666ee
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 6 deletions.
22 changes: 17 additions & 5 deletions src/glum/_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2717,7 +2717,9 @@ def _set_up_and_check_fit_args(

if any(X.dtypes == "category"):

def _expand_categorical_penalties(penalty, X, drop_first):
def _expand_categorical_penalties(
penalty, X, drop_first, has_missing_category
):
"""
If P1 or P2 has the same shape as X before expanding the
categoricals, we assume that the penalty at the location of
Expand All @@ -2741,19 +2743,29 @@ def _expand_categorical_penalties(penalty, X, drop_first):
chain.from_iterable(
[
elmt
for _ in dtype.categories[int(drop_first) :]
for _ in range(
len(dtype.categories)
+ has_missing_category[col]
- drop_first
)
]
if pd.api.types.is_categorical_dtype(dtype)
else [elmt]
for elmt, dtype in zip(penalty, X.dtypes)
for elmt, (col, dtype) in zip(
penalty, X.dtypes.items()
)
)
)
)
else:
return penalty

P1 = _expand_categorical_penalties(self.P1, X, self.drop_first)
P2 = _expand_categorical_penalties(self.P2, X, self.drop_first)
P1 = _expand_categorical_penalties(
self.P1, X, self.drop_first, self.has_missing_category_
)
P2 = _expand_categorical_penalties(
self.P2, X, self.drop_first, self.has_missing_category_
)

X = tm.from_pandas(
X,
Expand Down
39 changes: 38 additions & 1 deletion tests/glm/test_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@


def get_small_x_y(
estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV]
estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV],
) -> tuple[np.ndarray, np.ndarray]:
if isinstance(estimator, GeneralizedLinearRegressor):
n_rows = 1
Expand Down Expand Up @@ -362,6 +362,43 @@ def test_P1_P2_expansion_with_categoricals():
np.testing.assert_allclose(mdl1.coef_, mdl2.coef_)


def test_P1_P2_expansion_with_categoricals_missings():
    """Penalties given per input column must match fully expanded penalties.

    The design has one dense column and one categorical column from which
    category ``0`` has been removed, and every model is fitted with
    ``cat_missing_method="convert"``. A model that receives length-6 ``P1``
    and ``P2`` serves as the reference; models that receive the short,
    length-2 form (or a mix of a short ``P1`` and a sparse diagonal ``P2``)
    must end up with the same coefficients.
    """
    n_obs = 60
    rng = np.random.default_rng(42)

    dense_col = np.linspace(0, 10, n_obs)
    # Removing category 0 leaves any rows that held it without a category.
    cat_col = pd.Categorical(rng.integers(5, size=n_obs)).remove_categories(0)
    X = pd.DataFrame(data={"dense": dense_col, "cat": cat_col})
    # Drawn after the categorical codes so the random stream is consumed in
    # a fixed order.
    y = rng.normal(size=n_obs)

    shared_kwargs = {"l1_ratio": 0.01, "cat_missing_method": "convert"}

    def fitted_coefficients(P1, P2):
        # Fit a fresh estimator with the given penalties; return its coef_.
        model = GeneralizedLinearRegressor(P1=P1, P2=P2, **shared_kwargs)
        model.fit(X, y)
        return model.coef_

    # Reference: penalties spelled out for every expanded column.
    reference = fitted_coefficients([1, 2, 2, 2, 2, 2], [2, 1, 1, 1, 1, 1])

    # One penalty entry per input column.
    per_column = fitted_coefficients([1, 2], [2, 1])
    np.testing.assert_allclose(reference, per_column)

    # Short P1 combined with an already expanded sparse diagonal P2.
    mixed = fitted_coefficients([1, 2], sparse.diags([2, 1, 1, 1, 1, 1]))
    np.testing.assert_allclose(reference, mixed)


@pytest.mark.parametrize(
"estimator", [GeneralizedLinearRegressor, GeneralizedLinearRegressorCV]
)
Expand Down

0 comments on commit 0b666ee

Please sign in to comment.