Skip to content

Commit

Permalink
Informative error when encountering categories that were not seen in …
Browse files Browse the repository at this point in the history
…training (#748)

* drop missings not seen in training

* zero not drop

* better (?) name [skip ci]

* catch case of unseen missings and fail method

* fix

* respect categorical missing method with formula; test different categorical missing methods also with formula

* shorten the tests

* don't allow fitting in case of conversion of categoricals and presence of formula

* clearer error msg

* also change the error msg in the regex (facepalm)

* remove matches

* fix

* better name

* describe more restrictive behavior in tutorial

* Raise error on unseen levels when predicting

* Allow cat_missing_method='convert' again

* Update test

* Check for unseen categories

* Adapt align_df_categories tests to changes

* Make pre-commit happy

* Avoid unnecessary work

* Correctly expand penalties with categoricals and `cat_missing_method="convert"` (#753)

* Correctly expand penalties when cat_missing_method=convert

* Add test

* Improve variable names

Co-authored-by: Matthias Schmidtblaicher <[email protected]>

---------

Co-authored-by: Matthias Schmidtblaicher <[email protected]>

* bump tabmat pre-release version

---------

Co-authored-by: Martin Stancsics <[email protected]>
  • Loading branch information
MatthiasSchmidtblaicherQC and stanmart authored Jan 29, 2024
1 parent 6b2b844 commit 1ad8be2
Show file tree
Hide file tree
Showing 7 changed files with 197 additions and 29 deletions.
2 changes: 1 addition & 1 deletion conda.recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ requirements:
- scikit-learn >=0.23
- scipy
- formulaic >=0.6
- tabmat >=4.0.0a
- tabmat >=4.0.0a3

test:
requires:
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ dependencies:
- libblas>=0=*mkl # comment this line out for macOS arm64
- numexpr
- pandas>=0.21
- tabmat>=4.0.0a
- tabmat>=4.0.0a3
- scikit-learn>=0.23
- scipy
- tqdm
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
"scikit-learn>=0.23",
"scipy",
"formulaic>=0.6",
"tabmat>=4.0.0a",
"tabmat>=4.0.0a3",
],
entry_points=None
if os.environ.get("CONDA_BUILD")
Expand Down
42 changes: 34 additions & 8 deletions src/glum/_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,12 +879,18 @@ def _convert_from_pandas(
self, df: pd.DataFrame, context: Optional[Mapping[str, Any]] = None
) -> tm.MatrixBase:
"""Convert a pandas data frame to a tabmat matrix."""

if hasattr(self, "X_model_spec_"):
return self.X_model_spec_.get_model_matrix(df, context=context)

cat_missing_method_after_alignment = self.cat_missing_method

if hasattr(self, "feature_dtypes_"):
df = _align_df_categories(df, self.feature_dtypes_)
df = _align_df_categories(
df,
self.feature_dtypes_,
self.has_missing_category_,
self.cat_missing_method,
)
if self.cat_missing_method == "convert":
df = _add_missing_categories(
df=df,
Expand All @@ -893,12 +899,14 @@ def _convert_from_pandas(
cat_missing_name=self.cat_missing_name,
categorical_format=self.categorical_format,
)
# there should be no missing categories after this
cat_missing_method_after_alignment = "fail"

X = tm.from_pandas(
df,
drop_first=self.drop_first,
categorical_format=self.categorical_format,
cat_missing_method=self.cat_missing_method,
cat_missing_method=cat_missing_method_after_alignment,
)

return X
Expand Down Expand Up @@ -2674,6 +2682,7 @@ def _set_up_and_check_fit_args(
include_intercept=False,
ensure_full_rank=self.drop_first,
categorical_format=self.categorical_format,
cat_missing_method=self.cat_missing_method,
interaction_separator=self.interaction_separator,
add_column_for_intercept=False,
context=context,
Expand All @@ -2700,10 +2709,17 @@ def _set_up_and_check_fit_args(
# Maybe TODO: expand categorical penalties with formulas

self.feature_dtypes_ = X.dtypes.to_dict()
self.has_missing_category_ = {
col: (self.cat_missing_method == "convert") and X[col].isna().any()
for col, dtype in self.feature_dtypes_.items()
if isinstance(dtype, pd.CategoricalDtype)
}

if any(X.dtypes == "category"):

def _expand_categorical_penalties(penalty, X, drop_first):
def _expand_categorical_penalties(
penalty, X, drop_first, has_missing_category
):
"""
If P1 or P2 has the same shape as X before expanding the
categoricals, we assume that the penalty at the location of
Expand All @@ -2727,19 +2743,29 @@ def _expand_categorical_penalties(penalty, X, drop_first):
chain.from_iterable(
[
elmt
for _ in dtype.categories[int(drop_first) :]
for _ in range(
len(dtype.categories)
+ has_missing_category[col]
- drop_first
)
]
if pd.api.types.is_categorical_dtype(dtype)
else [elmt]
for elmt, dtype in zip(penalty, X.dtypes)
for elmt, (col, dtype) in zip(
penalty, X.dtypes.items()
)
)
)
)
else:
return penalty

P1 = _expand_categorical_penalties(self.P1, X, self.drop_first)
P2 = _expand_categorical_penalties(self.P2, X, self.drop_first)
P1 = _expand_categorical_penalties(
self.P1, X, self.drop_first, self.has_missing_category_
)
P2 = _expand_categorical_penalties(
self.P2, X, self.drop_first, self.has_missing_category_
)

X = tm.from_pandas(
X,
Expand Down
22 changes: 21 additions & 1 deletion src/glum/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ def _asanyarray(x, **kwargs):
return x if pd.api.types.is_scalar(x) else np.asanyarray(x, **kwargs)


def _align_df_categories(df, dtypes) -> pd.DataFrame:
def _align_df_categories(
df, dtypes, has_missing_category, cat_missing_method
) -> pd.DataFrame:
"""Align data types for prediction.
This function checks that categorical columns have same categories in the
Expand All @@ -26,6 +28,8 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame:
----------
df : pandas.DataFrame
dtypes : Dict[str, Union[str, type, pandas.core.dtypes.base.ExtensionDtype]]
has_missing_category : Dict[str, bool]
cat_missing_method : str
"""
if not isinstance(df, pd.DataFrame):
raise TypeError(f"Expected `pandas.DataFrame'; got {type(df)}.")
Expand All @@ -47,6 +51,22 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame:
changed_dtypes[column] = df[column].cat.set_categories(
dtypes[column].categories
)
else:
continue

if cat_missing_method == "convert" and not has_missing_category[column]:
unseen_categories = set(df[column].unique()) - set(
dtypes[column].categories
)
else:
unseen_categories = set(df[column].dropna().unique()) - set(
dtypes[column].categories
)

if unseen_categories:
raise ValueError(
f"Column {column} contains unseen categories: {unseen_categories}."
)

if changed_dtypes:
df = df.assign(**changed_dtypes)
Expand Down
68 changes: 60 additions & 8 deletions tests/glm/test_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@


def get_small_x_y(
estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV]
estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV],
) -> tuple[np.ndarray, np.ndarray]:
if isinstance(estimator, GeneralizedLinearRegressor):
n_rows = 1
Expand Down Expand Up @@ -362,6 +362,43 @@ def test_P1_P2_expansion_with_categoricals():
np.testing.assert_allclose(mdl1.coef_, mdl2.coef_)


def test_P1_P2_expansion_with_categoricals_missings():
    """Compare fits with short and full-length P1/P2 under ``cat_missing_method="convert"``.

    A model given six-element penalties is used as the reference. Models given
    two-element penalties (one entry per data frame column), or a two-element
    P1 together with a six-entry sparse diagonal P2, must produce the same
    coefficients as the reference.
    """
    rng = np.random.default_rng(42)
    n_obs = 60

    # Category 0 is removed from the categorical; per the pandas documentation
    # for ``remove_categories``, values in a removed category become NaN.
    cat_column = pd.Categorical(rng.integers(5, size=n_obs)).remove_categories(0)
    X = pd.DataFrame(data={"dense": np.linspace(0, 10, n_obs), "cat": cat_column})
    y = rng.normal(size=n_obs)

    def _fit_and_get_coef(P1, P2):
        # Every model in this test shares all settings except the penalties.
        model = GeneralizedLinearRegressor(
            l1_ratio=0.01,
            P1=P1,
            P2=P2,
            cat_missing_method="convert",
        )
        model.fit(X, y)
        return model.coef_

    # Reference: six-element penalties.
    reference_coef = _fit_and_get_coef([1, 2, 2, 2, 2, 2], [2, 1, 1, 1, 1, 1])

    # Two-element penalties, one entry per data frame column.
    per_column_coef = _fit_and_get_coef([1, 2], [2, 1])
    np.testing.assert_allclose(reference_coef, per_column_coef)

    # Two-element P1 combined with a six-entry sparse diagonal P2.
    sparse_p2_coef = _fit_and_get_coef([1, 2], sparse.diags([2, 1, 1, 1, 1, 1]))
    np.testing.assert_allclose(reference_coef, sparse_p2_coef)


@pytest.mark.parametrize(
"estimator", [GeneralizedLinearRegressor, GeneralizedLinearRegressorCV]
)
Expand Down Expand Up @@ -3183,40 +3220,55 @@ def test_formula_predict(get_mixed_data, formula, fit_intercept):


@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"])
def test_cat_missing(cat_missing_method):
@pytest.mark.parametrize("unseen_missing", [False, True])
@pytest.mark.parametrize("formula", [None, "cat_1 + cat_2"])
def test_cat_missing(cat_missing_method, unseen_missing, formula):
X = pd.DataFrame(
{
"cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]),
"cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]),
}
)
if unseen_missing:
X = X.dropna()
X_unseen = pd.DataFrame(
{
"cat_1": pd.Categorical([1, pd.NA]),
"cat_2": pd.Categorical([1, 2]),
}
)
y = np.array([1, 2, 3, 4, 5])
y = np.array(X.index)

model = GeneralizedLinearRegressor(
family="normal",
cat_missing_method=cat_missing_method,
drop_first=False,
formula=formula,
fit_intercept=False,
)

if cat_missing_method == "fail":
with pytest.raises(ValueError):
if cat_missing_method == "fail" and not unseen_missing:
with pytest.raises(
ValueError, match="Categorical data can't have missing values"
):
model.fit(X, y)
else:
model.fit(X, y)
feature_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"]

if cat_missing_method == "convert":
if cat_missing_method == "convert" and not unseen_missing:
feature_names.insert(2, "cat_1[(MISSING)]")
feature_names.append("cat_2[(MISSING)]")

np.testing.assert_array_equal(model.feature_names_, feature_names)
assert len(model.coef_) == len(feature_names)

model.predict(X_unseen)
if cat_missing_method == "fail" and unseen_missing:
with pytest.raises(
ValueError, match="Categorical data can't have missing values"
):
model.predict(X_unseen)
elif cat_missing_method == "convert" and unseen_missing:
with pytest.raises(ValueError, match="contains unseen categories"):
model.predict(X_unseen)
else:
model.predict(X_unseen)
Loading

0 comments on commit 1ad8be2

Please sign in to comment.