Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Informative error when encountering categories that were not seen in training #748

Merged
merged 27 commits into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
4d57434
drop missings not seen in training
MatthiasSchmidtblaicherQC Jan 10, 2024
34ad571
zero not drop
MatthiasSchmidtblaicherQC Jan 10, 2024
fedd9b1
better (?) name [skip ci]
MatthiasSchmidtblaicherQC Jan 10, 2024
6d2b431
catch case of unseen missings and fail method
MatthiasSchmidtblaicherQC Jan 10, 2024
0aaf521
fix
MatthiasSchmidtblaicherQC Jan 10, 2024
dcdb326
respect categorical missing method with formula; test different categ…
MatthiasSchmidtblaicherQC Jan 11, 2024
24bfb37
shorten the tests
MatthiasSchmidtblaicherQC Jan 11, 2024
ca19b68
dont allow fitting in case of conversion of categoricals and presence…
MatthiasSchmidtblaicherQC Jan 11, 2024
74a5329
clearer error msg
MatthiasSchmidtblaicherQC Jan 11, 2024
e278604
also change the error msg in the regex (facepalm)
MatthiasSchmidtblaicherQC Jan 11, 2024
ab5526c
remove matches
MatthiasSchmidtblaicherQC Jan 11, 2024
ca93be8
fix
MatthiasSchmidtblaicherQC Jan 11, 2024
5e75f78
better name
MatthiasSchmidtblaicherQC Jan 11, 2024
ee05d5d
Merge branch 'glum-v3' into convert-nas-unseen
MatthiasSchmidtblaicherQC Jan 11, 2024
b855758
Merge branch 'glum-v3' into convert-nas-unseen
MatthiasSchmidtblaicherQC Jan 15, 2024
c2d88b2
describe more restrictive behavior in tutorial
MatthiasSchmidtblaicherQC Jan 15, 2024
fc6a08a
Merge branch 'glum-v3' into convert-nas-unseen
MatthiasSchmidtblaicherQC Jan 15, 2024
8a0771a
Merge branch 'glum-v3' into convert-nas-unseen
MatthiasSchmidtblaicherQC Jan 22, 2024
fb59cfc
Raise error on unseen levels when predicting
stanmart Jan 22, 2024
1618707
Allow cat_missing_method='convert' again
stanmart Jan 22, 2024
c448f3d
Update test
stanmart Jan 23, 2024
046d9ff
Check for unseen categories
stanmart Jan 23, 2024
39ce302
Adapt align_df_categories tests to changes
stanmart Jan 23, 2024
099f362
Make pre-commit happy
stanmart Jan 23, 2024
056bf68
Avoid unnecessary work
stanmart Jan 25, 2024
0b666ee
Correctly expand penalties with categoricals and `cat_missing_method=…
stanmart Jan 29, 2024
2fcbc9b
bump tabmat pre-release version
MatthiasSchmidtblaicherQC Jan 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions src/glum/_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,12 +879,18 @@ def _convert_from_pandas(
self, df: pd.DataFrame, context: Optional[Mapping[str, Any]] = None
) -> tm.MatrixBase:
"""Convert a pandas data frame to a tabmat matrix."""

if hasattr(self, "X_model_spec_"):
return self.X_model_spec_.get_model_matrix(df, context=context)

cat_missing_method_after_alignment = self.cat_missing_method

if hasattr(self, "feature_dtypes_"):
df = _align_df_categories(df, self.feature_dtypes_)
df = _align_df_categories(
df,
self.feature_dtypes_,
self.has_missing_category_,
self.cat_missing_method,
)
if self.cat_missing_method == "convert":
df = _add_missing_categories(
df=df,
Expand All @@ -893,12 +899,14 @@ def _convert_from_pandas(
cat_missing_name=self.cat_missing_name,
categorical_format=self.categorical_format,
)
# there should be no missing categories after this
cat_missing_method_after_alignment = "fail"

X = tm.from_pandas(
df,
drop_first=self.drop_first,
categorical_format=self.categorical_format,
cat_missing_method=self.cat_missing_method,
cat_missing_method=cat_missing_method_after_alignment,
)

return X
Expand Down Expand Up @@ -2674,6 +2682,7 @@ def _set_up_and_check_fit_args(
include_intercept=False,
ensure_full_rank=self.drop_first,
categorical_format=self.categorical_format,
cat_missing_method=self.cat_missing_method,
interaction_separator=self.interaction_separator,
add_column_for_intercept=False,
context=context,
Expand All @@ -2700,6 +2709,10 @@ def _set_up_and_check_fit_args(
# Maybe TODO: expand categorical penalties with formulas

self.feature_dtypes_ = X.dtypes.to_dict()
self.has_missing_category_ = {
col: (self.cat_missing_method == "convert") and X[col].isna().any()
for col in self.feature_dtypes_.keys()
}

if any(X.dtypes == "category"):

Expand Down
22 changes: 21 additions & 1 deletion src/glum/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ def _asanyarray(x, **kwargs):
return x if pd.api.types.is_scalar(x) else np.asanyarray(x, **kwargs)


def _align_df_categories(df, dtypes) -> pd.DataFrame:
def _align_df_categories(
df, dtypes, has_missing_category, cat_missing_method
) -> pd.DataFrame:
"""Align data types for prediction.

This function checks that categorical columns have same categories in the
Expand All @@ -26,6 +28,8 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame:
----------
df : pandas.DataFrame
dtypes : Dict[str, Union[str, type, pandas.core.dtypes.base.ExtensionDtype]]
has_missing_category : Dict[str, bool]
cat_missing_method : str
"""
if not isinstance(df, pd.DataFrame):
raise TypeError(f"Expected `pandas.DataFrame'; got {type(df)}.")
Expand All @@ -47,6 +51,22 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame:
changed_dtypes[column] = df[column].cat.set_categories(
dtypes[column].categories
)
else:
continue

if cat_missing_method == "convert" and not has_missing_category[column]:
unseen_categories = set(df[column].unique()) - set(
dtypes[column].categories
)
else:
unseen_categories = set(df[column].dropna().unique()) - set(
dtypes[column].categories
)

if unseen_categories:
raise ValueError(
f"Column {column} contains unseen categories: {unseen_categories}."
)

if changed_dtypes:
df = df.assign(**changed_dtypes)
Expand Down
29 changes: 22 additions & 7 deletions tests/glm/test_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3183,40 +3183,55 @@ def test_formula_predict(get_mixed_data, formula, fit_intercept):


@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"])
def test_cat_missing(cat_missing_method):
@pytest.mark.parametrize("unseen_missing", [False, True])
@pytest.mark.parametrize("formula", [None, "cat_1 + cat_2"])
def test_cat_missing(cat_missing_method, unseen_missing, formula):
X = pd.DataFrame(
{
"cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]),
"cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]),
}
)
if unseen_missing:
X = X.dropna()
X_unseen = pd.DataFrame(
{
"cat_1": pd.Categorical([1, pd.NA]),
"cat_2": pd.Categorical([1, 2]),
}
)
y = np.array([1, 2, 3, 4, 5])
y = np.array(X.index)

model = GeneralizedLinearRegressor(
family="normal",
cat_missing_method=cat_missing_method,
drop_first=False,
formula=formula,
fit_intercept=False,
)

if cat_missing_method == "fail":
with pytest.raises(ValueError):
if cat_missing_method == "fail" and not unseen_missing:
with pytest.raises(
ValueError, match="Categorical data can't have missing values"
):
model.fit(X, y)
else:
model.fit(X, y)
feature_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"]

if cat_missing_method == "convert":
if cat_missing_method == "convert" and not unseen_missing:
feature_names.insert(2, "cat_1[(MISSING)]")
feature_names.append("cat_2[(MISSING)]")

np.testing.assert_array_equal(model.feature_names_, feature_names)
assert len(model.coef_) == len(feature_names)

model.predict(X_unseen)
if cat_missing_method == "fail" and unseen_missing:
with pytest.raises(
ValueError, match="Categorical data can't have missing values"
):
model.predict(X_unseen)
elif cat_missing_method == "convert" and unseen_missing:
with pytest.raises(ValueError, match="contains unseen categories"):
model.predict(X_unseen)
else:
model.predict(X_unseen)
88 changes: 79 additions & 9 deletions tests/glm/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,15 @@ def df():
"x5": ["a", "b"],
"x6": pd.Categorical(["a", "b"]),
"x7": pd.Categorical(["a", "b"], categories=["b", "a"]),
"x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
}
)


def test_align_df_categories_numeric(df):
dtypes = {column: np.float64 for column in df}
has_missing_category = {column: False for column in df}
missing_method = "fail"

expected = pd.DataFrame(
{
Expand All @@ -32,33 +35,41 @@ def test_align_df_categories_numeric(df):
"x5": ["a", "b"],
"x6": pd.Categorical(["a", "b"]),
"x7": pd.Categorical(["a", "b"], categories=["b", "a"]),
"x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
}
)

pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected)
pd.testing.assert_frame_equal(
_align_df_categories(df, dtypes, has_missing_category, missing_method), expected
)


def test_align_df_categories_categorical(df):
df = df[["x5", "x6", "x7", "x8"]]
dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df}
has_missing_category = {column: False for column in df}
missing_method = "fail"

expected = pd.DataFrame(
{
"x1": [np.nan, np.nan],
"x2": [np.nan, np.nan],
"x3": [np.nan, np.nan],
"x4": [np.nan, np.nan],
"x5": pd.Categorical(["a", "b"]),
"x6": pd.Categorical(["a", "b"]),
"x7": pd.Categorical(["a", "b"]),
"x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
},
dtype=pd.CategoricalDtype(["a", "b"]),
)

pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected)
pd.testing.assert_frame_equal(
_align_df_categories(df, dtypes, has_missing_category, missing_method),
expected,
)


def test_align_df_categories_excess_columns(df):
dtypes = {"x1": np.float64}
has_missing_category = {column: False for column in df}
missing_method = "fail"

expected = pd.DataFrame(
{
Expand All @@ -69,14 +80,19 @@ def test_align_df_categories_excess_columns(df):
"x5": ["a", "b"],
"x6": pd.Categorical(["a", "b"]),
"x7": pd.Categorical(["a", "b"], categories=["b", "a"]),
"x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
}
)

pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected)
pd.testing.assert_frame_equal(
_align_df_categories(df, dtypes, has_missing_category, missing_method), expected
)


def test_align_df_categories_missing_columns(df):
dtypes = {"x0": np.float64}
has_missing_category = {column: False for column in df}
missing_method = "fail"

expected = pd.DataFrame(
{
Expand All @@ -87,15 +103,69 @@ def test_align_df_categories_missing_columns(df):
"x5": ["a", "b"],
"x6": pd.Categorical(["a", "b"]),
"x7": pd.Categorical(["a", "b"], categories=["b", "a"]),
"x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
}
)

pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected)
pd.testing.assert_frame_equal(
_align_df_categories(df, dtypes, has_missing_category, missing_method), expected
)


@pytest.mark.parametrize("has_missings", [False, True])
def test_align_df_categories_convert(df, has_missings):
    """Exercise ``_align_df_categories`` with ``cat_missing_method="convert"``.

    Only the columns ``x5`` to ``x8`` of the fixture are used, and each is
    mapped to a categorical dtype with categories ``["a", "b"]``.

    * ``has_missings=True``: every column is flagged as having a missing
      category, and the aligned frame is compared against ``expected``.
    * ``has_missings=False``: the call is expected to raise a ``ValueError``
      whose message matches "contains unseen categories".
    """
    columns = ["x5", "x6", "x7", "x8"]
    df = df[columns]
    dtypes = {name: pd.CategoricalDtype(["a", "b"]) for name in df}
    has_missing_category = {name: has_missings for name in df}
    missing_method = "convert"

    # Built before branching, exactly as in the original, even though only the
    # ``has_missings`` branch compares against it.
    expected = pd.DataFrame(
        {
            "x5": pd.Categorical(["a", "b"]),
            "x6": pd.Categorical(["a", "b"]),
            "x7": pd.Categorical(["a", "b"]),
            "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]),
        },
        dtype=pd.CategoricalDtype(["a", "b"]),
    )

    def align():
        # Same call for both branches; the frame is re-selected on each call.
        return _align_df_categories(
            df[columns],
            dtypes,
            has_missing_category,
            missing_method,
        )

    if not has_missings:
        with pytest.raises(ValueError, match="contains unseen categories"):
            align()
        return

    pd.testing.assert_frame_equal(align(), expected)


def test_align_df_categories_raise_on_unseen(df):
    """Expect a ``ValueError`` for unseen categories with method ``"fail"``.

    Every column of the fixture is mapped to a categorical dtype with
    categories ``["a", "b"]`` and no column is flagged as having a missing
    category; the call must raise with a message matching
    "contains unseen categories".
    """
    target_dtypes = {name: pd.CategoricalDtype(["a", "b"]) for name in df}
    no_missing_category = {name: False for name in df}

    with pytest.raises(ValueError, match="contains unseen categories"):
        _align_df_categories(df, target_dtypes, no_missing_category, "fail")


def test_align_df_categories_not_df():
with pytest.raises(TypeError):
_align_df_categories(np.array([[0], [1]]), {"x0": np.float64})
_align_df_categories(np.array([[0], [1]]), {"x0": np.float64}, {}, "fail")


@pytest.fixture()
Expand Down
Loading