From 3a30eb092870f6a094da541bafd0669dba036210 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 23 Jan 2024 04:18:48 +0100 Subject: [PATCH] Properly handle missings when checking for unseen --- src/tabmat/formula.py | 19 +++++++++++++------ tests/test_formula.py | 19 ++++++++++++++++++- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 07d28deb..f6db3586 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -429,7 +429,7 @@ def from_categorical( reduced_rank: bool, missing_method: str = "fail", missing_name: str = "(MISSING)", - force_convert: bool = False, + add_category_for_nan: bool = False, ) -> "_InteractableCategoricalVector": """Create an interactable categorical vector from a pandas categorical.""" categories = list(cat.categories) @@ -446,7 +446,7 @@ def from_categorical( "if cat_missing_method='fail'." ) - if missing_method == "convert" and (-1 in codes or force_convert): + if missing_method == "convert" and (-1 in codes or add_category_for_nan): codes[codes == -1] = len(categories) categories.append(missing_name) @@ -723,10 +723,15 @@ def encode_contrasts( order to avoid spanning the intercept. """ levels = levels if levels is not None else _state.get("categories") - force_convert = _state.get("force_convert", False) + add_category_for_nan = _state.get("add_category_for_nan", False) + # Check for unseen categories when levels are specified if levels is not None: - unseen_categories = set(data.dropna().unique()) - set(levels) + if missing_method == "convert" and not add_category_for_nan: + unseen_categories = set(data.unique()) - set(levels) + else: + unseen_categories = set(data.dropna().unique()) - set(levels) + if unseen_categories: raise ValueError( f"Column {data.name} contains unseen categories: {unseen_categories}." @@ -734,14 +739,16 @@ def encode_contrasts( cat = pandas.Categorical(data._values, categories=levels) _state["categories"] = cat.categories - _state["force_convert"] = missing_method == "convert" and cat.isna().any() + _state["add_category_for_nan"] = add_category_for_nan or ( + missing_method == "convert" and cat.isna().any() + ) return _InteractableCategoricalVector.from_categorical( cat, reduced_rank=reduced_rank, missing_method=missing_method, missing_name=missing_name, - force_convert=force_convert, + add_category_for_nan=add_category_for_nan, ) diff --git a/tests/test_formula.py b/tests/test_formula.py index 1319bdda..a7b849d9 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -747,7 +747,7 @@ def test_cat_missing_interactions(): @pytest.mark.parametrize( - "cat_missing_method", ["zero", "convert"], ids=["zero", "convert"] + "cat_missing_method", ["zero", "convert", "fail"], ids=["zero", "convert", "fail"] ) def test_unseen_category(cat_missing_method): df = pd.DataFrame( @@ -768,6 +768,23 @@ def test_unseen_category(cat_missing_method): result_seen.model_spec.get_model_matrix(df_unseen) +def test_unseen_missing_convert(): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b"]), + } + ) + df_unseen = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", pd.NA]), + } + ) + result_seen = tm.from_formula("cat_1 - 1", df, cat_missing_method="convert") + + with pytest.raises(ValueError, match="contains unseen categories"): + result_seen.model_spec.get_model_matrix(df_unseen) + + # Tests from formulaic's test suite # ---------------------------------