From 960338715e5933a680ba3ede82c320d87e88f66d Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 17:31:03 +0100 Subject: [PATCH 01/10] set alpha=0 as default --- src/glum/_glm.py | 20 ++++---- tests/glm/test_distribution.py | 6 --- tests/glm/test_glm.py | 83 +++++++++++++++------------------- 3 files changed, 46 insertions(+), 63 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 312c243d..1dd9cb01 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -940,7 +940,7 @@ def _set_up_for_fit(self, y: np.ndarray) -> None: elif (self.lower_bounds is None) and (self.upper_bounds is None): if np.all(np.asarray(self.l1_ratio) == 0): self._solver = "irls-ls" - elif getattr(self, "alpha", 1) == 0 and not self.alpha_search: + elif getattr(self, "alpha", 0) == 0 and not self.alpha_search: self._solver = "irls-ls" else: self._solver = "irls-cd" @@ -2304,8 +2304,7 @@ def covariance_matrix( _expected_information = expected_information if ( - (hasattr(self, "alpha") and self.alpha is None) - or ( + ( hasattr(self, "alpha") and isinstance(self.alpha, (int, float)) and self.alpha > 0 @@ -2914,11 +2913,11 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): alpha : {float, array-like}, optional (default=None) Constant that multiplies the penalty terms and thus determines the regularization strength. If ``alpha_search`` is ``False`` (the default), - then ``alpha`` must be a scalar or None (equivalent to ``alpha=1.0``). + then ``alpha`` must be a scalar or None (equivalent to ``alpha=0``). If ``alpha_search`` is ``True``, then ``alpha`` must be an iterable or ``None``. See ``alpha_search`` to find how the regularization path is set if ``alpha`` is ``None``. See the notes for the exact mathematical - meaning of this parameter. ``alpha = 0`` is equivalent to unpenalized + meaning of this parameter. ``alpha=0`` is equivalent to unpenalized GLMs. In this case, the design matrix ``X`` must have full column rank (no collinearities). @@ -3146,10 +3145,11 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): drop_first : bool, optional (default = False) If ``True``, drop the first column when encoding categorical variables. - Set this to True when alpha=0 and solver='auto' to prevent an error due to a - singular feature matrix. In the case of using a formula with interactions, - setting this argument to ``True`` ensures structural full-rankness (it is - equivalent to ``ensure_full_rank`` in formulaic and tabmat). + Set this to True when ``alpha=0`` and ``solver='auto'`` to prevent an error + due to a singular feature matrix. In the case of using a formula with + interactions, setting this argument to ``True`` ensures structural + full-rankness (it is equivalent to ``ensure_full_rank`` in formulaic and + tabmat). robust : bool, optional (default = False) If true, then robust standard errors are computed by default. 
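The hunks above and below move the effective default from ``alpha=1.0`` to ``alpha=0``. A minimal sketch of what that means for callers, using made-up data (variable names are illustrative; with ``alpha=0`` the design matrix must have full column rank):

    import numpy as np

    from glum import GeneralizedLinearRegressor

    rng = np.random.default_rng(0)
    X = rng.normal(size=(50, 3))  # full column rank, as required when alpha=0
    y = X @ np.array([0.5, -1.0, 2.0]) + rng.normal(scale=0.1, size=50)

    # With this patch, omitting alpha fits an unpenalized GLM (alpha=0) ...
    unpenalized = GeneralizedLinearRegressor(family="normal").fit(X, y)

    # ... while the pre-change default (alpha=1.0) must now be passed explicitly.
    penalized = GeneralizedLinearRegressor(family="normal", alpha=1.0).fit(X, y)
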
@@ -3573,7 +3573,7 @@ def fit( self.coef_ = self.coef_path_[-1] else: if self.alpha is None: - _alpha = 1.0 + _alpha = 0.0 else: _alpha = self.alpha if _alpha > 0 and self.l1_ratio > 0 and self._solver != "irls-cd": diff --git a/tests/glm/test_distribution.py b/tests/glm/test_distribution.py index d241ff07..be2a694a 100644 --- a/tests/glm/test_distribution.py +++ b/tests/glm/test_distribution.py @@ -296,7 +296,6 @@ def test_poisson_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -7.390977 (df=1) regressor = GeneralizedLinearRegressor( - alpha=0, family="poisson", fit_intercept=False, gradient_tol=1e-8, @@ -345,7 +344,6 @@ def test_gamma_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -7.057068 (df=2) regressor = GeneralizedLinearRegressor( - alpha=0, family="gamma", fit_intercept=False, gradient_tol=1e-8, @@ -393,7 +391,6 @@ def test_gaussian_deviance_dispersion_loglihood(family, weighted): # logLik(glm_model) # -7.863404 (df=2) regressor = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=False, gradient_tol=1e-8, @@ -441,7 +438,6 @@ def test_tweedie_deviance_dispersion_loglihood(weighted): # logLiktweedie(glm_model) # -8.35485 regressor = GeneralizedLinearRegressor( - alpha=0, family=TweedieDistribution(1.5), fit_intercept=False, gradient_tol=1e-8, @@ -490,7 +486,6 @@ def test_binomial_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -3.365058 (df=1) regressor = GeneralizedLinearRegressor( - alpha=0, family="binomial", fit_intercept=False, gradient_tol=1e-8, @@ -535,7 +530,6 @@ def test_negative_binomial_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -4.187887 (df=1) regressor = GeneralizedLinearRegressor( - alpha=0, family="negative.binomial", fit_intercept=False, gradient_tol=1e-8, diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 469f464e..c742c5a7 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -203,7 +203,7 @@ def test_gradient_tol_setting(estimator, kwargs, solver, gradient_tol): ) def test_glm_family_argument(f, fam, y, X): """Test GLM family argument set as string.""" - glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + glm = GeneralizedLinearRegressor(family=f).fit(X, y) assert isinstance(glm._family_instance, fam.__class__) @@ -373,6 +373,7 @@ def test_P1_P2_expansion_with_categoricals_missings(): y = rng.normal(size=60) mdl1 = GeneralizedLinearRegressor( + alpha=1.0, l1_ratio=0.01, P1=[1, 2, 2, 2, 2, 2], P2=[2, 1, 1, 1, 1, 1], @@ -381,6 +382,7 @@ def test_P1_P2_expansion_with_categoricals_missings(): mdl1.fit(X, y) mdl2 = GeneralizedLinearRegressor( + alpha=1.0, l1_ratio=0.01, P1=[1, 2], P2=[2, 1], @@ -390,6 +392,7 @@ def test_P1_P2_expansion_with_categoricals_missings(): np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) mdl3 = GeneralizedLinearRegressor( + alpha=1.0, l1_ratio=0.01, P1=[1, 2], P2=sparse.diags([2, 1, 1, 1, 1, 1]), @@ -575,7 +578,6 @@ def test_glm_identity_regression(solver, fit_intercept, offset, convert_x_fn): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) + (0 if offset is None else offset) glm = GeneralizedLinearRegressor( - alpha=0, family="normal", link="identity", fit_intercept=fit_intercept, @@ -695,7 +697,6 @@ def test_x_not_modified_inplace(solver, fit_intercept, offset, convert_x_fn): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) + (0 if offset is None else offset) glm = GeneralizedLinearRegressor( - alpha=0, family="normal", link="identity", fit_intercept=fit_intercept, @@ -737,7 
+738,6 @@ def test_glm_identity_regression_categorical_data(solver, offset, convert_x_fn): y = np.dot(x_mat, coef) + (0 if offset is None else offset) glm = GeneralizedLinearRegressor( - alpha=0, family="normal", link="identity", fit_intercept=False, @@ -776,7 +776,6 @@ def test_glm_log_regression(family, solver, tol, fit_intercept, offset): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef) + (0 if offset is None else offset)) glm = GeneralizedLinearRegressor( - alpha=0, family=family, link="log", fit_intercept=fit_intercept, @@ -1250,7 +1249,6 @@ def test_binomial_cloglog_unregularized(solver): sm_fit = sm_glm.fit() glum_glm = GeneralizedLinearRegressor( - alpha=0, family="binomial", link="cloglog", solver=solver, @@ -1986,7 +1984,7 @@ def test_verbose(regression_data, capsys): def test_ols_std_errors(regression_data): X, y = regression_data - mdl = GeneralizedLinearRegressor(alpha=0, family="normal") + mdl = GeneralizedLinearRegressor(family="normal") mdl.fit(X=X, y=y) mdl_sm = sm.OLS(endog=y, exog=sm.add_constant(X)) @@ -2029,9 +2027,9 @@ def test_array_std_errors(regression_data, family, fit_intercept): sm_family = sm.families.Gaussian() dispersion = None - mdl = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=fit_intercept - ).fit(X=X, y=y) + mdl = GeneralizedLinearRegressor(family=family, fit_intercept=fit_intercept).fit( + X=X, y=y + ) if fit_intercept: mdl_sm = sm.GLM(endog=y, exog=sm.add_constant(X), family=sm_family) @@ -2063,7 +2061,7 @@ def test_array_std_errors(regression_data, family, fit_intercept): def test_sparse_std_errors(regression_data): X, y = regression_data sp_X = sparse.csc_matrix(X) - mdl = GeneralizedLinearRegressor(alpha=0, family="normal") + mdl = GeneralizedLinearRegressor(family="normal") mdl.fit(X=X, y=y) actual1 = mdl.std_errors(X=sp_X, y=y, robust=False) @@ -2105,9 +2103,7 @@ def test_inputtype_std_errors(regression_data, categorical, split, fit_intercept tm.CategoricalMatrix(pd.Categorical(group, categories=categories)), ] ) - mdl = GeneralizedLinearRegressor( - alpha=0, family="normal", fit_intercept=fit_intercept - ) + mdl = GeneralizedLinearRegressor(family="normal", fit_intercept=fit_intercept) mdl.fit(X=X, y=y) if isinstance(X, tm.MatrixBase): X_sm = X.toarray() @@ -2146,7 +2142,7 @@ def test_coef_table(regression_data, fit_intercept, confidence_level): X_df = pd.DataFrame(X, columns=colnames) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=fit_intercept + family="gaussian", fit_intercept=fit_intercept ).fit(X=X_df, y=y) if fit_intercept: @@ -2209,9 +2205,9 @@ def test_wald_test_matrix(regression_data, family, fit_intercept, R, r): sm_family = sm.families.Gaussian() dispersion = None - mdl = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=fit_intercept - ).fit(X=X, y=y) + mdl = GeneralizedLinearRegressor(family=family, fit_intercept=fit_intercept).fit( + X=X, y=y + ) if fit_intercept: mdl_sm = sm.GLM(endog=y, exog=sm.add_constant(X), family=sm_family) @@ -2283,9 +2279,9 @@ def test_wald_test_matrix(regression_data, family, fit_intercept, R, r): def test_wald_test_matrix_public(regression_data, R, r): X, y = regression_data - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True - ).fit(X=X, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True).fit( + X=X, y=y, store_covariance_matrix=True + ) assert mdl._wald_test_matrix(R, r) == mdl.wald_test(R=R, r=r) @@ -2306,9 +2302,9 
@@ def test_wald_test_matrix_public(regression_data, R, r): def test_wald_test_matrix_fixed_cov(regression_data, R, r): X, y = regression_data - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=False - ).fit(X=X, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=False).fit( + X=X, y=y, store_covariance_matrix=True + ) mdl_sm = sm.GLM(endog=y, exog=X, family=sm.families.Gaussian()) # Use the same covariance matrix for both so that we can use tighter tolerances @@ -2351,9 +2347,9 @@ def test_wald_test_feature_names(regression_data, names, R, r): X, y = regression_data X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True - ).fit(X=X_df, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True).fit( + X=X_df, y=y, store_covariance_matrix=True + ) feature_names_results = mdl._wald_test_feature_names(names, r) if r is not None: @@ -2392,9 +2388,9 @@ def test_wald_test_feature_names_public(regression_data, names, r): X, y = regression_data X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True - ).fit(X=X_df, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True).fit( + X=X_df, y=y, store_covariance_matrix=True + ) assert mdl._wald_test_feature_names(names, r) == mdl.wald_test(features=names, r=r) @@ -2449,7 +2445,7 @@ def test_wald_test_term_names(regression_data, names, R, r, r_feat): X_df = X_df[["col_1", "col_2"]].assign(term_3=pd.cut(X_df["col_3"], bins=5)) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True, drop_first=True + family="gaussian", fit_intercept=True, drop_first=True ).fit(X=X_df, y=y, store_covariance_matrix=True) term_names_results = mdl._wald_test_term_names(names, r) @@ -2515,7 +2511,7 @@ def test_wald_test_term_names_public(regression_data, names, R, r, r_feat): X_df = X_df[["col_1", "col_2"]].assign(term_3=pd.cut(X_df["col_3"], bins=5)) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True, drop_first=True + family="gaussian", fit_intercept=True, drop_first=True ).fit(X=X_df, y=y, store_covariance_matrix=True) term_names_results = mdl.wald_test(terms=names, r=r) @@ -2554,7 +2550,7 @@ def test_wald_test_formula(regression_data, formula, R, r_feat): X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True, drop_first=True + family="gaussian", fit_intercept=True, drop_first=True ).fit(X=X_df, y=y, store_covariance_matrix=True) term_names_results = mdl._wald_test_formula(formula) @@ -2599,7 +2595,7 @@ def test_wald_test_formula_public(regression_data, formula, R, r_feat): X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True, drop_first=True + family="gaussian", fit_intercept=True, drop_first=True ).fit(X=X_df, y=y, store_covariance_matrix=True) term_names_results = mdl.wald_test(formula=formula) @@ -2617,7 +2613,7 @@ def test_wald_test_formula_public(regression_data, formula, R, r_feat): def test_wald_test_raise_on_wrong_input(regression_data): X, y = regression_data - mdl = GeneralizedLinearRegressor(alpha=0, family="gaussian", 
fit_intercept=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True) mdl.fit(X=X, y=y) with pytest.raises(ValueError): @@ -2632,7 +2628,6 @@ def test_wald_test_raise_on_wrong_input(regression_data): @pytest.mark.parametrize("weighted", [False, True]) def test_score_method(as_data_frame, offset, weighted): regressor = GeneralizedLinearRegressor( - alpha=0, family="normal", fit_intercept=False, gradient_tol=1e-8, @@ -2666,7 +2661,7 @@ def test_score_method(as_data_frame, offset, weighted): def test_information_criteria(regression_data): X, y = regression_data - regressor = GeneralizedLinearRegressor(family="gaussian", alpha=0) + regressor = GeneralizedLinearRegressor(family="gaussian") regressor.fit(X, y) llf = regressor.family_instance.log_likelihood(y, regressor.predict(X)) @@ -2721,10 +2716,10 @@ def test_drop_first_allows_alpha_equals_0(): rng = np.random.default_rng(42) y = np.random.normal(size=10) X = pd.DataFrame(data={"cat": pd.Categorical(rng.integers(2, size=10))}) - regressor = GeneralizedLinearRegressor(alpha=0, drop_first=True) + regressor = GeneralizedLinearRegressor(drop_first=True) regressor.fit(X, y) - regressor = GeneralizedLinearRegressor(alpha=0) # default is False + regressor = GeneralizedLinearRegressor() # default is False with pytest.raises(np.linalg.LinAlgError): regressor.fit(X, y) @@ -2732,7 +2727,7 @@ def test_drop_first_allows_alpha_equals_0(): def test_dropping_distinct_categorical_column(): y = np.random.normal(size=10) X = pd.DataFrame(data={"cat": pd.Categorical(np.ones(10)), "num": np.ones(10)}) - regressor = GeneralizedLinearRegressor(alpha=0, drop_first=True) + regressor = GeneralizedLinearRegressor(drop_first=True) regressor.fit(X, y) assert regressor.coef_.shape == (1,) assert regressor.feature_names_ == ["num"] @@ -2769,7 +2764,6 @@ def test_store_covariance_matrix( regressor = GeneralizedLinearRegressor( family="gaussian", - alpha=0, robust=robust, expected_information=expected_information, ) @@ -2804,7 +2798,6 @@ def test_store_covariance_matrix_formula(regression_data, formula): regressor = GeneralizedLinearRegressor( formula=formula, family="gaussian", - alpha=0, ) regressor.fit(df, y, store_covariance_matrix=True) @@ -2827,7 +2820,6 @@ def test_store_covariance_matrix_formula_errors(regression_data): regressor = GeneralizedLinearRegressor( formula=formula, family="gaussian", - alpha=0, ) regressor.fit(df, y) with pytest.raises(ValueError, match="Either X and y must be provided"): @@ -2837,7 +2829,7 @@ def test_store_covariance_matrix_formula_errors(regression_data): def test_store_covariance_matrix_errors(regression_data): X, y = regression_data - regressor = GeneralizedLinearRegressor(family="gaussian", alpha=0) + regressor = GeneralizedLinearRegressor(family="gaussian") regressor.fit(X, y, store_covariance_matrix=False) with pytest.raises(ValueError, match="Either X and y must be provided"): @@ -3151,7 +3143,6 @@ def test_formula_against_smf(get_mixed_data, formula, fit_intercept): family="normal", drop_first=True, formula=formula, - alpha=0.0, fit_intercept=fit_intercept, ).fit(data) @@ -3174,7 +3165,6 @@ def test_formula_context(get_mixed_data): family="normal", drop_first=True, formula=formula, - alpha=0.0, fit_intercept=True, ).fit(data) @@ -3206,7 +3196,6 @@ def test_formula_predict(get_mixed_data, formula, fit_intercept): family="normal", drop_first=True, formula=formula, - alpha=0.0, fit_intercept=fit_intercept, ).fit(data) From e5ebcf2174d453d19cde829ae355895736d6a7d2 Mon Sep 17 00:00:00 2001 From: Matthias 
Schmidtblaicher Date: Wed, 31 Jan 2024 17:34:19 +0100 Subject: [PATCH 02/10] fix docstring --- src/glum/_glm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 1dd9cb01..b5ad316c 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -3146,9 +3146,9 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): drop_first : bool, optional (default = False) If ``True``, drop the first column when encoding categorical variables. Set this to True when ``alpha=0`` and ``solver='auto'`` to prevent an error - due to a singular feature matrix. In the case of using a formula with - interactions, setting this argument to ``True`` ensures structural - full-rankness (it is equivalent to ``ensure_full_rank`` in formulaic and + due to a singular feature matrix. In the case of using a formula with + interactions, setting this argument to ``True`` ensures structural + full-rankness (it is equivalent to ``ensure_full_rank`` in formulaic and tabmat). robust : bool, optional (default = False) From 55d9bb5617294d151b9a6a787a70da88d0b98779 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 19:07:06 +0100 Subject: [PATCH 03/10] add alpha where needed to avoid LinAlgError --- tests/glm/test_glm.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index c742c5a7..fe552d79 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -430,7 +430,7 @@ def test_glm_fit_intercept_argument(estimator, fit_intercept): ) def test_glm_solver_argument(estimator, solver, l1_ratio, y, X): """Test GLM for invalid solver argument.""" - glm = estimator(solver=solver, l1_ratio=l1_ratio) + glm = estimator(solver=solver, l1_ratio=l1_ratio, alpha=1.0) with pytest.raises(ValueError): glm.fit(X, y) @@ -479,7 +479,10 @@ def test_glm_warm_start_argument(estimator, warm_start): def test_glm_warm_start_with_constant_column(estimator): X, y = make_regression() X[:, 0] = 0 - glm = estimator(warm_start=True) + kwargs = {"warm_start": True} + if estimator == GeneralizedLinearRegressor: + kwargs["alpha"] = 1.0 + glm = estimator(**kwargs) glm.fit(X, y) glm.fit(X, y) @@ -1310,11 +1313,11 @@ def test_binomial_enet(alpha): @pytest.mark.parametrize( "params", [ - {"solver": "irls-ls"}, - {"solver": "lbfgs"}, - {"solver": "trust-constr"}, - {"solver": "irls-cd", "selection": "cyclic"}, - {"solver": "irls-cd", "selection": "random"}, + {"solver": "irls-ls", "alpha": 1.0}, + {"solver": "lbfgs", "alpha": 1.0}, + {"solver": "trust-constr", "alpha": 1.0}, + {"solver": "irls-cd", "selection": "cyclic", "alpha": 1.0}, + {"solver": "irls-cd", "selection": "random", "alpha": 1.0}, ], ids=lambda params: ", ".join(f"{key}={val}" for key, val in params.items()), ) @@ -1326,7 +1329,7 @@ def test_solver_equivalence(params, use_offset, regression_data): offset = np.random.random(len(y)) else: offset = None - est_ref = GeneralizedLinearRegressor(random_state=2) + est_ref = GeneralizedLinearRegressor(random_state=2, alpha=1.0) est_ref.fit(X, y, offset=offset) est_2 = GeneralizedLinearRegressor(**params) @@ -1801,7 +1804,7 @@ def test_passing_noncontiguous_as_X(): ) def test_feature_names_underscores(X, feature_names): model = GeneralizedLinearRegressor( - family="poisson", categorical_format="{name}__{category}" + family="poisson", categorical_format="{name}__{category}", alpha=1.0 ).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, 
"feature_names_", None), feature_names) @@ -1848,7 +1851,7 @@ def test_feature_names_underscores(X, feature_names): ) def test_feature_names_brackets(X, feature_names): model = GeneralizedLinearRegressor( - family="poisson", categorical_format="{name}[{category}]" + family="poisson", categorical_format="{name}[{category}]", alpha=1.0 ).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, "feature_names_", None), feature_names) @@ -1889,7 +1892,7 @@ def test_feature_names_brackets(X, feature_names): ], ) def test_term_names(X, term_names): - model = GeneralizedLinearRegressor(family="poisson").fit(X, np.arange(5)) + model = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, "term_names_", None), term_names) @@ -1905,7 +1908,7 @@ def test_term_names(X, term_names): ], ) def test_feature_dtypes(X, dtypes): - model = GeneralizedLinearRegressor(family="poisson").fit(X, np.arange(5)) + model = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, "feature_dtypes_", None), dtypes) @@ -1928,12 +1931,12 @@ def test_categorical_types(k, n): # use categorical types X_cat = pd.DataFrame({"group": pd.Categorical(group, categories=categories)}) - model_cat = GeneralizedLinearRegressor(family="poisson").fit(X_cat, y) + model_cat = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X_cat, y) pred_cat = model_cat.predict(X_cat) # use one-hot encoding X_oh = pd.get_dummies(X_cat, dtype=float) - model_oh = GeneralizedLinearRegressor(family="poisson").fit(X_oh, y) + model_oh = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X_oh, y) pred_oh = model_oh.predict(X_oh) # check predictions @@ -3014,6 +3017,7 @@ def test_formula(get_mixed_data, formula, drop_first, fit_intercept): formula=formula, fit_intercept=fit_intercept, categorical_format="{name}[T.{category}]", + alpha=1.0, ).fit(data) if fit_intercept: @@ -3033,6 +3037,7 @@ def test_formula(get_mixed_data, formula, drop_first, fit_intercept): drop_first=drop_first, fit_intercept=fit_intercept, categorical_format="{name}[T.{category}]", + alpha=1.0, ).fit(X_ext, y_ext) np.testing.assert_almost_equal(model_ext.coef_, model_formula.coef_) @@ -3083,6 +3088,7 @@ def test_formula_names_formulaic_style( formula=formula, categorical_format="{name}[T.{category}]", interaction_separator=":", + alpha=1.0, ).fit(data) np.testing.assert_array_equal(model_formula.feature_names_, feature_names) @@ -3120,6 +3126,7 @@ def test_formula_names_old_glum_style( formula=formula, categorical_format="{name}__{category}", interaction_separator="__x__", + alpha=1.0, ).fit(data) np.testing.assert_array_equal(model_formula.feature_names_, feature_names) @@ -3234,6 +3241,7 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula): drop_first=False, formula=formula, fit_intercept=False, + alpha=1.0, ) if cat_missing_method == "fail" and not unseen_missing: with pytest.raises( From e82f59731d32c3862f262502b336cf304bb2162e Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 19:12:06 +0100 Subject: [PATCH 04/10] add changelog entry --- CHANGELOG.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 72c6933d..0f7b3090 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,6 +10,10 @@ Changelog 3.0.0 - UNRELEASED ------------------ +**Breaking change:** + +- :class:`~glum.GeneralizedLinearRegressor`'s default value for `alpha` is now `0`, i.e. 
no regularization. + **New features:** - Added a formula interface for specifying models. From d70e2a655acd2d2c1112fe148761e0b12c8858b4 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 19:19:17 +0100 Subject: [PATCH 05/10] also set alpha in golden master --- tests/glm/test_glm.py | 5 ++++- tests/glm/test_golden_master.py | 19 +++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index fe552d79..08080212 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -430,7 +430,10 @@ def test_glm_fit_intercept_argument(estimator, fit_intercept): ) def test_glm_solver_argument(estimator, solver, l1_ratio, y, X): """Test GLM for invalid solver argument.""" - glm = estimator(solver=solver, l1_ratio=l1_ratio, alpha=1.0) + kwargs = {"solver": solver, "l1_ratio": l1_ratio} + if estimator == GeneralizedLinearRegressor: + kwargs["alpha"] = 1.0 + glm = estimator(**kwargs) with pytest.raises(ValueError): glm.fit(X, y) diff --git a/tests/glm/test_golden_master.py b/tests/glm/test_golden_master.py index ac00f91b..8626256d 100644 --- a/tests/glm/test_golden_master.py +++ b/tests/glm/test_golden_master.py @@ -100,25 +100,32 @@ def expected_all(): gm_model_parameters = { - "default": {}, # default params - "half-regularization": {"alpha": 0.5}, # regularization (other than alpha = 1) - "elastic-net": {"l1_ratio": 0.5}, # elastic-net - "lasso": {"l1_ratio": 1}, # lasso + "regularization": {"alpha": 1.0}, # regularization with alpha = 1 + "half-regularization": {"alpha": 0.5}, # regularization with alpha = 0 + "elastic-net": {"l1_ratio": 0.5, "alpha": 1.0}, # elastic-net + "lasso": {"l1_ratio": 1, "alpha": 1.0}, # lasso "variable_p1": { "l1_ratio": 1, "P1": np.arange(30) / 10, + "alpha": 1.0, }, # lasso with variable penalty "variable_p2": { "l1_ratio": 0, "P2": _make_P2(), + "alpha": 1.0, }, # ridge with Tikhonov regularization "variable_p1_p2": { "l1_ratio": 0.5, "P1": np.arange(30) / 10, "P2": _make_P2(), + "alpha": 1.0, }, # elastic net with P1 and P2 variable penalty - "fit_intercept": {"fit_intercept": False}, # do not fit the intercept - "bounds": {"lower_bounds": np.full(30, 0), "upper_bounds": np.full(30, 0.4)}, + "fit_intercept": {"fit_intercept": False, "alpha": 1.0}, # do not fit the intercept + "bounds": { + "lower_bounds": np.full(30, 0), + "upper_bounds": np.full(30, 0.4), + "alpha": 1.0, + }, } From 31b0b2e497c2f1e6ec0c5a623a473b5080e788cf Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 19:34:33 +0100 Subject: [PATCH 06/10] change name in persisted file too --- tests/glm/golden_master/simulation_gm.json | 40 +++++++++++----------- tests/glm/test_golden_master.py | 1 + 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/tests/glm/golden_master/simulation_gm.json b/tests/glm/golden_master/simulation_gm.json index 3c69f59b..660413de 100644 --- a/tests/glm/golden_master/simulation_gm.json +++ b/tests/glm/golden_master/simulation_gm.json @@ -1,6 +1,6 @@ { "normal": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.5027665204024282, 0.23449539956055546, @@ -36,7 +36,7 @@ "intercept_": 3.026490229054092, "n_iter_": 1 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.5012056522046088, 0.23528722263235485, @@ -72,7 +72,7 @@ "intercept_": 2.0279948791150764, "n_iter_": 1 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.49784759015593427, 0.23166926058137094, @@ -108,7 +108,7 @@ 
"intercept_": 2.981778440705444, "n_iter_": 1 }, - "default": { + "regularization": { "coef_": [ 0.4985676422254175, 0.22818569911229844, @@ -1478,7 +1478,7 @@ } }, "poisson": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.9604408672344522, 0.4432562524921413, @@ -1514,7 +1514,7 @@ "intercept_": 1.8189178943867188, "n_iter_": 6 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.9817372866211753, 0.49117907395980553, @@ -1550,7 +1550,7 @@ "intercept_": 1.157828764208921, "n_iter_": 6 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.9693196874148616, 0.46707910961062293, @@ -1586,7 +1586,7 @@ "intercept_": 1.8396971485658087, "n_iter_": 6 }, - "default": { + "regularization": { "coef_": [ 0.9821298947770232, 0.4937841900606277, @@ -2812,7 +2812,7 @@ } }, "gamma": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.4866808417045077, 0.1370793228217412, @@ -2848,7 +2848,7 @@ "intercept_": 5.268950639816242, "n_iter_": 4 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.48972345202083134, 0.24707128799109493, @@ -2884,7 +2884,7 @@ "intercept_": 2.512993119536852, "n_iter_": 4 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.5107634971640694, 0.1783139942111257, @@ -2920,7 +2920,7 @@ "intercept_": 5.272870219406924, "n_iter_": 4 }, - "default": { + "regularization": { "coef_": [ 0.4966531683982075, 0.24896254652599858, @@ -4146,7 +4146,7 @@ } }, "tweedie_p=1.5": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.8740584736837378, 0.39026903329437757, @@ -4182,7 +4182,7 @@ "intercept_": 2.8380327257627473, "n_iter_": 4 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.8592854961617753, 0.42694459825027725, @@ -4218,7 +4218,7 @@ "intercept_": 1.6496674803774887, "n_iter_": 4 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.8763610403720393, 0.4023951463085115, @@ -4254,7 +4254,7 @@ "intercept_": 2.7855262434295343, "n_iter_": 4 }, - "default": { + "regularization": { "coef_": [ 0.860178238544325, 0.43000049156945763, @@ -5480,7 +5480,7 @@ } }, "binomial": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.0645115293284631, 0.03563706184469416, @@ -5516,7 +5516,7 @@ "intercept_": 3.3761974509366994, "n_iter_": 3 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.06396142685405831, 0.03544619397195947, @@ -5552,7 +5552,7 @@ "intercept_": 2.007458821879875, "n_iter_": 2 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.059850128940604715, 0.029620907232596274, @@ -5588,7 +5588,7 @@ "intercept_": 3.4202998674202676, "n_iter_": 3 }, - "default": { + "regularization": { "coef_": [ 0.05979957149348005, 0.03233408720147587, diff --git a/tests/glm/test_golden_master.py b/tests/glm/test_golden_master.py index 8626256d..dc6d5646 100644 --- a/tests/glm/test_golden_master.py +++ b/tests/glm/test_golden_master.py @@ -100,6 +100,7 @@ def expected_all(): gm_model_parameters = { + # TODO add an unregularized case "regularization": {"alpha": 1.0}, # regularization with alpha = 1 "half-regularization": {"alpha": 0.5}, # regularization with alpha = 0 "elastic-net": {"l1_ratio": 0.5, "alpha": 1.0}, # elastic-net From 643b231a8aa54f4c710eba49343ec694014e1e32 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 19:45:54 +0100 Subject: [PATCH 07/10] set alpha in model_parameters again --- tests/glm/test_golden_master.py | 15 
+++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/glm/test_golden_master.py b/tests/glm/test_golden_master.py index dc6d5646..716edfff 100644 --- a/tests/glm/test_golden_master.py +++ b/tests/glm/test_golden_master.py @@ -215,13 +215,13 @@ def test_gm_storage(distribution, data_all_storage, expected_all): model = fit_model( data=data, family=distribution, - model_parameters={}, + model_parameters={"alpha": 1.0}, use_weights=False, use_offset=False, cv=False, ) - run_name = "default" + run_name = "regularization" expected = expected_all[distribution][run_name] assert_gm_allclose(model, expected) @@ -234,7 +234,7 @@ def test_gm_custom_link(family_link, use_weights, use_offset, data_all, expected """Currently only testing log-linear model.""" distribution, link = family_link data = data_all[distribution] - model_parameters = {"link": link} + model_parameters = {"link": link, "alpha": 1.0} model = fit_model( data=data, family=distribution, @@ -265,9 +265,7 @@ def test_gm_approx_hessian( distribution, use_weights, use_offset, data_all, expected_all ): data = data_all[distribution] - model_parameters = { - "hessian_approx": 0.1, - } + model_parameters = {"hessian_approx": 0.1, "alpha": 1.0} model = fit_model( data=data, family=distribution, @@ -277,7 +275,7 @@ def test_gm_approx_hessian( cv=False, ) - run_name = "default" + run_name = "regularization" if use_weights: run_name = f"{run_name}_weights" if use_offset: @@ -298,6 +296,7 @@ def test_gm_cv(distribution, data_all, expected_all): "alphas": [0.1, 0.05, 0.01], "l1_ratio": [0.2, 0.5, 0.9], "cv": 3, + "alpha": 1.0, } model = fit_model( data=data, @@ -453,7 +452,7 @@ def run_and_store_golden_master( for use_offset in [True, False]: gm_dict = run_and_store_golden_master( distribution=dist, - model_parameters={"link": link}, + model_parameters={"link": link, "alpha": 1.0}, run_name=f"custom-{dist}-{link}", use_weights=use_weights, use_offset=use_offset, From 86eeb5e9146bfb1c5f258e6a5ad9555d9f9b02e6 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 20:04:59 +0100 Subject: [PATCH 08/10] don't modify case of no alpha attribute, which is RegressorCV --- src/glum/_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index b5ad316c..9c9fb53d 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -940,7 +940,7 @@ def _set_up_for_fit(self, y: np.ndarray) -> None: elif (self.lower_bounds is None) and (self.upper_bounds is None): if np.all(np.asarray(self.l1_ratio) == 0): self._solver = "irls-ls" - elif getattr(self, "alpha", 0) == 0 and not self.alpha_search: + elif getattr(self, "alpha", 1) == 0 and not self.alpha_search: self._solver = "irls-ls" else: self._solver = "irls-cd" From 012c90402ffcb177d0146dbae2eca9dffe3ab110 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 20:05:56 +0100 Subject: [PATCH 09/10] remove invalid alpha argument --- tests/glm/test_golden_master.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/glm/test_golden_master.py b/tests/glm/test_golden_master.py index 716edfff..d58a45f6 100644 --- a/tests/glm/test_golden_master.py +++ b/tests/glm/test_golden_master.py @@ -296,7 +296,6 @@ def test_gm_cv(distribution, data_all, expected_all): "alphas": [0.1, 0.05, 0.01], "l1_ratio": [0.2, 0.5, 0.9], "cv": 3, - "alpha": 1.0, } model = fit_model( data=data, From f9052c92e45915ab28eeb1a112e4ff099e588cd2 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Thu, 1 Feb 2024 09:04:41 
+0100 Subject: [PATCH 10/10] wording --- tests/glm/test_golden_master.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/glm/test_golden_master.py b/tests/glm/test_golden_master.py index d58a45f6..4f70c6a7 100644 --- a/tests/glm/test_golden_master.py +++ b/tests/glm/test_golden_master.py @@ -101,8 +101,8 @@ def expected_all(): gm_model_parameters = { # TODO add an unregularized case - "regularization": {"alpha": 1.0}, # regularization with alpha = 1 - "half-regularization": {"alpha": 0.5}, # regularization with alpha = 0 + "regularization": {"alpha": 1.0}, # default prior to v3 + "half-regularization": {"alpha": 0.5}, # regularization with alpha = 0.5 "elastic-net": {"l1_ratio": 0.5, "alpha": 1.0}, # elastic-net "lasso": {"l1_ratio": 1, "alpha": 1.0}, # lasso "variable_p1": {
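
For context on why so many tests in this series now pass ``alpha=1.0`` or ``drop_first=True``: once the default is ``alpha=0`` (no penalty), a dummy-encoded categorical together with an intercept is collinear and the unpenalized solve fails. A sketch adapted from ``test_drop_first_allows_alpha_equals_0`` above:

    import numpy as np
    import pandas as pd

    from glum import GeneralizedLinearRegressor

    rng = np.random.default_rng(42)
    y = rng.normal(size=10)
    X = pd.DataFrame({"cat": pd.Categorical(rng.integers(2, size=10))})

    # Under the new default (alpha=0), both dummy columns of "cat" plus the
    # intercept are collinear, so fitting without drop_first raises
    # np.linalg.LinAlgError:
    # GeneralizedLinearRegressor().fit(X, y)

    # Dropping the first category restores full column rank, so this fits.
    GeneralizedLinearRegressor(drop_first=True).fit(X, y)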