From 960338715e5933a680ba3ede82c320d87e88f66d Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 17:31:03 +0100 Subject: [PATCH 01/10] set alpha=0 as default --- src/glum/_glm.py | 20 ++++---- tests/glm/test_distribution.py | 6 --- tests/glm/test_glm.py | 83 +++++++++++++++------------------- 3 files changed, 46 insertions(+), 63 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 312c243d..1dd9cb01 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -940,7 +940,7 @@ def _set_up_for_fit(self, y: np.ndarray) -> None: elif (self.lower_bounds is None) and (self.upper_bounds is None): if np.all(np.asarray(self.l1_ratio) == 0): self._solver = "irls-ls" - elif getattr(self, "alpha", 1) == 0 and not self.alpha_search: + elif getattr(self, "alpha", 0) == 0 and not self.alpha_search: self._solver = "irls-ls" else: self._solver = "irls-cd" @@ -2304,8 +2304,7 @@ def covariance_matrix( _expected_information = expected_information if ( - (hasattr(self, "alpha") and self.alpha is None) - or ( + ( hasattr(self, "alpha") and isinstance(self.alpha, (int, float)) and self.alpha > 0 @@ -2914,11 +2913,11 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): alpha : {float, array-like}, optional (default=None) Constant that multiplies the penalty terms and thus determines the regularization strength. If ``alpha_search`` is ``False`` (the default), - then ``alpha`` must be a scalar or None (equivalent to ``alpha=1.0``). + then ``alpha`` must be a scalar or None (equivalent to ``alpha=0``). If ``alpha_search`` is ``True``, then ``alpha`` must be an iterable or ``None``. See ``alpha_search`` to find how the regularization path is set if ``alpha`` is ``None``. See the notes for the exact mathematical - meaning of this parameter. ``alpha = 0`` is equivalent to unpenalized + meaning of this parameter. ``alpha=0`` is equivalent to unpenalized GLMs. In this case, the design matrix ``X`` must have full column rank (no collinearities). @@ -3146,10 +3145,11 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): drop_first : bool, optional (default = False) If ``True``, drop the first column when encoding categorical variables. - Set this to True when alpha=0 and solver='auto' to prevent an error due to a - singular feature matrix. In the case of using a formula with interactions, - setting this argument to ``True`` ensures structural full-rankness (it is - equivalent to ``ensure_full_rank`` in formulaic and tabmat). + Set this to True when ``alpha=0`` and ``solver='auto'`` to prevent an error + due to a singular feature matrix. In the case of using a formula with + interactions, setting this argument to ``True`` ensures structural + full-rankness (it is equivalent to ``ensure_full_rank`` in formulaic and + tabmat). robust : bool, optional (default = False) If true, then robust standard errors are computed by default. 
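The hunks above and below move the effective default from ``alpha=1.0`` to ``alpha=0``. A minimal sketch of what that means for callers, using made-up data (variable names are illustrative; with ``alpha=0`` the design matrix must have full column rank):

    import numpy as np

    from glum import GeneralizedLinearRegressor

    rng = np.random.default_rng(0)
    X = rng.normal(size=(50, 3))  # full column rank, as required when alpha=0
    y = X @ np.array([0.5, -1.0, 2.0]) + rng.normal(scale=0.1, size=50)

    # With this patch, omitting alpha fits an unpenalized GLM (alpha=0) ...
    unpenalized = GeneralizedLinearRegressor(family="normal").fit(X, y)

    # ... while the pre-change default (alpha=1.0) must now be passed explicitly.
    penalized = GeneralizedLinearRegressor(family="normal", alpha=1.0).fit(X, y)
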
@@ -3573,7 +3573,7 @@ def fit( self.coef_ = self.coef_path_[-1] else: if self.alpha is None: - _alpha = 1.0 + _alpha = 0.0 else: _alpha = self.alpha if _alpha > 0 and self.l1_ratio > 0 and self._solver != "irls-cd": diff --git a/tests/glm/test_distribution.py b/tests/glm/test_distribution.py index d241ff07..be2a694a 100644 --- a/tests/glm/test_distribution.py +++ b/tests/glm/test_distribution.py @@ -296,7 +296,6 @@ def test_poisson_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -7.390977 (df=1) regressor = GeneralizedLinearRegressor( - alpha=0, family="poisson", fit_intercept=False, gradient_tol=1e-8, @@ -345,7 +344,6 @@ def test_gamma_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -7.057068 (df=2) regressor = GeneralizedLinearRegressor( - alpha=0, family="gamma", fit_intercept=False, gradient_tol=1e-8, @@ -393,7 +391,6 @@ def test_gaussian_deviance_dispersion_loglihood(family, weighted): # logLik(glm_model) # -7.863404 (df=2) regressor = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=False, gradient_tol=1e-8, @@ -441,7 +438,6 @@ def test_tweedie_deviance_dispersion_loglihood(weighted): # logLiktweedie(glm_model) # -8.35485 regressor = GeneralizedLinearRegressor( - alpha=0, family=TweedieDistribution(1.5), fit_intercept=False, gradient_tol=1e-8, @@ -490,7 +486,6 @@ def test_binomial_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -3.365058 (df=1) regressor = GeneralizedLinearRegressor( - alpha=0, family="binomial", fit_intercept=False, gradient_tol=1e-8, @@ -535,7 +530,6 @@ def test_negative_binomial_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -4.187887 (df=1) regressor = GeneralizedLinearRegressor( - alpha=0, family="negative.binomial", fit_intercept=False, gradient_tol=1e-8, diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 469f464e..c742c5a7 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -203,7 +203,7 @@ def test_gradient_tol_setting(estimator, kwargs, solver, gradient_tol): ) def test_glm_family_argument(f, fam, y, X): """Test GLM family argument set as string.""" - glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + glm = GeneralizedLinearRegressor(family=f).fit(X, y) assert isinstance(glm._family_instance, fam.__class__) @@ -373,6 +373,7 @@ def test_P1_P2_expansion_with_categoricals_missings(): y = rng.normal(size=60) mdl1 = GeneralizedLinearRegressor( + alpha=1.0, l1_ratio=0.01, P1=[1, 2, 2, 2, 2, 2], P2=[2, 1, 1, 1, 1, 1], @@ -381,6 +382,7 @@ def test_P1_P2_expansion_with_categoricals_missings(): mdl1.fit(X, y) mdl2 = GeneralizedLinearRegressor( + alpha=1.0, l1_ratio=0.01, P1=[1, 2], P2=[2, 1], @@ -390,6 +392,7 @@ def test_P1_P2_expansion_with_categoricals_missings(): np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) mdl3 = GeneralizedLinearRegressor( + alpha=1.0, l1_ratio=0.01, P1=[1, 2], P2=sparse.diags([2, 1, 1, 1, 1, 1]), @@ -575,7 +578,6 @@ def test_glm_identity_regression(solver, fit_intercept, offset, convert_x_fn): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) + (0 if offset is None else offset) glm = GeneralizedLinearRegressor( - alpha=0, family="normal", link="identity", fit_intercept=fit_intercept, @@ -695,7 +697,6 @@ def test_x_not_modified_inplace(solver, fit_intercept, offset, convert_x_fn): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) + (0 if offset is None else offset) glm = GeneralizedLinearRegressor( - alpha=0, family="normal", link="identity", fit_intercept=fit_intercept, @@ -737,7 
+738,6 @@ def test_glm_identity_regression_categorical_data(solver, offset, convert_x_fn): y = np.dot(x_mat, coef) + (0 if offset is None else offset) glm = GeneralizedLinearRegressor( - alpha=0, family="normal", link="identity", fit_intercept=False, @@ -776,7 +776,6 @@ def test_glm_log_regression(family, solver, tol, fit_intercept, offset): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef) + (0 if offset is None else offset)) glm = GeneralizedLinearRegressor( - alpha=0, family=family, link="log", fit_intercept=fit_intercept, @@ -1250,7 +1249,6 @@ def test_binomial_cloglog_unregularized(solver): sm_fit = sm_glm.fit() glum_glm = GeneralizedLinearRegressor( - alpha=0, family="binomial", link="cloglog", solver=solver, @@ -1986,7 +1984,7 @@ def test_verbose(regression_data, capsys): def test_ols_std_errors(regression_data): X, y = regression_data - mdl = GeneralizedLinearRegressor(alpha=0, family="normal") + mdl = GeneralizedLinearRegressor(family="normal") mdl.fit(X=X, y=y) mdl_sm = sm.OLS(endog=y, exog=sm.add_constant(X)) @@ -2029,9 +2027,9 @@ def test_array_std_errors(regression_data, family, fit_intercept): sm_family = sm.families.Gaussian() dispersion = None - mdl = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=fit_intercept - ).fit(X=X, y=y) + mdl = GeneralizedLinearRegressor(family=family, fit_intercept=fit_intercept).fit( + X=X, y=y + ) if fit_intercept: mdl_sm = sm.GLM(endog=y, exog=sm.add_constant(X), family=sm_family) @@ -2063,7 +2061,7 @@ def test_array_std_errors(regression_data, family, fit_intercept): def test_sparse_std_errors(regression_data): X, y = regression_data sp_X = sparse.csc_matrix(X) - mdl = GeneralizedLinearRegressor(alpha=0, family="normal") + mdl = GeneralizedLinearRegressor(family="normal") mdl.fit(X=X, y=y) actual1 = mdl.std_errors(X=sp_X, y=y, robust=False) @@ -2105,9 +2103,7 @@ def test_inputtype_std_errors(regression_data, categorical, split, fit_intercept tm.CategoricalMatrix(pd.Categorical(group, categories=categories)), ] ) - mdl = GeneralizedLinearRegressor( - alpha=0, family="normal", fit_intercept=fit_intercept - ) + mdl = GeneralizedLinearRegressor(family="normal", fit_intercept=fit_intercept) mdl.fit(X=X, y=y) if isinstance(X, tm.MatrixBase): X_sm = X.toarray() @@ -2146,7 +2142,7 @@ def test_coef_table(regression_data, fit_intercept, confidence_level): X_df = pd.DataFrame(X, columns=colnames) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=fit_intercept + family="gaussian", fit_intercept=fit_intercept ).fit(X=X_df, y=y) if fit_intercept: @@ -2209,9 +2205,9 @@ def test_wald_test_matrix(regression_data, family, fit_intercept, R, r): sm_family = sm.families.Gaussian() dispersion = None - mdl = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=fit_intercept - ).fit(X=X, y=y) + mdl = GeneralizedLinearRegressor(family=family, fit_intercept=fit_intercept).fit( + X=X, y=y + ) if fit_intercept: mdl_sm = sm.GLM(endog=y, exog=sm.add_constant(X), family=sm_family) @@ -2283,9 +2279,9 @@ def test_wald_test_matrix(regression_data, family, fit_intercept, R, r): def test_wald_test_matrix_public(regression_data, R, r): X, y = regression_data - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True - ).fit(X=X, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True).fit( + X=X, y=y, store_covariance_matrix=True + ) assert mdl._wald_test_matrix(R, r) == mdl.wald_test(R=R, r=r) @@ -2306,9 +2302,9 
@@ def test_wald_test_matrix_public(regression_data, R, r): def test_wald_test_matrix_fixed_cov(regression_data, R, r): X, y = regression_data - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=False - ).fit(X=X, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=False).fit( + X=X, y=y, store_covariance_matrix=True + ) mdl_sm = sm.GLM(endog=y, exog=X, family=sm.families.Gaussian()) # Use the same covariance matrix for both so that we can use tighter tolerances @@ -2351,9 +2347,9 @@ def test_wald_test_feature_names(regression_data, names, R, r): X, y = regression_data X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True - ).fit(X=X_df, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True).fit( + X=X_df, y=y, store_covariance_matrix=True + ) feature_names_results = mdl._wald_test_feature_names(names, r) if r is not None: @@ -2392,9 +2388,9 @@ def test_wald_test_feature_names_public(regression_data, names, r): X, y = regression_data X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True - ).fit(X=X_df, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True).fit( + X=X_df, y=y, store_covariance_matrix=True + ) assert mdl._wald_test_feature_names(names, r) == mdl.wald_test(features=names, r=r) @@ -2449,7 +2445,7 @@ def test_wald_test_term_names(regression_data, names, R, r, r_feat): X_df = X_df[["col_1", "col_2"]].assign(term_3=pd.cut(X_df["col_3"], bins=5)) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True, drop_first=True + family="gaussian", fit_intercept=True, drop_first=True ).fit(X=X_df, y=y, store_covariance_matrix=True) term_names_results = mdl._wald_test_term_names(names, r) @@ -2515,7 +2511,7 @@ def test_wald_test_term_names_public(regression_data, names, R, r, r_feat): X_df = X_df[["col_1", "col_2"]].assign(term_3=pd.cut(X_df["col_3"], bins=5)) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True, drop_first=True + family="gaussian", fit_intercept=True, drop_first=True ).fit(X=X_df, y=y, store_covariance_matrix=True) term_names_results = mdl.wald_test(terms=names, r=r) @@ -2554,7 +2550,7 @@ def test_wald_test_formula(regression_data, formula, R, r_feat): X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True, drop_first=True + family="gaussian", fit_intercept=True, drop_first=True ).fit(X=X_df, y=y, store_covariance_matrix=True) term_names_results = mdl._wald_test_formula(formula) @@ -2599,7 +2595,7 @@ def test_wald_test_formula_public(regression_data, formula, R, r_feat): X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True, drop_first=True + family="gaussian", fit_intercept=True, drop_first=True ).fit(X=X_df, y=y, store_covariance_matrix=True) term_names_results = mdl.wald_test(formula=formula) @@ -2617,7 +2613,7 @@ def test_wald_test_formula_public(regression_data, formula, R, r_feat): def test_wald_test_raise_on_wrong_input(regression_data): X, y = regression_data - mdl = GeneralizedLinearRegressor(alpha=0, family="gaussian", 
fit_intercept=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True) mdl.fit(X=X, y=y) with pytest.raises(ValueError): @@ -2632,7 +2628,6 @@ def test_wald_test_raise_on_wrong_input(regression_data): @pytest.mark.parametrize("weighted", [False, True]) def test_score_method(as_data_frame, offset, weighted): regressor = GeneralizedLinearRegressor( - alpha=0, family="normal", fit_intercept=False, gradient_tol=1e-8, @@ -2666,7 +2661,7 @@ def test_score_method(as_data_frame, offset, weighted): def test_information_criteria(regression_data): X, y = regression_data - regressor = GeneralizedLinearRegressor(family="gaussian", alpha=0) + regressor = GeneralizedLinearRegressor(family="gaussian") regressor.fit(X, y) llf = regressor.family_instance.log_likelihood(y, regressor.predict(X)) @@ -2721,10 +2716,10 @@ def test_drop_first_allows_alpha_equals_0(): rng = np.random.default_rng(42) y = np.random.normal(size=10) X = pd.DataFrame(data={"cat": pd.Categorical(rng.integers(2, size=10))}) - regressor = GeneralizedLinearRegressor(alpha=0, drop_first=True) + regressor = GeneralizedLinearRegressor(drop_first=True) regressor.fit(X, y) - regressor = GeneralizedLinearRegressor(alpha=0) # default is False + regressor = GeneralizedLinearRegressor() # default is False with pytest.raises(np.linalg.LinAlgError): regressor.fit(X, y) @@ -2732,7 +2727,7 @@ def test_drop_first_allows_alpha_equals_0(): def test_dropping_distinct_categorical_column(): y = np.random.normal(size=10) X = pd.DataFrame(data={"cat": pd.Categorical(np.ones(10)), "num": np.ones(10)}) - regressor = GeneralizedLinearRegressor(alpha=0, drop_first=True) + regressor = GeneralizedLinearRegressor(drop_first=True) regressor.fit(X, y) assert regressor.coef_.shape == (1,) assert regressor.feature_names_ == ["num"] @@ -2769,7 +2764,6 @@ def test_store_covariance_matrix( regressor = GeneralizedLinearRegressor( family="gaussian", - alpha=0, robust=robust, expected_information=expected_information, ) @@ -2804,7 +2798,6 @@ def test_store_covariance_matrix_formula(regression_data, formula): regressor = GeneralizedLinearRegressor( formula=formula, family="gaussian", - alpha=0, ) regressor.fit(df, y, store_covariance_matrix=True) @@ -2827,7 +2820,6 @@ def test_store_covariance_matrix_formula_errors(regression_data): regressor = GeneralizedLinearRegressor( formula=formula, family="gaussian", - alpha=0, ) regressor.fit(df, y) with pytest.raises(ValueError, match="Either X and y must be provided"): @@ -2837,7 +2829,7 @@ def test_store_covariance_matrix_formula_errors(regression_data): def test_store_covariance_matrix_errors(regression_data): X, y = regression_data - regressor = GeneralizedLinearRegressor(family="gaussian", alpha=0) + regressor = GeneralizedLinearRegressor(family="gaussian") regressor.fit(X, y, store_covariance_matrix=False) with pytest.raises(ValueError, match="Either X and y must be provided"): @@ -3151,7 +3143,6 @@ def test_formula_against_smf(get_mixed_data, formula, fit_intercept): family="normal", drop_first=True, formula=formula, - alpha=0.0, fit_intercept=fit_intercept, ).fit(data) @@ -3174,7 +3165,6 @@ def test_formula_context(get_mixed_data): family="normal", drop_first=True, formula=formula, - alpha=0.0, fit_intercept=True, ).fit(data) @@ -3206,7 +3196,6 @@ def test_formula_predict(get_mixed_data, formula, fit_intercept): family="normal", drop_first=True, formula=formula, - alpha=0.0, fit_intercept=fit_intercept, ).fit(data) From e5ebcf2174d453d19cde829ae355895736d6a7d2 Mon Sep 17 00:00:00 2001 From: Matthias 
Schmidtblaicher Date: Wed, 31 Jan 2024 17:34:19 +0100 Subject: [PATCH 02/10] fix docstring --- src/glum/_glm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 1dd9cb01..b5ad316c 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -3146,9 +3146,9 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): drop_first : bool, optional (default = False) If ``True``, drop the first column when encoding categorical variables. Set this to True when ``alpha=0`` and ``solver='auto'`` to prevent an error - due to a singular feature matrix. In the case of using a formula with - interactions, setting this argument to ``True`` ensures structural - full-rankness (it is equivalent to ``ensure_full_rank`` in formulaic and + due to a singular feature matrix. In the case of using a formula with + interactions, setting this argument to ``True`` ensures structural + full-rankness (it is equivalent to ``ensure_full_rank`` in formulaic and tabmat). robust : bool, optional (default = False) From 55d9bb5617294d151b9a6a787a70da88d0b98779 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 19:07:06 +0100 Subject: [PATCH 03/10] add alpha where needed to avoid LinAlgError --- tests/glm/test_glm.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index c742c5a7..fe552d79 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -430,7 +430,7 @@ def test_glm_fit_intercept_argument(estimator, fit_intercept): ) def test_glm_solver_argument(estimator, solver, l1_ratio, y, X): """Test GLM for invalid solver argument.""" - glm = estimator(solver=solver, l1_ratio=l1_ratio) + glm = estimator(solver=solver, l1_ratio=l1_ratio, alpha=1.0) with pytest.raises(ValueError): glm.fit(X, y) @@ -479,7 +479,10 @@ def test_glm_warm_start_argument(estimator, warm_start): def test_glm_warm_start_with_constant_column(estimator): X, y = make_regression() X[:, 0] = 0 - glm = estimator(warm_start=True) + kwargs = {"warm_start": True} + if estimator == GeneralizedLinearRegressor: + kwargs["alpha"] = 1.0 + glm = estimator(**kwargs) glm.fit(X, y) glm.fit(X, y) @@ -1310,11 +1313,11 @@ def test_binomial_enet(alpha): @pytest.mark.parametrize( "params", [ - {"solver": "irls-ls"}, - {"solver": "lbfgs"}, - {"solver": "trust-constr"}, - {"solver": "irls-cd", "selection": "cyclic"}, - {"solver": "irls-cd", "selection": "random"}, + {"solver": "irls-ls", "alpha": 1.0}, + {"solver": "lbfgs", "alpha": 1.0}, + {"solver": "trust-constr", "alpha": 1.0}, + {"solver": "irls-cd", "selection": "cyclic", "alpha": 1.0}, + {"solver": "irls-cd", "selection": "random", "alpha": 1.0}, ], ids=lambda params: ", ".join(f"{key}={val}" for key, val in params.items()), ) @@ -1326,7 +1329,7 @@ def test_solver_equivalence(params, use_offset, regression_data): offset = np.random.random(len(y)) else: offset = None - est_ref = GeneralizedLinearRegressor(random_state=2) + est_ref = GeneralizedLinearRegressor(random_state=2, alpha=1.0) est_ref.fit(X, y, offset=offset) est_2 = GeneralizedLinearRegressor(**params) @@ -1801,7 +1804,7 @@ def test_passing_noncontiguous_as_X(): ) def test_feature_names_underscores(X, feature_names): model = GeneralizedLinearRegressor( - family="poisson", categorical_format="{name}__{category}" + family="poisson", categorical_format="{name}__{category}", alpha=1.0 ).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, 
"feature_names_", None), feature_names) @@ -1848,7 +1851,7 @@ def test_feature_names_underscores(X, feature_names): ) def test_feature_names_brackets(X, feature_names): model = GeneralizedLinearRegressor( - family="poisson", categorical_format="{name}[{category}]" + family="poisson", categorical_format="{name}[{category}]", alpha=1.0 ).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, "feature_names_", None), feature_names) @@ -1889,7 +1892,7 @@ def test_feature_names_brackets(X, feature_names): ], ) def test_term_names(X, term_names): - model = GeneralizedLinearRegressor(family="poisson").fit(X, np.arange(5)) + model = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, "term_names_", None), term_names) @@ -1905,7 +1908,7 @@ def test_term_names(X, term_names): ], ) def test_feature_dtypes(X, dtypes): - model = GeneralizedLinearRegressor(family="poisson").fit(X, np.arange(5)) + model = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, "feature_dtypes_", None), dtypes) @@ -1928,12 +1931,12 @@ def test_categorical_types(k, n): # use categorical types X_cat = pd.DataFrame({"group": pd.Categorical(group, categories=categories)}) - model_cat = GeneralizedLinearRegressor(family="poisson").fit(X_cat, y) + model_cat = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X_cat, y) pred_cat = model_cat.predict(X_cat) # use one-hot encoding X_oh = pd.get_dummies(X_cat, dtype=float) - model_oh = GeneralizedLinearRegressor(family="poisson").fit(X_oh, y) + model_oh = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X_oh, y) pred_oh = model_oh.predict(X_oh) # check predictions @@ -3014,6 +3017,7 @@ def test_formula(get_mixed_data, formula, drop_first, fit_intercept): formula=formula, fit_intercept=fit_intercept, categorical_format="{name}[T.{category}]", + alpha=1.0, ).fit(data) if fit_intercept: @@ -3033,6 +3037,7 @@ def test_formula(get_mixed_data, formula, drop_first, fit_intercept): drop_first=drop_first, fit_intercept=fit_intercept, categorical_format="{name}[T.{category}]", + alpha=1.0, ).fit(X_ext, y_ext) np.testing.assert_almost_equal(model_ext.coef_, model_formula.coef_) @@ -3083,6 +3088,7 @@ def test_formula_names_formulaic_style( formula=formula, categorical_format="{name}[T.{category}]", interaction_separator=":", + alpha=1.0, ).fit(data) np.testing.assert_array_equal(model_formula.feature_names_, feature_names) @@ -3120,6 +3126,7 @@ def test_formula_names_old_glum_style( formula=formula, categorical_format="{name}__{category}", interaction_separator="__x__", + alpha=1.0, ).fit(data) np.testing.assert_array_equal(model_formula.feature_names_, feature_names) @@ -3234,6 +3241,7 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula): drop_first=False, formula=formula, fit_intercept=False, + alpha=1.0, ) if cat_missing_method == "fail" and not unseen_missing: with pytest.raises( From e82f59731d32c3862f262502b336cf304bb2162e Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 19:12:06 +0100 Subject: [PATCH 04/10] add changelog entry --- CHANGELOG.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 72c6933d..0f7b3090 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,6 +10,10 @@ Changelog 3.0.0 - UNRELEASED ------------------ +**Breaking change:** + +- :class:`~glum.GeneralizedLinearRegressor`'s default value for `alpha` is now `0`, i.e. 
no regularization. + **New features:** - Added a formula interface for specifying models. From d70e2a655acd2d2c1112fe148761e0b12c8858b4 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 19:19:17 +0100 Subject: [PATCH 05/10] also set alpha in golden master --- tests/glm/test_glm.py | 5 ++++- tests/glm/test_golden_master.py | 19 +++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index fe552d79..08080212 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -430,7 +430,10 @@ def test_glm_fit_intercept_argument(estimator, fit_intercept): ) def test_glm_solver_argument(estimator, solver, l1_ratio, y, X): """Test GLM for invalid solver argument.""" - glm = estimator(solver=solver, l1_ratio=l1_ratio, alpha=1.0) + kwargs = {"solver": solver, "l1_ratio": l1_ratio} + if estimator == GeneralizedLinearRegressor: + kwargs["alpha"] = 1.0 + glm = estimator(**kwargs) with pytest.raises(ValueError): glm.fit(X, y) diff --git a/tests/glm/test_golden_master.py b/tests/glm/test_golden_master.py index ac00f91b..8626256d 100644 --- a/tests/glm/test_golden_master.py +++ b/tests/glm/test_golden_master.py @@ -100,25 +100,32 @@ def expected_all(): gm_model_parameters = { - "default": {}, # default params - "half-regularization": {"alpha": 0.5}, # regularization (other than alpha = 1) - "elastic-net": {"l1_ratio": 0.5}, # elastic-net - "lasso": {"l1_ratio": 1}, # lasso + "regularization": {"alpha": 1.0}, # regularization with alpha = 1 + "half-regularization": {"alpha": 0.5}, # regularization with alpha = 0 + "elastic-net": {"l1_ratio": 0.5, "alpha": 1.0}, # elastic-net + "lasso": {"l1_ratio": 1, "alpha": 1.0}, # lasso "variable_p1": { "l1_ratio": 1, "P1": np.arange(30) / 10, + "alpha": 1.0, }, # lasso with variable penalty "variable_p2": { "l1_ratio": 0, "P2": _make_P2(), + "alpha": 1.0, }, # ridge with Tikhonov regularization "variable_p1_p2": { "l1_ratio": 0.5, "P1": np.arange(30) / 10, "P2": _make_P2(), + "alpha": 1.0, }, # elastic net with P1 and P2 variable penalty - "fit_intercept": {"fit_intercept": False}, # do not fit the intercept - "bounds": {"lower_bounds": np.full(30, 0), "upper_bounds": np.full(30, 0.4)}, + "fit_intercept": {"fit_intercept": False, "alpha": 1.0}, # do not fit the intercept + "bounds": { + "lower_bounds": np.full(30, 0), + "upper_bounds": np.full(30, 0.4), + "alpha": 1.0, + }, } From 31b0b2e497c2f1e6ec0c5a623a473b5080e788cf Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 19:34:33 +0100 Subject: [PATCH 06/10] change name in persisted file too --- tests/glm/golden_master/simulation_gm.json | 40 +++++++++++----------- tests/glm/test_golden_master.py | 1 + 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/tests/glm/golden_master/simulation_gm.json b/tests/glm/golden_master/simulation_gm.json index 3c69f59b..660413de 100644 --- a/tests/glm/golden_master/simulation_gm.json +++ b/tests/glm/golden_master/simulation_gm.json @@ -1,6 +1,6 @@ { "normal": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.5027665204024282, 0.23449539956055546, @@ -36,7 +36,7 @@ "intercept_": 3.026490229054092, "n_iter_": 1 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.5012056522046088, 0.23528722263235485, @@ -72,7 +72,7 @@ "intercept_": 2.0279948791150764, "n_iter_": 1 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.49784759015593427, 0.23166926058137094, @@ -108,7 +108,7 @@ 
"intercept_": 2.981778440705444, "n_iter_": 1 }, - "default": { + "regularization": { "coef_": [ 0.4985676422254175, 0.22818569911229844, @@ -1478,7 +1478,7 @@ } }, "poisson": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.9604408672344522, 0.4432562524921413, @@ -1514,7 +1514,7 @@ "intercept_": 1.8189178943867188, "n_iter_": 6 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.9817372866211753, 0.49117907395980553, @@ -1550,7 +1550,7 @@ "intercept_": 1.157828764208921, "n_iter_": 6 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.9693196874148616, 0.46707910961062293, @@ -1586,7 +1586,7 @@ "intercept_": 1.8396971485658087, "n_iter_": 6 }, - "default": { + "regularization": { "coef_": [ 0.9821298947770232, 0.4937841900606277, @@ -2812,7 +2812,7 @@ } }, "gamma": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.4866808417045077, 0.1370793228217412, @@ -2848,7 +2848,7 @@ "intercept_": 5.268950639816242, "n_iter_": 4 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.48972345202083134, 0.24707128799109493, @@ -2884,7 +2884,7 @@ "intercept_": 2.512993119536852, "n_iter_": 4 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.5107634971640694, 0.1783139942111257, @@ -2920,7 +2920,7 @@ "intercept_": 5.272870219406924, "n_iter_": 4 }, - "default": { + "regularization": { "coef_": [ 0.4966531683982075, 0.24896254652599858, @@ -4146,7 +4146,7 @@ } }, "tweedie_p=1.5": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.8740584736837378, 0.39026903329437757, @@ -4182,7 +4182,7 @@ "intercept_": 2.8380327257627473, "n_iter_": 4 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.8592854961617753, 0.42694459825027725, @@ -4218,7 +4218,7 @@ "intercept_": 1.6496674803774887, "n_iter_": 4 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.8763610403720393, 0.4023951463085115, @@ -4254,7 +4254,7 @@ "intercept_": 2.7855262434295343, "n_iter_": 4 }, - "default": { + "regularization": { "coef_": [ 0.860178238544325, 0.43000049156945763, @@ -5480,7 +5480,7 @@ } }, "binomial": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.0645115293284631, 0.03563706184469416, @@ -5516,7 +5516,7 @@ "intercept_": 3.3761974509366994, "n_iter_": 3 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.06396142685405831, 0.03544619397195947, @@ -5552,7 +5552,7 @@ "intercept_": 2.007458821879875, "n_iter_": 2 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.059850128940604715, 0.029620907232596274, @@ -5588,7 +5588,7 @@ "intercept_": 3.4202998674202676, "n_iter_": 3 }, - "default": { + "regularization": { "coef_": [ 0.05979957149348005, 0.03233408720147587, diff --git a/tests/glm/test_golden_master.py b/tests/glm/test_golden_master.py index 8626256d..dc6d5646 100644 --- a/tests/glm/test_golden_master.py +++ b/tests/glm/test_golden_master.py @@ -100,6 +100,7 @@ def expected_all(): gm_model_parameters = { + # TODO add an unregularized case "regularization": {"alpha": 1.0}, # regularization with alpha = 1 "half-regularization": {"alpha": 0.5}, # regularization with alpha = 0 "elastic-net": {"l1_ratio": 0.5, "alpha": 1.0}, # elastic-net From 643b231a8aa54f4c710eba49343ec694014e1e32 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 19:45:54 +0100 Subject: [PATCH 07/10] set alpha in model_parameters again --- tests/glm/test_golden_master.py | 15 
+++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/glm/test_golden_master.py b/tests/glm/test_golden_master.py index dc6d5646..716edfff 100644 --- a/tests/glm/test_golden_master.py +++ b/tests/glm/test_golden_master.py @@ -215,13 +215,13 @@ def test_gm_storage(distribution, data_all_storage, expected_all): model = fit_model( data=data, family=distribution, - model_parameters={}, + model_parameters={"alpha": 1.0}, use_weights=False, use_offset=False, cv=False, ) - run_name = "default" + run_name = "regularization" expected = expected_all[distribution][run_name] assert_gm_allclose(model, expected) @@ -234,7 +234,7 @@ def test_gm_custom_link(family_link, use_weights, use_offset, data_all, expected """Currently only testing log-linear model.""" distribution, link = family_link data = data_all[distribution] - model_parameters = {"link": link} + model_parameters = {"link": link, "alpha": 1.0} model = fit_model( data=data, family=distribution, @@ -265,9 +265,7 @@ def test_gm_approx_hessian( distribution, use_weights, use_offset, data_all, expected_all ): data = data_all[distribution] - model_parameters = { - "hessian_approx": 0.1, - } + model_parameters = {"hessian_approx": 0.1, "alpha": 1.0} model = fit_model( data=data, family=distribution, @@ -277,7 +275,7 @@ def test_gm_approx_hessian( cv=False, ) - run_name = "default" + run_name = "regularization" if use_weights: run_name = f"{run_name}_weights" if use_offset: @@ -298,6 +296,7 @@ def test_gm_cv(distribution, data_all, expected_all): "alphas": [0.1, 0.05, 0.01], "l1_ratio": [0.2, 0.5, 0.9], "cv": 3, + "alpha": 1.0, } model = fit_model( data=data, @@ -453,7 +452,7 @@ def run_and_store_golden_master( for use_offset in [True, False]: gm_dict = run_and_store_golden_master( distribution=dist, - model_parameters={"link": link}, + model_parameters={"link": link, "alpha": 1.0}, run_name=f"custom-{dist}-{link}", use_weights=use_weights, use_offset=use_offset, From 86eeb5e9146bfb1c5f258e6a5ad9555d9f9b02e6 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 20:04:59 +0100 Subject: [PATCH 08/10] don't modify case of no alpha attribute, which is RegressorCV --- src/glum/_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index b5ad316c..9c9fb53d 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -940,7 +940,7 @@ def _set_up_for_fit(self, y: np.ndarray) -> None: elif (self.lower_bounds is None) and (self.upper_bounds is None): if np.all(np.asarray(self.l1_ratio) == 0): self._solver = "irls-ls" - elif getattr(self, "alpha", 0) == 0 and not self.alpha_search: + elif getattr(self, "alpha", 1) == 0 and not self.alpha_search: self._solver = "irls-ls" else: self._solver = "irls-cd" From 012c90402ffcb177d0146dbae2eca9dffe3ab110 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 20:05:56 +0100 Subject: [PATCH 09/10] remove invalid alpha argument --- tests/glm/test_golden_master.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/glm/test_golden_master.py b/tests/glm/test_golden_master.py index 716edfff..d58a45f6 100644 --- a/tests/glm/test_golden_master.py +++ b/tests/glm/test_golden_master.py @@ -296,7 +296,6 @@ def test_gm_cv(distribution, data_all, expected_all): "alphas": [0.1, 0.05, 0.01], "l1_ratio": [0.2, 0.5, 0.9], "cv": 3, - "alpha": 1.0, } model = fit_model( data=data, From f9052c92e45915ab28eeb1a112e4ff099e588cd2 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Thu, 1 Feb 2024 09:04:41 
+0100 Subject: [PATCH 10/10] wording --- tests/glm/test_golden_master.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/glm/test_golden_master.py b/tests/glm/test_golden_master.py index d58a45f6..4f70c6a7 100644 --- a/tests/glm/test_golden_master.py +++ b/tests/glm/test_golden_master.py @@ -101,8 +101,8 @@ def expected_all(): gm_model_parameters = { # TODO add an unregularized case - "regularization": {"alpha": 1.0}, # regularization with alpha = 1 - "half-regularization": {"alpha": 0.5}, # regularization with alpha = 0 + "regularization": {"alpha": 1.0}, # default prior to v3 + "half-regularization": {"alpha": 0.5}, # regularization with alpha = 0.5 "elastic-net": {"l1_ratio": 0.5, "alpha": 1.0}, # elastic-net "lasso": {"l1_ratio": 1, "alpha": 1.0}, # lasso "variable_p1": {
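
For context on why so many tests in this series now pass ``alpha=1.0`` or ``drop_first=True``: once the default is ``alpha=0`` (no penalty), a dummy-encoded categorical together with an intercept is collinear and the unpenalized solve fails. A sketch adapted from ``test_drop_first_allows_alpha_equals_0`` above:

    import numpy as np
    import pandas as pd

    from glum import GeneralizedLinearRegressor

    rng = np.random.default_rng(42)
    y = rng.normal(size=10)
    X = pd.DataFrame({"cat": pd.Categorical(rng.integers(2, size=10))})

    # Under the new default (alpha=0), both dummy columns of "cat" plus the
    # intercept are collinear, so fitting without drop_first raises
    # np.linalg.LinAlgError:
    # GeneralizedLinearRegressor().fit(X, y)

    # Dropping the first category restores full column rank, so this fits.
    GeneralizedLinearRegressor(drop_first=True).fit(X, y)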