From 7f2bbb00011f165e9d62955462b721460c683ac7 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 25 Jul 2023 10:18:51 +0200 Subject: [PATCH 01/63] Make tests green with densematrix-refactor branch --- src/glum/_distribution.py | 12 +++++------- src/glum/_glm.py | 18 ++++++++---------- tests/glm/test_distribution.py | 4 ++-- tests/glm/test_glm.py | 2 +- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/src/glum/_distribution.py b/src/glum/_distribution.py index 15132a40..0f883b05 100644 --- a/src/glum/_distribution.py +++ b/src/glum/_distribution.py @@ -9,6 +9,7 @@ CategoricalMatrix, DenseMatrix, MatrixBase, + SparseMatrix, SplitMatrix, StandardizedMatrix, ) @@ -534,17 +535,14 @@ def _score_matrix(self, link, X, y, mu, sample_weight, dispersion, fit_intercept ).reshape(-1, 1) if fit_intercept: - if sparse.issparse(X): - return sparse.hstack((W, X.multiply(W))) + if isinstance(X, SparseMatrix): + return SparseMatrix(sparse.hstack((W, X.multiply(W).array_csc))) elif isinstance(X, (SplitMatrix, CategoricalMatrix)): return SplitMatrix((DenseMatrix(W), X.multiply(W))) else: - return np.hstack((W, np.multiply(X, W))) + return DenseMatrix(np.hstack((W, X.multiply(W)._array))) else: - if sparse.issparse(X) or isinstance(X, (SplitMatrix, CategoricalMatrix)): - return X.multiply(W) - else: - return np.multiply(X, W) + return X.multiply(W) def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> float: r"""Estimate the dispersion parameter :math:`\phi`. diff --git a/src/glum/_glm.py b/src/glum/_glm.py index db8a2086..e44d51c0 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -110,7 +110,10 @@ def check_array_tabmat_compliant(mat: ArrayLike, drop_first: int = False, **kwar ) original_type = type(mat) - res = check_array(mat, **kwargs) + if isinstance(mat, (tm.DenseMatrix, tm.SparseMatrix)): + res = check_array(mat._array, **kwargs) + else: + res = check_array(mat, **kwargs) if res is not mat and original_type in (tm.DenseMatrix, tm.SparseMatrix): res = original_type(res) # type: ignore @@ -664,19 +667,17 @@ def is_pos_semidef(p: Union[sparse.spmatrix, np.ndarray]) -> Union[bool, np.bool return np.all(eigenvalues >= epsneg) -def _group_sum(groups: np.ndarray, data: np.ndarray): +def _group_sum(groups: np.ndarray, data: tm.MatrixBase): """Sum over groups.""" ngroups = len(np.unique(groups)) out = np.empty((ngroups, data.shape[1])) - if sparse.issparse(data) or isinstance( - data, (tm.SplitMatrix, tm.CategoricalMatrix) - ): + if isinstance(data, (tm.SparseMatrix, tm.SplitMatrix, tm.CategoricalMatrix)): eye_n = np.eye(ngroups)[:, groups] for i in range(data.shape[1]): out[:, i] = (eye_n @ data.getcol(i)).ravel() else: for i in range(data.shape[1]): - out[:, i] = np.bincount(groups, weights=data[:, i]) + out[:, i] = np.bincount(groups, weights=data[:, i]._array.squeeze()) return out @@ -1530,10 +1531,7 @@ def covariance_matrix( / (sum_weights - self.n_features_in_ - int(self.fit_intercept)) ) else: - if isinstance(gradient, tm.SplitMatrix): - inner_part = gradient.sandwich(np.ones_like(y)) - else: - inner_part = gradient.T @ gradient + inner_part = gradient.sandwich(np.ones_like(y)) correction = sum_weights / ( sum_weights - self.n_features_in_ - int(self.fit_intercept) ) diff --git a/tests/glm/test_distribution.py b/tests/glm/test_distribution.py index 6ac2eef5..d241ff07 100644 --- a/tests/glm/test_distribution.py +++ b/tests/glm/test_distribution.py @@ -234,7 +234,7 @@ def test_hessian_matrix(family, link, true_hessian): dispersion = 0.5 rng = 
np.random.RandomState(42) X = tm.DenseMatrix(rng.randn(10, 5)) - lin_pred = np.dot(X, coef) + lin_pred = X.matvec(coef) mu = link.inverse(lin_pred) sample_weight = rng.randn(10) ** 2 + 1 _, hessian_rows = family.rowwise_gradient_hessian( @@ -257,7 +257,7 @@ def test_hessian_matrix(family, link, true_hessian): for i in range(coef.shape[0]): def f(coef): - this_eta = X.dot(coef) + this_eta = X.matvec(coef) this_mu = link.inverse(this_eta) yv = mu if true_hessian: diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index fd36dd6b..2e3d74b9 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -1402,7 +1402,7 @@ def _arrays_share_data(arr1: np.ndarray, arr2: np.ndarray) -> bool: assert _arrays_share_data(X.mat.indptr, M.indptr) else: # Check that the underlying data pointer is the same - assert _arrays_share_data(X.mat, M) + assert _arrays_share_data(X.mat._array, M._array) np.testing.assert_almost_equal(col_means, col_mults) # After standardization, all the columns will have the same values. From cc58bbe8a2f4e419e4869fdc2ac1f46d5b6f65b3 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 26 Jul 2023 12:31:56 +0200 Subject: [PATCH 02/63] Remove most Matrixbase subclass checks --- src/glum/_distribution.py | 22 +++++----------------- src/glum/_glm.py | 15 ++++++--------- tests/glm/test_glm.py | 2 +- 3 files changed, 12 insertions(+), 27 deletions(-) diff --git a/src/glum/_distribution.py b/src/glum/_distribution.py index 0f883b05..53e80019 100644 --- a/src/glum/_distribution.py +++ b/src/glum/_distribution.py @@ -4,15 +4,8 @@ import numexpr import numpy as np -from scipy import sparse, special -from tabmat import ( - CategoricalMatrix, - DenseMatrix, - MatrixBase, - SparseMatrix, - SplitMatrix, - StandardizedMatrix, -) +from scipy import special +from tabmat import MatrixBase, StandardizedMatrix, hstack from ._functions import ( binomial_logit_eta_mu_deviance, @@ -533,16 +526,11 @@ def _score_matrix(self, link, X, y, mu, sample_weight, dispersion, fit_intercept * link.inverse_derivative(linpred) * (y - mu) ).reshape(-1, 1) - + XW = X.multiply(W) if fit_intercept: - if isinstance(X, SparseMatrix): - return SparseMatrix(sparse.hstack((W, X.multiply(W).array_csc))) - elif isinstance(X, (SplitMatrix, CategoricalMatrix)): - return SplitMatrix((DenseMatrix(W), X.multiply(W))) - else: - return DenseMatrix(np.hstack((W, X.multiply(W)._array))) + return hstack((W, XW)) else: - return X.multiply(W) + return XW def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> float: r"""Estimate the dispersion parameter :math:`\phi`. 
diff --git a/src/glum/_glm.py b/src/glum/_glm.py index e44d51c0..6aa14908 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -111,7 +111,7 @@ def check_array_tabmat_compliant(mat: ArrayLike, drop_first: int = False, **kwar original_type = type(mat) if isinstance(mat, (tm.DenseMatrix, tm.SparseMatrix)): - res = check_array(mat._array, **kwargs) + res = check_array(mat.toarray(), **kwargs) else: res = check_array(mat, **kwargs) @@ -671,13 +671,13 @@ def _group_sum(groups: np.ndarray, data: tm.MatrixBase): """Sum over groups.""" ngroups = len(np.unique(groups)) out = np.empty((ngroups, data.shape[1])) - if isinstance(data, (tm.SparseMatrix, tm.SplitMatrix, tm.CategoricalMatrix)): - eye_n = np.eye(ngroups)[:, groups] + if isinstance(data, tm.DenseMatrix): for i in range(data.shape[1]): - out[:, i] = (eye_n @ data.getcol(i)).ravel() + out[:, i] = np.bincount(groups, weights=data.getcol(i).toarray().squeeze()) else: + eye_n = np.eye(ngroups)[:, groups] for i in range(data.shape[1]): - out[:, i] = np.bincount(groups, weights=data[:, i]._array.squeeze()) + out[:, i] = (eye_n @ data.getcol(i)).ravel() return out @@ -1866,10 +1866,7 @@ def _expand_categorical_penalties(penalty, X, drop_first): ####################################################################### # 2b. convert to wrapper matrix types ####################################################################### - if sparse.issparse(X) and not isinstance(X, tm.SparseMatrix): - X = tm.SparseMatrix(X) - elif isinstance(X, np.ndarray): - X = tm.DenseMatrix(X) + X = tm.as_tabmat(X) return X, y, sample_weight, offset, weights_sum, P1, P2 diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 2e3d74b9..7db3dbcc 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -1402,7 +1402,7 @@ def _arrays_share_data(arr1: np.ndarray, arr2: np.ndarray) -> bool: assert _arrays_share_data(X.mat.indptr, M.indptr) else: # Check that the underlying data pointer is the same - assert _arrays_share_data(X.mat._array, M._array) + assert _arrays_share_data(X.mat.toarray(), M.toarray()) np.testing.assert_almost_equal(col_means, col_mults) # After standardization, all the columns will have the same values. 
From cdb564e466d30bd83eab92183cfe0d6ee4eaa34d Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 26 Jul 2023 14:32:38 +0200 Subject: [PATCH 03/63] Simplify _group_sum --- src/glum/_glm.py | 13 +++++-------- tests/glm/test_glm.py | 2 +- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 6aa14908..3ca8aedc 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -25,6 +25,7 @@ import numpy as np import pandas as pd +import scipy.sparse as sps import scipy.sparse.linalg as splinalg import tabmat as tm from scipy import linalg, sparse @@ -111,7 +112,7 @@ def check_array_tabmat_compliant(mat: ArrayLike, drop_first: int = False, **kwar original_type = type(mat) if isinstance(mat, (tm.DenseMatrix, tm.SparseMatrix)): - res = check_array(mat.toarray(), **kwargs) + res = check_array(mat.unpack(), **kwargs) else: res = check_array(mat, **kwargs) @@ -671,13 +672,9 @@ def _group_sum(groups: np.ndarray, data: tm.MatrixBase): """Sum over groups.""" ngroups = len(np.unique(groups)) out = np.empty((ngroups, data.shape[1])) - if isinstance(data, tm.DenseMatrix): - for i in range(data.shape[1]): - out[:, i] = np.bincount(groups, weights=data.getcol(i).toarray().squeeze()) - else: - eye_n = np.eye(ngroups)[:, groups] - for i in range(data.shape[1]): - out[:, i] = (eye_n @ data.getcol(i)).ravel() + eye_n = sps.eye(ngroups, format="csc")[:, groups] + for i in range(data.shape[1]): + out[:, i] = _safe_toarray(eye_n @ data.getcol(i).unpack()).ravel() return out diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 7db3dbcc..3890d17f 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -1402,7 +1402,7 @@ def _arrays_share_data(arr1: np.ndarray, arr2: np.ndarray) -> bool: assert _arrays_share_data(X.mat.indptr, M.indptr) else: # Check that the underlying data pointer is the same - assert _arrays_share_data(X.mat.toarray(), M.toarray()) + assert _arrays_share_data(X.mat.unpack(), M.unpack()) np.testing.assert_almost_equal(col_means, col_mults) # After standardization, all the columns will have the same values. From 6b66d57e5e16dfc93dbc039b53874c270850061b Mon Sep 17 00:00:00 2001 From: "quant-ranger[bot]" <132915763+quant-ranger[bot]@users.noreply.github.com> Date: Mon, 7 Aug 2023 06:56:52 +0100 Subject: [PATCH 04/63] Pre-commit autoupdate (#672) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 149a9f59..4f1d68ec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ repos: - --safe - --target-version=py37 - repo: https://github.com/Quantco/pre-commit-mirrors-flake8 - rev: 6.0.0 + rev: 6.1.0 hooks: - id: flake8-conda additional_dependencies: [ From bc6a5ec28fc0cc6495573611cde45aacdda905e3 Mon Sep 17 00:00:00 2001 From: Jan Tilly Date: Mon, 7 Aug 2023 18:20:53 +0200 Subject: [PATCH 05/63] Use boa in CI. 
(#673) --- .github/workflows/conda-build.sh | 4 ++-- .github/workflows/macos-conda-build.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/conda-build.sh b/.github/workflows/conda-build.sh index 71ec92f4..f93b0ad9 100755 --- a/.github/workflows/conda-build.sh +++ b/.github/workflows/conda-build.sh @@ -7,5 +7,5 @@ export CONDA_BUILD_YML=$1 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" source ${SCRIPT_DIR}/base.sh $* conda activate base -mamba install -y conda-build -conda build -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe +mamba install -y boa conda-build +conda mambabuild -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe diff --git a/.github/workflows/macos-conda-build.sh b/.github/workflows/macos-conda-build.sh index 6fbd9ce9..598f8cf4 100755 --- a/.github/workflows/macos-conda-build.sh +++ b/.github/workflows/macos-conda-build.sh @@ -6,7 +6,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" source ${SCRIPT_DIR}/base.sh $* conda activate base -mamba install -y conda-build -c conda-forge +mamba install -y conda-build boa -c conda-forge # Don't test cross-compiled result (there is no emulation) and use the latest MacOS SDK. if grep -q "osx-arm64" .ci_support/${CONDA_BUILD_YML}.yaml; then CONDA_BUILD_ARGS="--no-test" @@ -16,4 +16,4 @@ CONDA_BUILD_SYSROOT: - "${CONDA_BUILD_SYSROOT}" EOF fi -conda build -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe ${CONDA_BUILD_ARGS:-} +conda mambabuild -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe ${CONDA_BUILD_ARGS:-} From 2b8ae3b1c281773a304f1e38662fae7f8a1ba714 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 8 Aug 2023 12:47:27 +0200 Subject: [PATCH 06/63] Fix covariance matrix mutating feature names (#671) * Do not use _set_up_... in covariance_matrix * Add changelog entry --- CHANGELOG.rst | 4 ++++ src/glum/_glm.py | 37 +++++++++++++++++++++---------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b63f3376..508b78e9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,10 @@ Changelog - Added the complementary log-log (`cloglog`) link function. +**Bug fix** + +- Fixed :meth:`~glum.GeneralizedLinearRegressorBase.covariance_matrix` mutating feature names when called with a data frame. See `here `_. + **Other changes:** - When computing the covariance matrix, check for ill-conditionedness for all types of input. Furthermore, do it in a more efficient way. 
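Patch 06 above and patch 07 below rework the inference methods (`std_errors`, `covariance_matrix`). For orientation, a sketch of how they are invoked, mirroring the tests added later in this series; the synthetic data is illustrative and this block is not part of the patch:

```python
# Sketch: calling the inference API on an unpenalized GLM (alpha=0).
import numpy as np
from glum import GeneralizedLinearRegressor

rng = np.random.default_rng(42)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(size=100)
clu = rng.integers(5, size=100)  # cluster membership

model = GeneralizedLinearRegressor(family="gaussian", alpha=0).fit(X, y)
se = model.std_errors(X, y, robust=True)            # sandwich standard errors
vcov = model.covariance_matrix(X, y, clusters=clu)  # clustered covariance
```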
diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 3ca8aedc..4b4b7e7e 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -1454,26 +1454,31 @@ def covariance_matrix( Cambridge university press """ - ( - X, - y, - sample_weight, - offset, - sum_weights, - P1, - P2, - ) = self._set_up_and_check_fit_args( + + if isinstance(X, pd.DataFrame) and hasattr(self, "feature_dtypes_"): + X = _align_df_categories(X, self.feature_dtypes_) + + X, y = check_X_y_tabmat_compliant( X, y, - sample_weight, - offset, - solver=self.solver, - force_all_finite=self.force_all_finite, + accept_sparse=["csr", "csc", "coo"], + dtype="numeric", + copy=self._should_copy_X(), + ensure_2d=True, + allow_nd=False, + drop_first=self.drop_first, ) - # Here we don't want sample_weight to be normalized to sum up to 1 - # We want sample_weight to sum up to the number of samples - sample_weight = sample_weight * sum_weights + if isinstance(X, np.ndarray): + X = tm.DenseMatrix(X) + if sparse.issparse(X) and not isinstance(X, tm.SparseMatrix): + X = tm.SparseMatrix(X) + + sample_weight = _check_weights( + sample_weight, y.shape[0], X.dtype, force_all_finite=self.force_all_finite + ) + sum_weights = np.sum(sample_weight) + offset = _check_offset(offset, y.shape[0], X.dtype) mu = self.predict(X, offset=offset) if mu is None else np.asanyarray(mu) From 95af4ffc48fba4d4ecfe210c7889e2baa9583c0f Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 8 Aug 2023 14:28:06 +0200 Subject: [PATCH 07/63] Add the option to store the covariance matrix to avoid recomputing it (#661) * Add option to store covariance matrix during fit * Fix fitting with variance matrix estimation `.covariance_matrix()` expects X and weights in a different format than what we have at the end of `.fit(). * Store covariance matrix after estimation * Handle the alpha_search and glm_cv cases * Propagate covariance parameters * Add changelog * Slightly more lenient tests --- CHANGELOG.rst | 1 + src/glum/_glm.py | 203 ++++++++++++++++++++++++++++++++++-------- src/glum/_glm_cv.py | 36 ++++++++ tests/glm/test_glm.py | 131 +++++++++++++++++++++++++++ 4 files changed, 333 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 508b78e9..e45052e7 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -13,6 +13,7 @@ Changelog **New feature** - Added the complementary log-log (`cloglog`) link function. +- Added the option to store the covariance matrix after estimating it. In this case, the covariance matrix does not have to be recomputed when calling inference methods. **Bug fix** diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 4b4b7e7e..94b0f13d 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -716,6 +716,8 @@ def __init__( b_ineq: Optional[np.ndarray] = None, force_all_finite: bool = True, drop_first: bool = False, + robust: bool = True, + expected_information: bool = False, ): self.l1_ratio = l1_ratio self.P1 = P1 @@ -746,6 +748,8 @@ def __init__( self.b_ineq = b_ineq self.force_all_finite = force_all_finite self.drop_first = drop_first + self.robust = robust + self.expected_information = expected_information @property def family_instance(self) -> ExponentialDispersionModel: @@ -1324,15 +1328,16 @@ def predict( def std_errors( self, - X, - y, + X=None, + y=None, mu=None, offset=None, sample_weight=None, dispersion=None, - robust=True, + robust=None, clusters: np.ndarray = None, - expected_information=False, + expected_information=None, + store_covariance_matrix=False, ): """Calculate standard errors for generalized linear models. 
@@ -1353,14 +1358,19 @@ def std_errors( Individual weights for each sample. dispersion : float, optional, default=None The dispersion parameter. Estimated if absent. - robust : boolean, optional, default=True + robust : boolean, optional, default=None Whether to compute robust standard errors instead of normal ones. + If not specified, the model's ``robust`` attribute is used. clusters : array-like, optional, default=None Array with clusters membership. Clustered standard errors are computed if clusters is not None. - expected_information : boolean, optional, default=False + expected_information : boolean, optional, default=None Whether to use the expected or observed information matrix. Only relevant when computing robust std-errors. + If not specified, the model's ``expected_information`` attribute is used. + store_covariance_matrix : boolean, optional, default=False + Whether to store the covariance matrix in the model instance. + If a covariance matrix has already been stored, it will be overwritten. """ return np.sqrt( self.covariance_matrix( @@ -1373,29 +1383,34 @@ def std_errors( robust=robust, clusters=clusters, expected_information=expected_information, + store_covariance_matrix=store_covariance_matrix, ).diagonal() ) def covariance_matrix( self, - X, - y, + X=None, + y=None, mu=None, offset=None, sample_weight=None, dispersion=None, - robust=True, - clusters: np.ndarray = None, - expected_information=False, + robust=None, + clusters: Optional[np.ndarray] = None, + expected_information=None, + store_covariance_matrix=False, + skip_checks=False, ): """Calculate the covariance matrix for generalized linear models. Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Training data. - y : array-like, shape (n_samples,) - Target values. + X : {array-like, sparse matrix}, shape (n_samples, n_features), optional + Training data. Can be omitted if a covariance matrix has already + been computed. + y : array-like, shape (n_samples,), optional + Target values. Can be omitted if a covariance matrix has already + been computed. mu : array-like, optional, default=None Array with predictions. Estimated if absent. offset : array-like, optional, default=None @@ -1404,14 +1419,21 @@ def covariance_matrix( Individual weights for each sample. dispersion : float, optional, default=None The dispersion parameter. Estimated if absent. - robust : boolean, optional, default=True + robust : boolean, optional, default=None Whether to compute robust standard errors instead of normal ones. + If not specified, the model's ``robust`` attribute is used. clusters : array-like, optional, default=None Array with clusters membership. Clustered standard errors are computed if clusters is not None. - expected_information : boolean, optional, default=False + expected_information : boolean, optional, default=None Whether to use the expected or observed information matrix. Only relevant when computing robust standard errors. + If not specified, the model's ``expected_information`` attribute is used. + store_covariance_matrix : boolean, optional, default=False + Whether to store the covariance matrix in the model instance. + If a covariance matrix has already been stored, it will be overwritten. + skip_checks : boolean, optional, default=False + Whether to skip input validation. For internal use only. 
Notes ----- @@ -1454,29 +1476,95 @@ def covariance_matrix( Cambridge university press """ + self.covariance_matrix_: Union[np.ndarray, None] - if isinstance(X, pd.DataFrame) and hasattr(self, "feature_dtypes_"): - X = _align_df_categories(X, self.feature_dtypes_) + if robust is None: + _robust = self.robust + else: + _robust = robust - X, y = check_X_y_tabmat_compliant( - X, - y, - accept_sparse=["csr", "csc", "coo"], - dtype="numeric", - copy=self._should_copy_X(), - ensure_2d=True, - allow_nd=False, - drop_first=self.drop_first, - ) + if expected_information is None: + _expected_information = self.expected_information + else: + _expected_information = expected_information - if isinstance(X, np.ndarray): - X = tm.DenseMatrix(X) - if sparse.issparse(X) and not isinstance(X, tm.SparseMatrix): - X = tm.SparseMatrix(X) + if ( + (hasattr(self, "alpha") and self.alpha is None) + or ( + hasattr(self, "alpha") + and isinstance(self.alpha, (int, float)) + and self.alpha > 0 + ) + or (hasattr(self, "alpha_") and self.alpha_ > 0) # glm_cv + or (hasattr(self, "_alphas") and self._alphas[-1] > 0) # alpha_search + ): + warnings.warn( + "Covariance matrix estimation assumes that the model is not " + "penalized. You are estimating a penalized model. The covariance " + "matrix will be incorrect." + ) + + if not skip_checks: + if (X is None or y is None) and self.covariance_matrix_ is None: + raise ValueError( + "Either X and y must be provided or the covariance matrix " + "must have been previously computed." + ) + + if (X is None or y is None) and store_covariance_matrix: + raise ValueError( + "X and y must be provided if 'store_covariance_matrix' is True." + ) + + if store_covariance_matrix and self.covariance_matrix_ is not None: + warnings.warn( + "A covariance matrix has already been computed. " + "It will be overwritten." + ) + + if X is None and y is None: + if ( + offset is not None + or mu is not None + or offset is not None + or sample_weight is not None + or dispersion is not None + or robust is not None + or clusters is not None + or expected_information is not None + ): + raise ValueError( + "Cannot reestimate the covariance matrix with different " + "parameters if X and y are not provided." + ) + return self.covariance_matrix_ + + if isinstance(X, pd.DataFrame) and hasattr(self, "feature_dtypes_"): + X = _align_df_categories(X, self.feature_dtypes_) + + X, y = check_X_y_tabmat_compliant( + X, + y, + accept_sparse=["csr", "csc", "coo"], + dtype="numeric", + copy=self._should_copy_X(), + ensure_2d=True, + allow_nd=False, + drop_first=self.drop_first, + ) + + if isinstance(X, np.ndarray): + X = tm.DenseMatrix(X) + if sparse.issparse(X) and not isinstance(X, tm.SparseMatrix): + X = tm.SparseMatrix(X) + + sample_weight = _check_weights( + sample_weight, + y.shape[0], + X.dtype, + force_all_finite=self.force_all_finite, + ) - sample_weight = _check_weights( - sample_weight, y.shape[0], X.dtype, force_all_finite=self.force_all_finite - ) sum_weights = np.sum(sample_weight) offset = _check_offset(offset, y.shape[0], X.dtype) @@ -1501,8 +1589,8 @@ def covariance_matrix( "Matrix is singular. Cannot estimate standard errors." 
            )

-        if robust or clusters is not None:
-            if expected_information:
+        if _robust or clusters is not None:
+            if _expected_information:
                 oim_fct = self._family_instance._fisher_information
             else:
                 oim_fct = self._family_instance._observed_information
@@ -1554,6 +1642,9 @@ def covariance_matrix(
                 sum_weights - self.n_features_in_ - int(self.fit_intercept)
             )
 
+        if store_covariance_matrix:
+            self.covariance_matrix_ = vcov
+
         return vcov
 
     # Note: check_estimator(GeneralizedLinearRegressor) might raise
@@ -2148,6 +2239,13 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
         Set this to True when alpha=0 and solver='auto' to prevent an error
         due to a singular feature matrix.
 
+    robust : bool, optional (default = True)
+        If true, then robust standard errors are computed by default.
+
+    expected_information : bool, optional (default = False)
+        If true, then the expected information matrix is computed by default.
+        Only relevant when computing robust standard errors.
+
     Attributes
     ----------
     coef_ : numpy.array, shape (n_features,)
@@ -2229,6 +2327,8 @@ def __init__(
         b_ineq: Optional[np.ndarray] = None,
         force_all_finite: bool = True,
         drop_first: bool = False,
+        robust: bool = True,
+        expected_information: bool = False,
     ):
         self.alphas = alphas
         self.alpha = alpha
@@ -2262,6 +2362,8 @@ def __init__(
             b_ineq=b_ineq,
             force_all_finite=force_all_finite,
             drop_first=drop_first,
+            robust=robust,
+            expected_information=expected_information,
         )
 
     def _validate_hyperparameters(self) -> None:
@@ -2308,6 +2410,8 @@ def fit(
         y: ArrayLike,
         sample_weight: Optional[ArrayLike] = None,
         offset: Optional[ArrayLike] = None,
+        store_covariance_matrix: bool = False,
+        clusters: Optional[np.ndarray] = None,
         # TODO: take out weights_sum (or use it properly)
         weights_sum: Optional[float] = None,
     ):
@@ -2343,6 +2447,15 @@ def fit(
             ``y`` by 3 if the link is linear and will multiply expected ``y``
             by 3 if the link is logarithmic.
 
+        store_covariance_matrix : bool, optional (default=False)
+            Whether to estimate and store the covariance matrix of the parameter
+            estimates. If ``True``, the covariance matrix will be available in the
+            ``covariance_matrix_`` attribute after fitting.
+
+        clusters : array-like, optional, default=None
+            Array with clusters membership. Clustered standard errors are
+            computed if clusters is not None.
+
         weights_sum: float, optional (default=None)
 
         Returns
@@ -2538,6 +2651,20 @@ def fit(
 
         self._tear_down_from_fit()
 
+        self.covariance_matrix_ = None
+        if store_covariance_matrix:
+            self.covariance_matrix(
+                X=X.unstandardize(),
+                y=y,
+                offset=offset,
+                sample_weight=sample_weight * weights_sum,
+                robust=self.robust,
+                clusters=clusters,
+                expected_information=self.expected_information,
+                store_covariance_matrix=True,
+                skip_checks=True,
+            )
+
         return self
 
     def _compute_information_criteria(
diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py
index 54979a34..cd27f1c4 100644
--- a/src/glum/_glm_cv.py
+++ b/src/glum/_glm_cv.py
@@ -273,6 +273,13 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase):
 
     deviance_path_: array, shape(n_folds, n_alphas)
         Deviance for the test set on each fold, varying alpha.
+
+    robust : bool, optional (default = True)
+        If true, then robust standard errors are computed by default.
+
+    expected_information : bool, optional (default = False)
+        If true, then the expected information matrix is computed by default.
+        Only relevant when computing robust standard errors.
""" def __init__( @@ -308,6 +315,8 @@ def __init__( cv=None, n_jobs: Optional[int] = None, drop_first: bool = False, + robust: bool = True, + expected_information: bool = False, ): self.alphas = alphas self.cv = cv @@ -341,6 +350,8 @@ def __init__( b_ineq=b_ineq, force_all_finite=force_all_finite, drop_first=drop_first, + robust=robust, + expected_information=expected_information, ) def _validate_hyperparameters(self) -> None: @@ -365,6 +376,8 @@ def fit( y: ArrayLike, sample_weight: Optional[ArrayLike] = None, offset: Optional[ArrayLike] = None, + store_covariance_matrix: bool = False, + clusters: Optional[np.ndarray] = None, ): r""" Choose the best model along a 'regularization path' by cross-validation. @@ -398,6 +411,15 @@ def fit( Added to linear predictor. An offset of 3 will increase expected ``y`` by 3 if the link is linear and will multiply expected ``y`` by 3 if the link is logarithmic. + + store_covariance_matrix : bool, optional (default=False) + Whether to store the covariance matrix of the parameter estimates + corresponding to the best best model. + + clusters : array-like, optional, default=None + Array with clusters membership. Clustered standard errors are + computed if clusters is not None. + """ self._validate_hyperparameters() @@ -694,4 +716,18 @@ def _get_deviance(coef): self._tear_down_from_fit() + self.covariance_matrix_ = None + if store_covariance_matrix: + self.covariance_matrix( + X=X.unstandardize(), + y=y, + offset=offset, + sample_weight=sample_weight * weights_sum, + robust=self.robust, + clusters=clusters, + expected_information=self.expected_information, + store_covariance_matrix=True, + skip_checks=True, + ) + return self diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 3890d17f..72b6086e 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -2113,3 +2113,134 @@ def test_P1_P2_with_drop_first(): regressor.fit(X, y) regressor = GeneralizedLinearRegressor(alpha=0.1, l1_ratio=0.5, P1=P_1, P2=P_2) regressor.fit(X, y) + + +@pytest.mark.parametrize("clustered", [True, False], ids=["clustered", "nonclustered"]) +@pytest.mark.parametrize("expected_information", [True, False], ids=["opg", "oim"]) +@pytest.mark.parametrize("robust", [True, False], ids=["robust", "nonrobust"]) +def test_store_covariance_matrix( + regression_data, robust, expected_information, clustered +): + X, y = regression_data + + if clustered: + rng = np.random.default_rng(42) + clu = rng.integers(5, size=len(y)) + else: + clu = None + + regressor = GeneralizedLinearRegressor( + family="gaussian", + alpha=0, + robust=robust, + expected_information=expected_information, + ) + regressor.fit(X, y, store_covariance_matrix=True, clusters=clu) + + np.testing.assert_array_almost_equal( + regressor.covariance_matrix( + X, y, robust=robust, expected_information=expected_information, clusters=clu + ), + regressor.covariance_matrix(), + ) + + np.testing.assert_array_almost_equal( + regressor.std_errors( + X, y, robust=robust, expected_information=expected_information, clusters=clu + ), + regressor.std_errors(), + ) + + +def test_store_covariance_matrix_errors(regression_data): + X, y = regression_data + + regressor = GeneralizedLinearRegressor(family="gaussian", alpha=0) + regressor.fit(X, y, store_covariance_matrix=False) + + with pytest.raises(ValueError, match="Either X and y must be provided"): + regressor.covariance_matrix() + + with pytest.raises(ValueError, match="Either X and y must be provided"): + regressor.covariance_matrix(X=X) + + with pytest.raises(ValueError, 
match="Either X and y must be provided"): + regressor.covariance_matrix(y=y) + + regressor.covariance_matrix(X, y, store_covariance_matrix=True) + + with pytest.raises( + ValueError, match="Cannot reestimate the covariance matrix with different" + ): + regressor.covariance_matrix(robust=False) + + with pytest.warns(match="A covariance matrix has already been computed."): + regressor.covariance_matrix(X, y, store_covariance_matrix=True) + + regressor_penalized = GeneralizedLinearRegressor(family="gaussian", alpha=0.1) + with pytest.warns(match="Covariance matrix estimation assumes"): + regressor_penalized.fit(X, y, store_covariance_matrix=True) + + +@pytest.mark.parametrize("clustered", [True, False], ids=["clustered", "nonclustered"]) +@pytest.mark.parametrize("expected_information", [True, False], ids=["opg", "oim"]) +@pytest.mark.parametrize("robust", [True, False], ids=["robust", "nonrobust"]) +def test_store_covariance_matrix_alpha_search( + regression_data, robust, expected_information, clustered +): + X, y = regression_data + + if clustered: + rng = np.random.default_rng(42) + clu = rng.integers(5, size=len(y)) + else: + clu = None + + regressor = GeneralizedLinearRegressor( + family="gaussian", + alpha=[0, 0.1, 0.5], + alpha_search=True, + robust=robust, + expected_information=expected_information, + ) + with pytest.warns(match="Covariance matrix estimation assumes"): + regressor.fit(X, y, store_covariance_matrix=True, clusters=clu) + + np.testing.assert_array_almost_equal( + regressor.covariance_matrix( + X, y, robust=robust, expected_information=expected_information, clusters=clu + ), + regressor.covariance_matrix(), + ) + + +@pytest.mark.parametrize("clustered", [True, False], ids=["clustered", "nonclustered"]) +@pytest.mark.parametrize("expected_information", [True, False], ids=["opg", "oim"]) +@pytest.mark.parametrize("robust", [True, False], ids=["robust", "nonrobust"]) +def test_store_covariance_matrix_cv( + regression_data, robust, expected_information, clustered +): + X, y = regression_data + + if clustered: + rng = np.random.default_rng(42) + clu = rng.integers(5, size=len(y)) + else: + clu = None + + regressor = GeneralizedLinearRegressorCV( + family="gaussian", + n_alphas=5, + robust=robust, + expected_information=expected_information, + ) + with pytest.warns(match="Covariance matrix estimation assumes"): + # regressor.alpha_ == 1e-5 > 0 + regressor.fit(X, y, store_covariance_matrix=True, clusters=clu) + + np.testing.assert_array_almost_equal( + regressor.covariance_matrix( + X, y, robust=robust, expected_information=expected_information, clusters=clu + ), + regressor.covariance_matrix(), + ) From fad75ff7d414e29f0f80f43e52076359d537a392 Mon Sep 17 00:00:00 2001 From: "quant-ranger[bot]" <132915763+quant-ranger[bot]@users.noreply.github.com> Date: Mon, 14 Aug 2023 06:20:31 +0200 Subject: [PATCH 08/63] Pre-commit autoupdate (#676) Co-authored-by: quant-ranger[bot] <132915763+quant-ranger[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4f1d68ec..e3748d32 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: - id: isort-conda additional_dependencies: [toml] - repo: https://github.com/Quantco/pre-commit-mirrors-mypy - rev: "1.4.1" + rev: "1.5.0" hooks: - id: mypy-conda args: @@ -37,7 +37,7 @@ repos: exclude: ^tests/ additional_dependencies: [-c, conda-forge, types-setuptools=67.5, attrs] - repo: 
https://github.com/Quantco/pre-commit-mirrors-pyupgrade - rev: 3.9.0 + rev: 3.10.1 hooks: - id: pyupgrade-conda exclude: ^src/glum_benchmarks/orig_sklearn_fork/ From af87010cae3b29076dca8b1cc320652612235ce7 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 14 Aug 2023 14:36:21 +0200 Subject: [PATCH 09/63] Fix covariance_matrix dtypes --- src/glum/_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 94b0f13d..8ba42314 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -1621,7 +1621,7 @@ def covariance_matrix( / (sum_weights - self.n_features_in_ - int(self.fit_intercept)) ) else: - inner_part = gradient.sandwich(np.ones_like(y)) + inner_part = gradient.sandwich(np.ones_like(y, dtype=X.dtype)) correction = sum_weights / ( sum_weights - self.n_features_in_ - int(self.fit_intercept) ) From 940b26024f519ecea44ba9858e87d3a94b572be7 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 16 Aug 2023 09:46:34 +0200 Subject: [PATCH 10/63] Make CI use pre-release tabmat --- .github/workflows/conda-build.yml | 18 +++++++++--------- .github/workflows/daily.yml | 2 +- conda.recipe/meta.yaml | 2 +- environment-benchmark.yml | 1 + environment.yml | 3 ++- setup.py | 2 +- 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/conda-build.yml b/.github/workflows/conda-build.yml index c18dba23..968595be 100644 --- a/.github/workflows/conda-build.yml +++ b/.github/workflows/conda-build.yml @@ -20,15 +20,15 @@ jobs: fail-fast: false matrix: include: - - { conda_build_yml: linux_64_numpy1.20python3.8.____cpython, os: ubuntu-latest, conda-build-args: '' } - - { conda_build_yml: linux_64_numpy1.20python3.9.____cpython, os: ubuntu-latest, conda-build-args: '' } - - { conda_build_yml: linux_64_numpy1.23python3.11.____cpython, os: ubuntu-latest, conda-build-args: '' } - - { conda_build_yml: osx_64_numpy1.20python3.9.____cpython, os: macos-latest, conda-build-args: '' } - - { conda_build_yml: osx_64_numpy1.23python3.11.____cpython, os: macos-latest, conda-build-args: '' } - - { conda_build_yml: osx_arm64_numpy1.20python3.8.____cpython, os: macos-latest, conda-build-args: ' --no-test' } - - { conda_build_yml: osx_arm64_numpy1.21python3.10.____cpython, os: macos-latest, conda-build-args: ' --no-test' } - - { conda_build_yml: win_64_numpy1.20python3.8.____cpython, os: windows-latest, conda-build-args: '' } - - { conda_build_yml: win_64_numpy1.23python3.11.____cpython, os: windows-latest, conda-build-args: '' } + - { conda_build_yml: linux_64_numpy1.20python3.8.____cpython, os: ubuntu-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge' } + - { conda_build_yml: linux_64_numpy1.20python3.9.____cpython, os: ubuntu-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge' } + - { conda_build_yml: linux_64_numpy1.23python3.11.____cpython, os: ubuntu-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge' } + - { conda_build_yml: osx_64_numpy1.20python3.9.____cpython, os: macos-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge' } + - { conda_build_yml: osx_64_numpy1.23python3.11.____cpython, os: macos-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge' } + - { conda_build_yml: osx_arm64_numpy1.20python3.8.____cpython, os: macos-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge --no-test' } + - { conda_build_yml: osx_arm64_numpy1.21python3.10.____cpython, os: macos-latest, 
conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge --no-test' } + - { conda_build_yml: win_64_numpy1.20python3.8.____cpython, os: windows-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge' } + - { conda_build_yml: win_64_numpy1.23python3.11.____cpython, os: windows-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge' } steps: - name: Checkout branch uses: actions/checkout@v3 diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index 0a6c2bcc..1b7f62c3 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -44,7 +44,7 @@ jobs: done echo Install tabmat nightly micromamba remove -y --force tabmat - pip install git+https://github.com/Quantco/tabmat + pip install git+https://github.com/Quantco/tabmat@tabmat-v4 - name: Install repository shell: bash -el {0} run: pip install --no-use-pep517 --no-deps --disable-pip-version-check -e . diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 79bee267..eb5356c8 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -35,7 +35,7 @@ requirements: - pandas - scikit-learn >=0.23 - scipy - - tabmat >=3.1.0 + - tabmat >=4.0.0a test: requires: diff --git a/environment-benchmark.yml b/environment-benchmark.yml index 6c189af5..3dd449de 100644 --- a/environment-benchmark.yml +++ b/environment-benchmark.yml @@ -1,5 +1,6 @@ name: glum channels: + - conda-forge/label/tabmat_dev - conda-forge - nodefaults dependencies: diff --git a/environment.yml b/environment.yml index caf38614..0aa7c240 100644 --- a/environment.yml +++ b/environment.yml @@ -1,5 +1,6 @@ name: glum channels: + - conda-forge/label/tabmat_dev - conda-forge - nodefaults dependencies: @@ -8,7 +9,7 @@ dependencies: - libblas>=0=*mkl # comment this line out for macOS arm64 - numexpr - pandas>=0.21 - - tabmat>=3.1.0 + - tabmat>=4.0.0a - scikit-learn>=0.23 - scipy - tqdm diff --git a/setup.py b/setup.py index edc063f3..016f12c3 100644 --- a/setup.py +++ b/setup.py @@ -85,7 +85,7 @@ "pandas", "scikit-learn>=0.23", "scipy", - "tabmat>=3.1.0", + "tabmat>=4.0.0a", ], entry_points=None if os.environ.get("CONDA_BUILD") From fb026c5b2a09ef56371a9b89128e7542053df19c Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 17 Aug 2023 12:04:39 +0200 Subject: [PATCH 11/63] =?UTF-8?q?Column=20names=20=20=C3=A0=20la=20Tabmat?= =?UTF-8?q?=20#278=20(#678)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Delegate column naming to tabmat * Add tests * More tests * Test for dropping complete categories * Add docstrings for new argument * Add changelog entry * Convert to pandas at the correct place * Reorganize converting from pandas * Remove xfail from test --- CHANGELOG.rst | 2 + src/glum/_glm.py | 73 ++++++++++++++---------- src/glum/_glm_cv.py | 8 +++ tests/glm/test_glm.py | 126 +++++++++++++++++++++++++++++++++++++++--- 4 files changed, 170 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e45052e7..6579846e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,8 @@ Changelog - Added the complementary log-log (`cloglog`) link function. - Added the option to store the covariance matrix after estimating it. In this case, the covariance matrix does not have to be recomputed when calling inference methods. +- Improved feature name handling. Feature names are now created for non-pandas input matrices, too. Furthermore, the format of categorical features can be specified by the user. 
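The two changelog entries above map to the `feature_names_`/`term_names_` attributes and the `categorical_format` parameter added in the diffs below. A sketch of the resulting behavior, following the tests later in this patch; the data frame and level names are made up, and this block is not part of the diff:

```python
# Sketch: feature vs. term names for a categorical column.
import pandas as pd
from glum import GeneralizedLinearRegressor

df = pd.DataFrame(
    {"age": pd.Categorical(["a", "b", "a", "b"]), "bm": [1.0, 2.0, 3.0, 4.0]}
)
model = GeneralizedLinearRegressor(
    family="normal", categorical_format="{name}[{category}]"
).fit(df, [0.0, 1.0, 2.0, 3.0])

model.feature_names_  # ["age[a]", "age[b]", "bm"] -- one name per column
model.term_names_     # ["age", "age", "bm"]       -- one name per term
```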
+- Term names are now stored in the model's attributes. This is useful for categorical features, where they refer to the whole variable, not just single levels. **Bug fix** diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 8ba42314..ffbda1a2 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -87,8 +87,8 @@ def check_array_tabmat_compliant(mat: ArrayLike, drop_first: int = False, **kwargs): to_copy = kwargs.get("copy", False) - if isinstance(mat, pd.DataFrame) and any(mat.dtypes == "category"): - mat = tm.from_pandas(mat, drop_first=drop_first) + if isinstance(mat, pd.DataFrame): + raise RuntimeError("DataFrames should have been converted by this point.") if isinstance(mat, tm.SplitMatrix): kwargs.update({"ensure_min_features": 0}) @@ -117,7 +117,9 @@ def check_array_tabmat_compliant(mat: ArrayLike, drop_first: int = False, **kwar res = check_array(mat, **kwargs) if res is not mat and original_type in (tm.DenseMatrix, tm.SparseMatrix): - res = original_type(res) # type: ignore + res = original_type( + res, column_names=mat.column_names, term_names=mat.term_names # type: ignore + ) return res @@ -718,6 +720,7 @@ def __init__( drop_first: bool = False, robust: bool = True, expected_information: bool = False, + categorical_format: str = "{name}[{category}]", ): self.l1_ratio = l1_ratio self.P1 = P1 @@ -750,6 +753,7 @@ def __init__( self.drop_first = drop_first self.robust = robust self.expected_information = expected_information + self.categorical_format = categorical_format @property def family_instance(self) -> ExponentialDispersionModel: @@ -823,6 +827,19 @@ def _get_start_coef( return coef + def _convert_from_pandas(self, df: pd.DataFrame) -> tm.MatrixBase: + """Convert a pandas data frame to a tabmat matrix.""" + if hasattr(self, "feature_dtypes_"): + df = _align_df_categories(df, self.feature_dtypes_) + + X = tm.from_pandas( + df, + drop_first=self.drop_first, + categorical_format=self.categorical_format, + ) + + return X + def _set_up_for_fit(self, y: np.ndarray) -> None: ####################################################################### # 1. input validation # @@ -1233,6 +1250,9 @@ def linear_predictor( elif alpha is not None: alpha_index = [self._find_alpha_index(a) for a in alpha] # type: ignore + if isinstance(X, pd.DataFrame): + X = self._convert_from_pandas(X) + X = check_array_tabmat_compliant( X, accept_sparse=["csr", "csc", "coo"], @@ -1303,18 +1323,9 @@ def predict( array, shape (n_samples, n_alphas) Predicted values times ``sample_weight``. 
""" - if isinstance(X, pd.DataFrame) and hasattr(self, "feature_dtypes_"): - X = _align_df_categories(X, self.feature_dtypes_) + if isinstance(X, pd.DataFrame): + X = self._convert_from_pandas(X) - X = check_array_tabmat_compliant( - X, - accept_sparse=["csr", "csc", "coo"], - dtype="numeric", - copy=self._should_copy_X(), - ensure_2d=True, - allow_nd=False, - drop_first=self.drop_first, - ) eta = self.linear_predictor( X, offset=offset, alpha_index=alpha_index, alpha=alpha ) @@ -1539,8 +1550,8 @@ def covariance_matrix( ) return self.covariance_matrix_ - if isinstance(X, pd.DataFrame) and hasattr(self, "feature_dtypes_"): - X = _align_df_categories(X, self.feature_dtypes_) + if isinstance(X, pd.DataFrame): + X = self._convert_from_pandas(X) X, y = check_X_y_tabmat_compliant( X, @@ -1839,16 +1850,6 @@ def _set_up_and_check_fit_args( self.feature_dtypes_ = X.dtypes.to_dict() if any(X.dtypes == "category"): - self.feature_names_ = list( - chain.from_iterable( - _name_categorical_variables( - dtype.categories, column, self.drop_first - ) - if pd.api.types.is_categorical_dtype(dtype) - else [column] - for column, dtype in zip(X.columns, X.dtypes) - ) - ) def _expand_categorical_penalties(penalty, X, drop_first): """ @@ -1885,10 +1886,11 @@ def _expand_categorical_penalties(penalty, X, drop_first): P1 = _expand_categorical_penalties(self.P1, X, self.drop_first) P2 = _expand_categorical_penalties(self.P2, X, self.drop_first) - X = tm.from_pandas(X, drop_first=self.drop_first) - else: - self.feature_names_ = X.columns - X = tm.from_pandas(X) + X = tm.from_pandas( + X, + drop_first=self.drop_first, + categorical_format=self.categorical_format, + ) if not self._is_contiguous(X): if self.copy_X is not None and not self.copy_X: @@ -1961,6 +1963,9 @@ def _expand_categorical_penalties(penalty, X, drop_first): ####################################################################### X = tm.as_tabmat(X) + self.feature_names_ = X.get_names(type="column", missing_prefix="_col_") + self.term_names_ = X.get_names(type="term", missing_prefix="_col_") + return X, y, sample_weight, offset, weights_sum, P1, P2 @@ -2246,6 +2251,12 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): If true, then the expected information matrix is computed by default. Only relevant when computing robust standard errors. + categorical_features : str, optional (default = "{name}[{category}]") + Format string for categorical features. The format string should + contain the placeholder ``{name}`` for the feature name and + ``{category}`` for the category name. Only used if ``X`` is a pandas + DataFrame. + Attributes ---------- coef_ : numpy.array, shape (n_features,) @@ -2329,6 +2340,7 @@ def __init__( drop_first: bool = False, robust: bool = True, expected_information: bool = False, + categorical_format: str = "{name}[{category}]", ): self.alphas = alphas self.alpha = alpha @@ -2364,6 +2376,7 @@ def __init__( drop_first=drop_first, robust=robust, expected_information=expected_information, + categorical_format=categorical_format, ) def _validate_hyperparameters(self) -> None: diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index cd27f1c4..cf641612 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -280,6 +280,12 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase): expected_information : bool, optional (default = False) If true, then the expected information matrix is computed by default. Only relevant when computing robust standard errors. 
+ + categorical_features : str, optional (default = "{name}[{category}]") + Format string for categorical features. The format string should + contain the placeholder ``{name}`` for the feature name and + ``{category}`` for the category name. Only used if ``X`` is a pandas + DataFrame. """ def __init__( @@ -317,6 +323,7 @@ def __init__( drop_first: bool = False, robust: bool = True, expected_information: bool = False, + categorical_format: str = "{name}[{category}]", ): self.alphas = alphas self.cv = cv @@ -352,6 +359,7 @@ def __init__( drop_first=drop_first, robust=robust, expected_information=expected_information, + categorical_format=categorical_format, ) def _validate_hyperparameters(self) -> None: diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 72b6086e..f1ded8fb 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -1724,7 +1724,7 @@ def test_passing_noncontiguous_as_X(): "X, feature_names", [ (pd.DataFrame({"x1": np.arange(5), "x2": 2}), np.array(["x1", "x2"])), - (pd.DataFrame({"x1": np.arange(5), "x2": 2}).to_numpy(), None), + (pd.DataFrame({"x1": np.arange(5), "x2": 2}).to_numpy(), ["_col_0", "_col_1"]), ( pd.DataFrame({"x1": pd.Categorical(np.arange(5)), "x2": 2}), np.array(["x1__0", "x1__1", "x1__2", "x1__3", "x1__4", "x2"]), @@ -1738,13 +1738,122 @@ def test_passing_noncontiguous_as_X(): ), np.array(["x1__0", "x1__1", "x1__2", "x1__3", "x1__4", "x2__2"]), ), + ( + tm.SplitMatrix( + [ + tm.CategoricalMatrix( + np.arange(5), column_name_format="{name}__{category}" + ), + tm.DenseMatrix(np.ones((5, 1))), + ] + ), + np.array( + [ + "_col_0-4__0", + "_col_0-4__1", + "_col_0-4__2", + "_col_0-4__3", + "_col_0-4__4", + "_col_5", + ] + ), + ), ], ) -def test_feature_names(X, feature_names): - model = GeneralizedLinearRegressor(family="poisson").fit(X, np.arange(5)) +def test_feature_names_underscores(X, feature_names): + model = GeneralizedLinearRegressor( + family="poisson", categorical_format="{name}__{category}" + ).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, "feature_names_", None), feature_names) +@pytest.mark.parametrize( + "X, feature_names", + [ + (pd.DataFrame({"x1": np.arange(5), "x2": 2}), np.array(["x1", "x2"])), + (pd.DataFrame({"x1": np.arange(5), "x2": 2}).to_numpy(), ["_col_0", "_col_1"]), + ( + pd.DataFrame({"x1": pd.Categorical(np.arange(5)), "x2": 2}), + np.array(["x1[0]", "x1[1]", "x1[2]", "x1[3]", "x1[4]", "x2"]), + ), + ( + pd.DataFrame( + { + "x1": pd.Categorical(np.arange(5)), + "x2": pd.Categorical([2, 2, 2, 2, 2]), + } + ), + np.array(["x1[0]", "x1[1]", "x1[2]", "x1[3]", "x1[4]", "x2[2]"]), + ), + ( + tm.SplitMatrix( + [ + tm.CategoricalMatrix( + np.arange(5), column_name_format="{name}[{category}]" + ), + tm.DenseMatrix(np.ones((5, 1))), + ] + ), + np.array( + [ + "_col_0-4[0]", + "_col_0-4[1]", + "_col_0-4[2]", + "_col_0-4[3]", + "_col_0-4[4]", + "_col_5", + ] + ), + ), + ], +) +def test_feature_names_brackets(X, feature_names): + model = GeneralizedLinearRegressor( + family="poisson", categorical_format="{name}[{category}]" + ).fit(X, np.arange(5)) + np.testing.assert_array_equal(getattr(model, "feature_names_", None), feature_names) + + +@pytest.mark.parametrize( + "X, term_names", + [ + (pd.DataFrame({"x1": np.arange(5), "x2": 2}), np.array(["x1", "x2"])), + (pd.DataFrame({"x1": np.arange(5), "x2": 2}).to_numpy(), ["_col_0", "_col_1"]), + ( + pd.DataFrame({"x1": pd.Categorical(np.arange(5)), "x2": 2}), + np.array(["x1", "x1", "x1", "x1", "x1", "x2"]), + ), + ( + pd.DataFrame( + { + "x1": 
pd.Categorical(np.arange(5)),
+                    "x2": pd.Categorical([2, 2, 2, 2, 2]),
+                }
+            ),
+            np.array(["x1", "x1", "x1", "x1", "x1", "x2"]),
+        ),
+        (
+            tm.SplitMatrix(
+                [tm.CategoricalMatrix(np.arange(5)), tm.DenseMatrix(np.ones((5, 1)))]
+            ),
+            np.array(
+                [
+                    "_col_0-4",
+                    "_col_0-4",
+                    "_col_0-4",
+                    "_col_0-4",
+                    "_col_0-4",
+                    "_col_5",
+                ]
+            ),
+        ),
+    ],
+)
+def test_term_names(X, term_names):
+    model = GeneralizedLinearRegressor(family="poisson").fit(X, np.arange(5))
+    np.testing.assert_array_equal(getattr(model, "term_names_", None), term_names)
+
+
 @pytest.mark.parametrize(
     "X, dtypes",
     [
@@ -2090,15 +2199,14 @@ def test_drop_first_allows_alpha_equals_0():
     regressor.fit(X, y)
 
 
-def test_error_on_distinct_categorical_column():
+def test_dropping_distinct_categorical_column():
     y = np.random.normal(size=10)
-    X = pd.DataFrame(data={"cat": pd.Categorical(np.ones(10))})
+    X = pd.DataFrame(data={"cat": pd.Categorical(np.ones(10)), "num": np.ones(10)})
     regressor = GeneralizedLinearRegressor(alpha=0, drop_first=True)
-    with pytest.raises(ValueError):
-        regressor.fit(X, y)
-
-    regressor = GeneralizedLinearRegressor(alpha=0)
     regressor.fit(X, y)
+    assert regressor.coef_.shape == (1,)
+    assert regressor.feature_names_ == ["num"]
+    assert regressor.term_names_ == ["num"]
 
 
 def test_P1_P2_with_drop_first():

From 9a20282424101d02ad9c647e24505501e0bfadc8 Mon Sep 17 00:00:00 2001
From: Martin Stancsics
Date: Mon, 28 Aug 2023 08:07:02 +0200
Subject: [PATCH 12/63] Formula interface (#670)

* Add formulaic to dependencies
* Add function for transforming the formula
* Add tests
* First draft of glum formula interface
* Fixes and tests
* Handle intercept correctly
* Add formula functionality to glm_cv
* Variables from local context
* Test predict with formulas
* Add formula tutorial
* Fix tutorial
* Reformat tutorial
* Improve function signatures and docstrings
* Handle two-sided formulas in covariance_matrix
* Make mypy happy about module names
* Matthias' suggestions
* Improve tutorial
* Improve tutorial
---
 CHANGELOG.rst                                 |    1 +
 conda.recipe/meta.yaml                        |    1 +
 .../formula_interface/formula_interface.ipynb | 1709 +++++++++++++++++
 .../load_transform_formula.py                 |   63 +
 docs/tutorials/tutorials.rst                  |    1 +
 environment.yml                               |    1 +
 setup.py                                      |    1 +
 src/glum/_glm.py                              |  230 ++-
 src/glum/_glm_cv.py                           |   22 +
 tests/glm/test_glm.py                         |  306 ++-
 10 files changed, 2289 insertions(+), 46 deletions(-)
 create mode 100644 docs/tutorials/formula_interface/formula_interface.ipynb
 create mode 100644 docs/tutorials/formula_interface/load_transform_formula.py

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 4c756af5..f987f137 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -12,6 +12,7 @@ Changelog
 
 **New features**
 
+- Added a formula interface for specifying models.
 - Improved feature name handling. Feature names are now created for non-pandas input matrices, too. Furthermore, the format of categorical features can be specified by the user.
 - Term names are now stored in the model's attributes. This is useful for categorical features, where they refer to the whole variable, not just single levels. 
diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index eb5356c8..54523bc8 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -35,6 +35,7 @@ requirements: - pandas - scikit-learn >=0.23 - scipy + - formulaic >=0.4 - tabmat >=4.0.0a test: diff --git a/docs/tutorials/formula_interface/formula_interface.ipynb b/docs/tutorials/formula_interface/formula_interface.ipynb new file mode 100644 index 00000000..9890d28b --- /dev/null +++ b/docs/tutorials/formula_interface/formula_interface.ipynb @@ -0,0 +1,1709 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Formula Interface Tutorial: Revisiting French Motor Third-Party Liability Claims\n", + "\n", + "\n", + "**Intro**\n", + "\n", + "This tutorial showcases the formula interface of `glum`. It allows for the specification of the design matrix and the response variable using so-called [Wilkinson-formulas](https://www.jstor.org/stable/2346786) instead of constructing it by hand. This kind of model specification should be familiar to R users or those who have used the `statsmodels` or `linearmodels` Python packages before. This tutorial aims to introduce the basics of working with formulas to other users, as well as highlighting some important differences between `glum`s and other packages' formula implementations.\n", + "\n", + "For a more in-depth look at how formulas work, please take a look at the [documentation of `formulaic`](https://matthewwardrop.github.io/formulaic/), the package on which `glum`'s formula interface is based.\n", + "\n", + "\n", + "**Background**\n", + "\n", + "This tutorial reimplements and extends the combined frequency-severity model from Chapter 4 of the [GLM tutorial](tutorials/glm_french_motor_tutorial/glm_french_motor.html). If you would like to know more about the setting, the data, or GLM modeling in general, please check that out first.\n", + "\n", + "**Sneak Peak**\n", + "\n", + "Formulas can provide a concise and convenient way to specify many of the usual pre-processing steps, such as converting to categorical types, creating interactions, applying transformations, or even spline interpolation. As an example, consider the following formula:\n", + "\n", + "```\n", + "{ClaimAmountCut / Exposure} ~ C(DrivAge, missing_method='convert') * C(VehPower, missing_method=\"zero\") + bs(BonusMalus, 3) + 1\n", + "```\n", + "\n", + "Despite its brevity, it describes all of the following:\n", + " - The outcome variable is the ratio of `ClaimAmountCut` and `Exposure`.\n", + " - The predictors should include the interactions of the categorical variables `DrivAge` and `VehPower`, as well as those two variables themselves. (Even though they behave as such, neither the individual variables nor their interaction will be dummy-encoded by glum. For categoricals with many levels, this can lead to a substantial performance improvement over dummy encoding, especially for the interaction.)\n", + " - If there are missing values in `DrivAge`, they should be treated as a separate category.\n", + " - On the other hand, missing values in `VehPower` should be treated as all-zero indicators.\n", + " - The predictors should also include a third degree B-spline interpolation of `BonusMalus`.\n", + " - The model should include an intercept.\n", + "\n", + "The following chapters demonstrate each of these features in some detail, as well as some additional advantages of using the formula interface." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "* [1. Load and Prepare Datasets from Openml](#1.-Load-and-Prepare-Datasets-from-Openml)\n", + "* [2. Reproducing the model from the GLM Tutorial](#2.-Reproducing-the-model-from-the-GLM-Tutorial)\n", + "* [3. Categorical Variables](#3.-Categorical-Variables)\n", + "* [4. Interactions and Structural Full-rankness](#4.-Interactions-and-Structural-Full-rankness)\n", + "* [5. Fun with Functions](#5.-Fun-with-Functions)\n", + "* [6. Miscellaneous Features](#6.-Miscellaneous-Features)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import scipy.optimize as optimize\n", + "import scipy.stats\n", + "from dask_ml.preprocessing import Categorizer\n", + "from sklearn.metrics import mean_absolute_error\n", + "from sklearn.model_selection import ShuffleSplit\n", + "from glum import GeneralizedLinearRegressor\n", + "from glum import TweedieDistribution\n", + "\n", + "from load_transform_formula import load_transform" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Load and Prepare Datasets from Openml\n", + "[back to table of contents](#Table-of-Contents)\n", + "\n", + "First, we load in our [dataset from openML](https://www.openml.org/d/41214) and apply several transformations. In the interest of simplicity, we do not include the data loading and preparation code in this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ClaimNbExposureAreaVehPowerVehAgeDrivAgeBonusMalusVehBrandVehGasDensityRegionClaimAmountClaimAmountCut
IDpol
100.10000D50550B12Regular1217R820.00.0
300.77000D50550B12Regular1217R820.00.0
500.75000B61550B12Diesel54R220.00.0
1000.09000B70450B12Diesel76R720.00.0
1100.84000B70450B12Diesel76R720.00.0
..........................................
611432600.00274E40550B12Regular3317R930.00.0
611432700.00274E40495B12Regular9850R110.00.0
611432800.00274D61450B12Diesel1323R820.00.0
611432900.00274B40550B12Regular95R260.00.0
611433000.00274B71254B12Diesel65R720.00.0
\n", + "

678013 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus \\\n", + "IDpol \n", + "1 0 0.10000 D 5 0 5 50 \n", + "3 0 0.77000 D 5 0 5 50 \n", + "5 0 0.75000 B 6 1 5 50 \n", + "10 0 0.09000 B 7 0 4 50 \n", + "11 0 0.84000 B 7 0 4 50 \n", + "... ... ... ... ... ... ... ... \n", + "6114326 0 0.00274 E 4 0 5 50 \n", + "6114327 0 0.00274 E 4 0 4 95 \n", + "6114328 0 0.00274 D 6 1 4 50 \n", + "6114329 0 0.00274 B 4 0 5 50 \n", + "6114330 0 0.00274 B 7 1 2 54 \n", + "\n", + " VehBrand VehGas Density Region ClaimAmount ClaimAmountCut \n", + "IDpol \n", + "1 B12 Regular 1217 R82 0.0 0.0 \n", + "3 B12 Regular 1217 R82 0.0 0.0 \n", + "5 B12 Diesel 54 R22 0.0 0.0 \n", + "10 B12 Diesel 76 R72 0.0 0.0 \n", + "11 B12 Diesel 76 R72 0.0 0.0 \n", + "... ... ... ... ... ... ... \n", + "6114326 B12 Regular 3317 R93 0.0 0.0 \n", + "6114327 B12 Regular 9850 R11 0.0 0.0 \n", + "6114328 B12 Diesel 1323 R82 0.0 0.0 \n", + "6114329 B12 Regular 95 R26 0.0 0.0 \n", + "6114330 B12 Diesel 65 R72 0.0 0.0 \n", + "\n", + "[678013 rows x 13 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df = load_transform()\n", + "with pd.option_context('display.max_rows', 10):\n", + " display(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Reproducing the Model From the GLM Turorial\n", + "\n", + "Now, let us start by fitting a very simple model. As usual, let's divide our samples into a training and a test set so that we get valid out-of-sample goodness-of-fit measures. Perhaps less usually, we do not create separate `y` and `X` data frames for our label and features – the formula will take care of that for us.\n", + "\n", + "We still have some preprocessing to do:\n", + " - Many of the ordinal or nominal variables are encoded as integers, instead of as categoricals. We will need to convert these so that `glum` will know to estimate a separate coefficient for each of their levels.\n", + " - The outcome variable is a transformation of other columns. We need to create it first.\n", + "\n", + "As we will see later on, these steps can be incorporated into the formula itself, but let's not overcomplicate things at first." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=42)\n", + "train, test = next(ss.split(df))\n", + "\n", + "df = df.assign(PurePremium=lambda x: x[\"ClaimAmountCut\"] / x[\"Exposure\"])\n", + "\n", + "glm_categorizer = Categorizer(\n", + " columns=[\"VehBrand\", \"VehGas\", \"Region\", \"Area\", \"DrivAge\", \"VehAge\", \"VehPower\"]\n", + ")\n", + "df_train = glm_categorizer.fit_transform(df.iloc[train])\n", + "df_test = glm_categorizer.transform(df.iloc[test])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example demonstrates the basic idea behind formulas: the outcome variable and the predictors are separated by a tilde (`~`), and different prefictors are separated by plus signs (`+`). Thus, formulas provide a concise way of specifying a model without the need to create dataframes by hand." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interceptVehBrand[B1]VehBrand[B10]VehBrand[B11]VehBrand[B12]VehBrand[B13]VehBrand[B14]VehBrand[B2]VehBrand[B3]VehBrand[B4]...VehAge[1]VehAge[2]VehPower[4]VehPower[5]VehPower[6]VehPower[7]VehPower[8]VehPower[9]BonusMalusDensity
coefficient2.88667-0.0641570.00.231868-0.2110610.054979-0.270346-0.0714530.002910.059324...0.008117-0.229906-0.111796-0.1233880.0607570.005179-0.0218320.2081580.0325080.000002
\n", + "

1 rows × 60 columns

\n", + "
" + ], + "text/plain": [ + " intercept VehBrand[B1] VehBrand[B10] VehBrand[B11] \\\n", + "coefficient 2.88667 -0.064157 0.0 0.231868 \n", + "\n", + " VehBrand[B12] VehBrand[B13] VehBrand[B14] VehBrand[B2] \\\n", + "coefficient -0.211061 0.054979 -0.270346 -0.071453 \n", + "\n", + " VehBrand[B3] VehBrand[B4] ... VehAge[1] VehAge[2] \\\n", + "coefficient 0.00291 0.059324 ... 0.008117 -0.229906 \n", + "\n", + " VehPower[4] VehPower[5] VehPower[6] VehPower[7] VehPower[8] \\\n", + "coefficient -0.111796 -0.123388 0.060757 0.005179 -0.021832 \n", + "\n", + " VehPower[9] BonusMalus Density \n", + "coefficient 0.208158 0.032508 0.000002 \n", + "\n", + "[1 rows x 60 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula = \"PurePremium ~ VehBrand + VehGas + Region + Area + DrivAge + VehAge + VehPower + BonusMalus + Density\"\n", + "\n", + "TweedieDist = TweedieDistribution(1.5)\n", + "t_glm1 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula,\n", + ")\n", + "t_glm1.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm1.intercept_], t_glm1.coef_))},\n", + " index=[\"intercept\"] + t_glm1.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Categorical Variables\n", + "\n", + "`glum` also provides extensive support for categorical variables. The main function one needs to be aware of in the context of categoricals is simply called `C()`. A variable placed within it is always converted to a categorical, regardless of its type.\n", + "\n", + "A huge part of tabmat's/glum's performance advantage is that categoricals need not be one-hot encoded, but are treated as if they were. For this reason, we do not support using other coding schemes within the formula interface. If one needs to use other categorical encodings than one-hot, they can always do so manually (or even using `formulaic` directly) before the estimation.\n", + "\n", + "Let's try it out on our dataset!" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ClaimNb int64\n", + "Exposure float64\n", + "Area object\n", + "VehPower int64\n", + "VehAge int64\n", + "DrivAge int64\n", + "BonusMalus int64\n", + "VehBrand object\n", + "VehGas object\n", + "Density int64\n", + "Region object\n", + "ClaimAmount float64\n", + "ClaimAmountCut float64\n", + "PurePremium float64\n", + "dtype: object" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train_noncat = df.iloc[train]\n", + "df_test_noncat = df.iloc[test]\n", + "\n", + "df_train_noncat.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Even though some of the variables are integers in this dataset, they are handled as categoricals thanks to the `C()` function. Strings, such as `VehBrand` or `VehGas` would have been handled as categorical by default anyway, but using the `C()` function never hurts: if applied to something that is already a caetgorical variable, it does not have any effect outside of the feature name." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interceptC(VehBrand)[B1]C(VehBrand)[B10]C(VehBrand)[B11]C(VehBrand)[B12]C(VehBrand)[B13]C(VehBrand)[B14]C(VehBrand)[B2]C(VehBrand)[B3]C(VehBrand)[B4]...C(VehAge)[1]C(VehAge)[2]C(VehPower)[4]C(VehPower)[5]C(VehPower)[6]C(VehPower)[7]C(VehPower)[8]C(VehPower)[9]BonusMalusDensity
coefficient2.88667-0.0641570.00.231868-0.2110610.054979-0.270346-0.0714530.002910.059324...0.008117-0.229906-0.111796-0.1233880.0607570.005179-0.0218320.2081580.0325080.000002
\n", + "

1 rows × 60 columns

\n", + "
" + ], + "text/plain": [ + " intercept C(VehBrand)[B1] C(VehBrand)[B10] C(VehBrand)[B11] \\\n", + "coefficient 2.88667 -0.064157 0.0 0.231868 \n", + "\n", + " C(VehBrand)[B12] C(VehBrand)[B13] C(VehBrand)[B14] \\\n", + "coefficient -0.211061 0.054979 -0.270346 \n", + "\n", + " C(VehBrand)[B2] C(VehBrand)[B3] C(VehBrand)[B4] ... \\\n", + "coefficient -0.071453 0.00291 0.059324 ... \n", + "\n", + " C(VehAge)[1] C(VehAge)[2] C(VehPower)[4] C(VehPower)[5] \\\n", + "coefficient 0.008117 -0.229906 -0.111796 -0.123388 \n", + "\n", + " C(VehPower)[6] C(VehPower)[7] C(VehPower)[8] C(VehPower)[9] \\\n", + "coefficient 0.060757 0.005179 -0.021832 0.208158 \n", + "\n", + " BonusMalus Density \n", + "coefficient 0.032508 0.000002 \n", + "\n", + "[1 rows x 60 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula_cat = (\n", + " \"PurePremium ~ C(VehBrand) + C(VehGas) + C(Region) + C(Area) \"\n", + " \"+ C(DrivAge) + C(VehAge) + C(VehPower) + BonusMalus + Density\"\n", + ")\n", + "\n", + "t_glm3 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula_cat,\n", + ")\n", + "t_glm3.fit(df_train_noncat, sample_weight=df[\"Exposure\"].values[train])\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm3.intercept_], t_glm3.coef_))},\n", + " index=[\"intercept\"] + t_glm3.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, prediction works as expected with categorical variables. `glum` keeps track of the levels present in the training dataset, and makes sure that categorical variables in unseen datasets are also properly aligned, even if they have missing or unknown levels.3 Therefore, one can simply use predict, and `glum` does The Right Thingâ„¢ by default.\n", + "\n", + "3: This is made possible due to `glum` saving a [`ModelSpec` object](https://matthewwardrop.github.io/formulaic/guides/model_specs/), which contains any information necessary for reapplying the transitions that were done during the formula materialization process. It is especially relevant in the case of [stateful transforms](https://matthewwardrop.github.io/formulaic/guides/transforms/), such as creating categorical variables." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([303.77443311, 548.47789523, 244.34438579, ..., 109.81572865,\n", + " 67.98332028, 297.21717383])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t_glm3.predict(df_test_noncat)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Interactions and Structural Full-Rankness\n", + "\n", + "One of the biggest strengths of Wilkinson-formuals lie in their ability of concisely specifying interactions between terms. `glum` implements this as well, and in a very efficient way: the interactions of categorical features are encoded as a new categorical feature, making it possible to interact high-cardinality categoricals with each other. If this is not possible, because, for example, a categorical is interacted with a numeric variable, sparse representations are used when appropriate. 
In general, just as with `glum`'s categorical handling elsewhere, you don't have to worry too much about the actual implementation: you can expect that `glum` will do the most efficient thing behind the scenes.\n", + "\n", + "Let's see what that looks like on the insurance example! Suppose that we expect `VehPower` to have a different effect depending on `DrivAge` (e.g. performance cars might not be great for new drivers, but may be less problematic for more experienced ones). We can include the interaction of these variables as follows." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interceptC(VehBrand)[B1]C(VehBrand)[B10]C(VehBrand)[B11]C(VehBrand)[B12]C(VehBrand)[B13]C(VehBrand)[B14]C(VehBrand)[B2]C(VehBrand)[B3]C(VehBrand)[B4]...C(DrivAge)[4]:C(VehPower)[8]C(DrivAge)[5]:C(VehPower)[8]C(DrivAge)[6]:C(VehPower)[8]C(DrivAge)[0]:C(VehPower)[9]C(DrivAge)[1]:C(VehPower)[9]C(DrivAge)[2]:C(VehPower)[9]C(DrivAge)[3]:C(VehPower)[9]C(DrivAge)[4]:C(VehPower)[9]C(DrivAge)[5]:C(VehPower)[9]C(DrivAge)[6]:C(VehPower)[9]
coefficient2.88023-0.0690760.00.221037-0.2118540.052355-0.272058-0.0748360.00.052523...-0.147844-0.035670.5044070.682528-0.106569-0.3082570.1732060.010684-0.2202730.070334
\n", + "

1 rows × 102 columns

\n", + "
" + ], + "text/plain": [ + " intercept C(VehBrand)[B1] C(VehBrand)[B10] C(VehBrand)[B11] \\\n", + "coefficient 2.88023 -0.069076 0.0 0.221037 \n", + "\n", + " C(VehBrand)[B12] C(VehBrand)[B13] C(VehBrand)[B14] \\\n", + "coefficient -0.211854 0.052355 -0.272058 \n", + "\n", + " C(VehBrand)[B2] C(VehBrand)[B3] C(VehBrand)[B4] ... \\\n", + "coefficient -0.074836 0.0 0.052523 ... \n", + "\n", + " C(DrivAge)[4]:C(VehPower)[8] C(DrivAge)[5]:C(VehPower)[8] \\\n", + "coefficient -0.147844 -0.03567 \n", + "\n", + " C(DrivAge)[6]:C(VehPower)[8] C(DrivAge)[0]:C(VehPower)[9] \\\n", + "coefficient 0.504407 0.682528 \n", + "\n", + " C(DrivAge)[1]:C(VehPower)[9] C(DrivAge)[2]:C(VehPower)[9] \\\n", + "coefficient -0.106569 -0.308257 \n", + "\n", + " C(DrivAge)[3]:C(VehPower)[9] C(DrivAge)[4]:C(VehPower)[9] \\\n", + "coefficient 0.173206 0.010684 \n", + "\n", + " C(DrivAge)[5]:C(VehPower)[9] C(DrivAge)[6]:C(VehPower)[9] \n", + "coefficient -0.220273 0.070334 \n", + "\n", + "[1 rows x 102 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula_int = (\n", + " \"PurePremium ~ C(VehBrand) + C(VehGas) + C(Region) + C(Area)\"\n", + " \" + C(DrivAge) * C(VehPower) + C(VehAge) + BonusMalus + Density\"\n", + ")\n", + "\n", + "t_glm4 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula_int,\n", + ")\n", + "t_glm4.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm4.intercept_], t_glm4.coef_))},\n", + " index=[\"intercept\"] + t_glm4.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that, in addition to the interactions, the non-interacted variants of `DrivAge` and `VehPower` are also included in the model. This is a result of using the `*` operator to interact the variables. Using `:` instead would only include the interactions, and not the marginals. 
(In short, `a * b` is equivalent to `a + b + a:b`.)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['C(VehPower)[4]',\n", + " 'C(VehPower)[5]',\n", + " 'C(VehPower)[6]',\n", + " 'C(VehPower)[7]',\n", + " 'C(VehPower)[8]',\n", + " 'C(VehPower)[9]',\n", + " 'C(DrivAge)[0]:C(VehPower)[4]',\n", + " 'C(DrivAge)[1]:C(VehPower)[4]',\n", + " 'C(DrivAge)[2]:C(VehPower)[4]',\n", + " 'C(DrivAge)[3]:C(VehPower)[4]',\n", + " 'C(DrivAge)[4]:C(VehPower)[4]',\n", + " 'C(DrivAge)[5]:C(VehPower)[4]',\n", + " 'C(DrivAge)[6]:C(VehPower)[4]',\n", + " 'C(DrivAge)[0]:C(VehPower)[5]',\n", + " 'C(DrivAge)[1]:C(VehPower)[5]',\n", + " 'C(DrivAge)[2]:C(VehPower)[5]',\n", + " 'C(DrivAge)[3]:C(VehPower)[5]',\n", + " 'C(DrivAge)[4]:C(VehPower)[5]',\n", + " 'C(DrivAge)[5]:C(VehPower)[5]',\n", + " 'C(DrivAge)[6]:C(VehPower)[5]',\n", + " 'C(DrivAge)[0]:C(VehPower)[6]',\n", + " 'C(DrivAge)[1]:C(VehPower)[6]',\n", + " 'C(DrivAge)[2]:C(VehPower)[6]',\n", + " 'C(DrivAge)[3]:C(VehPower)[6]',\n", + " 'C(DrivAge)[4]:C(VehPower)[6]',\n", + " 'C(DrivAge)[5]:C(VehPower)[6]',\n", + " 'C(DrivAge)[6]:C(VehPower)[6]',\n", + " 'C(DrivAge)[0]:C(VehPower)[7]',\n", + " 'C(DrivAge)[1]:C(VehPower)[7]',\n", + " 'C(DrivAge)[2]:C(VehPower)[7]',\n", + " 'C(DrivAge)[3]:C(VehPower)[7]',\n", + " 'C(DrivAge)[4]:C(VehPower)[7]',\n", + " 'C(DrivAge)[5]:C(VehPower)[7]',\n", + " 'C(DrivAge)[6]:C(VehPower)[7]',\n", + " 'C(DrivAge)[0]:C(VehPower)[8]',\n", + " 'C(DrivAge)[1]:C(VehPower)[8]',\n", + " 'C(DrivAge)[2]:C(VehPower)[8]',\n", + " 'C(DrivAge)[3]:C(VehPower)[8]',\n", + " 'C(DrivAge)[4]:C(VehPower)[8]',\n", + " 'C(DrivAge)[5]:C(VehPower)[8]',\n", + " 'C(DrivAge)[6]:C(VehPower)[8]',\n", + " 'C(DrivAge)[0]:C(VehPower)[9]',\n", + " 'C(DrivAge)[1]:C(VehPower)[9]',\n", + " 'C(DrivAge)[2]:C(VehPower)[9]',\n", + " 'C(DrivAge)[3]:C(VehPower)[9]',\n", + " 'C(DrivAge)[4]:C(VehPower)[9]',\n", + " 'C(DrivAge)[5]:C(VehPower)[9]',\n", + " 'C(DrivAge)[6]:C(VehPower)[9]']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[name for name in t_glm4.feature_names_ if \"VehPower\" in name]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The attentive reader might have also noticed that the first level of each categorical variable is omitted from the model. This is a manifestation of the more general concept of [ensuring structural full-rankness](https://matthewwardrop.github.io/formulaic/guides/contrasts/#guaranteeing-structural-full-rankness)4. By default, `glum` and `formulaic` will try to make sure that one does not fall into the [Dummy Variable Trap](https://en.wikipedia.org/wiki/Dummy_variable_(statistics)). Moreover, it even does so in the case of (possibly multi-way) interactions involving categorical variables. It will always drop the necessary number of levels, and no more. If you want to opt out of this behavior (for example because you would like to penalize all levels equally), simply set the `drop_first` parameter during model initialization to `False`. If one only aims to include all levels of a certain variable, and not others, it is possible to do so by using the `spans_intercept` parameter (e.g. `C(VehPower, spans_intercept=False)` would include all levels of `VehPower` even if `drop_first` is set to `True`).\n", + "\n", + "4: Note that it does not guarantee that the design matrix is actually full rank.
For example, two identical numerical variables will still lead to a rank-deficient design matrix." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Fun with Functions\n", + "\n", + "The previous example is only scratching the surface of what formulas are capable of. For example, they are capable of evaluating arbitrary Python expressions, which act as if they saw the columns of the input data frame as local variables (`pandas.Series`). The way to tell `glum` that a part of the formula should be evaluated as a Python expression before applying the formula grammar to it is to enclose it in curly braces. As an example, we can easily do the following within the formula itself:\n", + "\n", + " - Create the outcome variable on the fly instead of doing it beforehand.\n", + " - Include the logarithm of a certain variable in the model.1\n", + " - Include a basis spline interpolation of a variable to capture non-linearities in its effect.2\n", + "\n", + "Let's try it out!\n", + "\n", + "1: This works because formulas can include variables from the local scope, such as the imported `numpy` namespace. (Even more precisely, certain often-used `numpy` functions are special-cased, so the curly braces are not even strictly necessary here.)\n", + "\n", + "2: `bs` is one of the several built-in `formulaic` functions that aim to simplify preprocessing steps. You can learn more about them [in `formulaic`'s docs](https://matthewwardrop.github.io/formulaic/guides/transforms/)." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interceptVehBrand[B1]VehBrand[B10]VehBrand[B11]VehBrand[B12]VehBrand[B13]VehBrand[B14]VehBrand[B2]VehBrand[B3]VehBrand[B4]...VehPower[4]VehPower[5]VehPower[6]VehPower[7]VehPower[8]VehPower[9]bs(BonusMalus, 3)[1]bs(BonusMalus, 3)[2]bs(BonusMalus, 3)[3]np.log(Density)
coefficient3.808829-0.0602010.00.242194-0.2025170.063471-0.345415-0.0725460.007770.079391...-0.113038-0.1272550.0602090.005577-0.0321140.2073553.1781780.3619518.2318460.121944
\n", + "

1 rows × 62 columns

\n", + "
" + ], + "text/plain": [ + " intercept VehBrand[B1] VehBrand[B10] VehBrand[B11] \\\n", + "coefficient 3.808829 -0.060201 0.0 0.242194 \n", + "\n", + " VehBrand[B12] VehBrand[B13] VehBrand[B14] VehBrand[B2] \\\n", + "coefficient -0.202517 0.063471 -0.345415 -0.072546 \n", + "\n", + " VehBrand[B3] VehBrand[B4] ... VehPower[4] VehPower[5] \\\n", + "coefficient 0.00777 0.079391 ... -0.113038 -0.127255 \n", + "\n", + " VehPower[6] VehPower[7] VehPower[8] VehPower[9] \\\n", + "coefficient 0.060209 0.005577 -0.032114 0.207355 \n", + "\n", + " bs(BonusMalus, 3)[1] bs(BonusMalus, 3)[2] bs(BonusMalus, 3)[3] \\\n", + "coefficient 3.178178 0.361951 8.231846 \n", + "\n", + " np.log(Density) \n", + "coefficient 0.121944 \n", + "\n", + "[1 rows x 62 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula_fun = (\n", + " \"{ClaimAmountCut / Exposure} ~ VehBrand + VehGas + Region + Area\"\n", + " \" + DrivAge + VehAge + VehPower + bs(BonusMalus, 3) + {np.log(Density)}\"\n", + ")\n", + "\n", + "t_glm2 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula_fun,\n", + ")\n", + "t_glm2.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm2.intercept_], t_glm2.coef_))},\n", + " index=[\"intercept\"] + t_glm2.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Miscellaneous Features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Variable Names\n", + "\n", + "`glum`'s formula interface provides a lot of control over how the resulting features are named. By default, it follows `formulaic`'s standards, but it can be customized by setting the `interaction_separator` and `categorical_format` paremeters." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interceptDrivAge__0DrivAge__1DrivAge__2DrivAge__3DrivAge__4DrivAge__5DrivAge__6VehPower__4VehPower__5...DrivAge__4__x__VehPower__8DrivAge__5__x__VehPower__8DrivAge__6__x__VehPower__8DrivAge__0__x__VehPower__9DrivAge__1__x__VehPower__9DrivAge__2__x__VehPower__9DrivAge__3__x__VehPower__9DrivAge__4__x__VehPower__9DrivAge__5__x__VehPower__9DrivAge__6__x__VehPower__9
coefficient5.0072771.4970790.535650.0-0.152974-0.210998-0.2056890.017896-0.096153-0.05484...-0.143822-0.0020940.5122580.730534-0.280869-0.3676690.1710630.022052-0.2704560.119634
\n", + "

1 rows × 56 columns

\n", + "
" + ], + "text/plain": [ + " intercept DrivAge__0 DrivAge__1 DrivAge__2 DrivAge__3 \\\n", + "coefficient 5.007277 1.497079 0.53565 0.0 -0.152974 \n", + "\n", + " DrivAge__4 DrivAge__5 DrivAge__6 VehPower__4 VehPower__5 \\\n", + "coefficient -0.210998 -0.205689 0.017896 -0.096153 -0.05484 \n", + "\n", + " ... DrivAge__4__x__VehPower__8 DrivAge__5__x__VehPower__8 \\\n", + "coefficient ... -0.143822 -0.002094 \n", + "\n", + " DrivAge__6__x__VehPower__8 DrivAge__0__x__VehPower__9 \\\n", + "coefficient 0.512258 0.730534 \n", + "\n", + " DrivAge__1__x__VehPower__9 DrivAge__2__x__VehPower__9 \\\n", + "coefficient -0.280869 -0.367669 \n", + "\n", + " DrivAge__3__x__VehPower__9 DrivAge__4__x__VehPower__9 \\\n", + "coefficient 0.171063 0.022052 \n", + "\n", + " DrivAge__5__x__VehPower__9 DrivAge__6__x__VehPower__9 \n", + "coefficient -0.270456 0.119634 \n", + "\n", + "[1 rows x 56 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula_name = \"PurePremium ~ DrivAge * VehPower\"\n", + "\n", + "t_glm5 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula_name,\n", + " interaction_separator=\"__x__\",\n", + " categorical_format=\"{name}__{category}\",\n", + ")\n", + "t_glm5.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm5.intercept_], t_glm5.coef_))},\n", + " index=[\"intercept\"] + t_glm5.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Intercept Term\n", + "\n", + "Just like in the case of the non-formula interface, an intercept term is added by default. This can be disabled by either setting the `fit_intercept` parameter to `False`, or adding `+0` or `-1` to the end of the formula. In the case of conflict, a warning is emitted, and the latter takes precedence." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/stanmart/work/glum/src/glum/_glm.py:2354: UserWarning: The formula explicitly sets the intercept to False, overriding fit_intercept=True.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interceptDrivAge__0DrivAge__1DrivAge__2DrivAge__3DrivAge__4DrivAge__5DrivAge__6VehPower__4VehPower__5...DrivAge__4__x__VehPower__8DrivAge__5__x__VehPower__8DrivAge__6__x__VehPower__8DrivAge__0__x__VehPower__9DrivAge__1__x__VehPower__9DrivAge__2__x__VehPower__9DrivAge__3__x__VehPower__9DrivAge__4__x__VehPower__9DrivAge__5__x__VehPower__9DrivAge__6__x__VehPower__9
coefficient0.01.7132980.7835050.2059140.0160850.00.0000940.2236854.661234.736272...-0.1449270.0016570.5153730.714834-0.325666-0.3709350.204170.013222-0.2739130.115693
\n", + "

1 rows × 56 columns

\n", + "
" + ], + "text/plain": [ + " intercept DrivAge__0 DrivAge__1 DrivAge__2 DrivAge__3 \\\n", + "coefficient 0.0 1.713298 0.783505 0.205914 0.016085 \n", + "\n", + " DrivAge__4 DrivAge__5 DrivAge__6 VehPower__4 VehPower__5 \\\n", + "coefficient 0.0 0.000094 0.223685 4.66123 4.736272 \n", + "\n", + " ... DrivAge__4__x__VehPower__8 DrivAge__5__x__VehPower__8 \\\n", + "coefficient ... -0.144927 0.001657 \n", + "\n", + " DrivAge__6__x__VehPower__8 DrivAge__0__x__VehPower__9 \\\n", + "coefficient 0.515373 0.714834 \n", + "\n", + " DrivAge__1__x__VehPower__9 DrivAge__2__x__VehPower__9 \\\n", + "coefficient -0.325666 -0.370935 \n", + "\n", + " DrivAge__3__x__VehPower__9 DrivAge__4__x__VehPower__9 \\\n", + "coefficient 0.20417 0.013222 \n", + "\n", + " DrivAge__5__x__VehPower__9 DrivAge__6__x__VehPower__9 \n", + "coefficient -0.273913 0.115693 \n", + "\n", + "[1 rows x 56 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula_noint = \"PurePremium ~ DrivAge * VehPower - 1\"\n", + "\n", + "t_glm6 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula_noint,\n", + " interaction_separator=\"__x__\",\n", + " categorical_format=\"{name}__{category}\",\n", + ")\n", + "t_glm6.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm6.intercept_], t_glm6.coef_))},\n", + " index=[\"intercept\"] + t_glm6.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### One-Sided Formulas\n", + "\n", + "Even when using formulas, the outcome variable can be specified as a vector, as in the interface without formulas. In that case the supplied formula should be one-sided (not contain a `~`), and only describe the right-hand side of the regression." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interceptDrivAge__0DrivAge__1DrivAge__2DrivAge__3DrivAge__4DrivAge__5DrivAge__6VehPower__4VehPower__5...DrivAge__4__x__VehPower__8DrivAge__5__x__VehPower__8DrivAge__6__x__VehPower__8DrivAge__0__x__VehPower__9DrivAge__1__x__VehPower__9DrivAge__2__x__VehPower__9DrivAge__3__x__VehPower__9DrivAge__4__x__VehPower__9DrivAge__5__x__VehPower__9DrivAge__6__x__VehPower__9
coefficient0.01.7132980.7835050.2059140.0160850.00.0000940.2236854.661234.736272...-0.1449270.0016570.5153730.714834-0.325666-0.3709350.204170.013222-0.2739130.115693
\n", + "

1 rows × 56 columns

\n", + "
" + ], + "text/plain": [ + " intercept DrivAge__0 DrivAge__1 DrivAge__2 DrivAge__3 \\\n", + "coefficient 0.0 1.713298 0.783505 0.205914 0.016085 \n", + "\n", + " DrivAge__4 DrivAge__5 DrivAge__6 VehPower__4 VehPower__5 \\\n", + "coefficient 0.0 0.000094 0.223685 4.66123 4.736272 \n", + "\n", + " ... DrivAge__4__x__VehPower__8 DrivAge__5__x__VehPower__8 \\\n", + "coefficient ... -0.144927 0.001657 \n", + "\n", + " DrivAge__6__x__VehPower__8 DrivAge__0__x__VehPower__9 \\\n", + "coefficient 0.515373 0.714834 \n", + "\n", + " DrivAge__1__x__VehPower__9 DrivAge__2__x__VehPower__9 \\\n", + "coefficient -0.325666 -0.370935 \n", + "\n", + " DrivAge__3__x__VehPower__9 DrivAge__4__x__VehPower__9 \\\n", + "coefficient 0.20417 0.013222 \n", + "\n", + " DrivAge__5__x__VehPower__9 DrivAge__6__x__VehPower__9 \n", + "coefficient -0.273913 0.115693 \n", + "\n", + "[1 rows x 56 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula_onesie = \"DrivAge * VehPower\"\n", + "\n", + "t_glm7 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=False,\n", + " formula=formula_onesie,\n", + " interaction_separator=\"__x__\",\n", + " categorical_format=\"{name}__{category}\",\n", + ")\n", + "t_glm7.fit(\n", + " X=df_train, y=df_train[\"PurePremium\"], sample_weight=df[\"Exposure\"].values[train]\n", + ")\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm7.intercept_], t_glm7.coef_))},\n", + " index=[\"intercept\"] + t_glm7.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Missing Values in Categorical Columns\n", + "\n", + "By default, `glum` raises a `ValueError` when it encounters a missing value in a categorical variable (`\"raise\"` option). However, there are two other options for handling these cases. They can also be treated as if they represented all-zeros indicators (`\"zero\"` option, which is also the way `pandas.get_dummies` works) or missing values can be treated as their own separate category (`\"convert\"` option).\n", + "\n", + "Similarly to the non-formula-based interface, `glum`'s behavior can be set globally using the `cat_missing_method` parameter during model initialization. However, formulas provide some additional flexibility: the `C` function has a `missing_method` parameter, with which users can select an option on a column-by-column basis. Here is an example of doing that (although our dataset does not have any missing values, so these options have no actual effect in this case):" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interceptC(DrivAge, missing_method='zero')[0]C(DrivAge, missing_method='zero')[1]C(DrivAge, missing_method='zero')[2]C(DrivAge, missing_method='zero')[3]C(DrivAge, missing_method='zero')[4]C(DrivAge, missing_method='zero')[5]C(DrivAge, missing_method='zero')[6]C(VehPower, missing_method='convert')[4]C(VehPower, missing_method='convert')[5]C(VehPower, missing_method='convert')[6]C(VehPower, missing_method='convert')[7]C(VehPower, missing_method='convert')[8]C(VehPower, missing_method='convert')[9]
coefficient0.01.7867030.7427650.2395280.0965310.0711180.00.2010784.6372674.6793914.8633874.772634.7496734.970188
\n", + "
" + ], + "text/plain": [ + " intercept C(DrivAge, missing_method='zero')[0] \\\n", + "coefficient 0.0 1.786703 \n", + "\n", + " C(DrivAge, missing_method='zero')[1] \\\n", + "coefficient 0.742765 \n", + "\n", + " C(DrivAge, missing_method='zero')[2] \\\n", + "coefficient 0.239528 \n", + "\n", + " C(DrivAge, missing_method='zero')[3] \\\n", + "coefficient 0.096531 \n", + "\n", + " C(DrivAge, missing_method='zero')[4] \\\n", + "coefficient 0.071118 \n", + "\n", + " C(DrivAge, missing_method='zero')[5] \\\n", + "coefficient 0.0 \n", + "\n", + " C(DrivAge, missing_method='zero')[6] \\\n", + "coefficient 0.201078 \n", + "\n", + " C(VehPower, missing_method='convert')[4] \\\n", + "coefficient 4.637267 \n", + "\n", + " C(VehPower, missing_method='convert')[5] \\\n", + "coefficient 4.679391 \n", + "\n", + " C(VehPower, missing_method='convert')[6] \\\n", + "coefficient 4.863387 \n", + "\n", + " C(VehPower, missing_method='convert')[7] \\\n", + "coefficient 4.77263 \n", + "\n", + " C(VehPower, missing_method='convert')[8] \\\n", + "coefficient 4.749673 \n", + "\n", + " C(VehPower, missing_method='convert')[9] \n", + "coefficient 4.970188 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula_missing = \"C(DrivAge, missing_method='zero') + C(VehPower, missing_method='convert')\"\n", + "\n", + "t_glm8 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=False,\n", + " formula=formula_missing,\n", + "\n", + ")\n", + "t_glm8.fit(\n", + " X=df_train, y=df_train[\"PurePremium\"], sample_weight=df[\"Exposure\"].values[train]\n", + ")\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm8.intercept_], t_glm8.coef_))},\n", + " index=[\"intercept\"] + t_glm8.feature_names_,\n", + ").T" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "glum-dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/formula_interface/load_transform_formula.py b/docs/tutorials/formula_interface/load_transform_formula.py new file mode 100644 index 00000000..d0098f8b --- /dev/null +++ b/docs/tutorials/formula_interface/load_transform_formula.py @@ -0,0 +1,63 @@ +import numpy as np +import pandas as pd + + +def load_transform(): + """Load and transform data from OpenML. + + Summary of transformations: + + 1. We cut the number of claims to a maximum of 4, as is done in the case study paper + (Case-study authors suspect a data error. See section 1 of their paper for details). + 2. We cut the exposure to a maximum of 1, as is done in the case study paper + (Case-study authors suspect a data error. See section 1 of their paper for details). + 3. We define ``'ClaimAmountCut'`` as the the claim amount cut at 100'000 per single claim + (before aggregation per policy). Reason: For large claims, extreme value theory + might apply. 100'000 is the 0.9984 quantile, any claims larger account for 25% of + the overall claim amount. This is a well known phenomenon for third-party liability. + 4. We aggregate the total claim amounts per policy ID and join them to ``freMTPL2freq``. + 5. 
We fix ``'ClaimNb'`` as the claim number with claim amount greater zero. + 6. ``'VehPower'``, ``'VehAge'``, and ``'DrivAge'`` are clipped and/or digitized into bins so + they can be used as categoricals later on. + """ + # load the datasets + # first row (=column names) uses "", all other rows use '' + # use '' as quotechar as it is easier to change column names + df = pd.read_csv( + "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.arff", quotechar="'" + ) + + # rename column names '"name"' => 'name' + df = df.rename(lambda x: x.replace('"', ""), axis="columns") + df["IDpol"] = df["IDpol"].astype(np.int64) + df.set_index("IDpol", inplace=True) + + df_sev = pd.read_csv( + "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", index_col=0 + ) + + # join ClaimAmount from df_sev to df: + # 1. cut ClaimAmount at 100_000 + # 2. aggregate ClaimAmount per IDpol + # 3. join by IDpol + df_sev["ClaimAmountCut"] = df_sev["ClaimAmount"].clip(upper=100_000) + df = df.join(df_sev.groupby(level=0).sum(), how="left") + df.fillna(value={"ClaimAmount": 0, "ClaimAmountCut": 0}, inplace=True) + + # Note: Zero claims must be ignored in severity models, + # because the support is (0, inf) not [0, inf). + df.loc[(df.ClaimAmount <= 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 + + # correct for unreasonable observations (that might be data error) + # see case study paper + df["ClaimNb"] = df["ClaimNb"].clip(upper=4) + df["Exposure"] = df["Exposure"].clip(upper=1) + + # Clip and/or digitize predictors into bins + df["VehPower"] = np.minimum(df["VehPower"], 9) + df["VehAge"] = np.digitize( + np.where(df["VehAge"] == 10, 9, df["VehAge"]), bins=[1, 10] + ) + df["DrivAge"] = np.digitize(df["DrivAge"], bins=[21, 26, 31, 41, 51, 71]) + + return df diff --git a/docs/tutorials/tutorials.rst b/docs/tutorials/tutorials.rst index 66ce7d45..86166d17 100644 --- a/docs/tutorials/tutorials.rst +++ b/docs/tutorials/tutorials.rst @@ -7,3 +7,4 @@ Tutorials Poisson, Gamma, and Tweedie with French Motor Third-Party Liability Claims High Dimensional Fixed Effects with Rossman Sales Regularization with King County Housing Sales + Formula interface diff --git a/environment.yml b/environment.yml index 0aa7c240..47c6a174 100644 --- a/environment.yml +++ b/environment.yml @@ -13,6 +13,7 @@ dependencies: - scikit-learn>=0.23 - scipy - tqdm + - formulaic>=0.4 # development tools - black diff --git a/setup.py b/setup.py index 016f12c3..76652f50 100644 --- a/setup.py +++ b/setup.py @@ -85,6 +85,7 @@ "pandas", "scikit-learn>=0.23", "scipy", + "formulaic>=0.4", "tabmat>=4.0.0a", ], entry_points=None diff --git a/src/glum/_glm.py b/src/glum/_glm.py index b5ce7095..39085559 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -28,6 +28,8 @@ import scipy.sparse as sps import scipy.sparse.linalg as splinalg import tabmat as tm +from formulaic import Formula, FormulaSpec +from formulaic.parser import DefaultFormulaParser from scipy import linalg, sparse, stats from sklearn.base import BaseEstimator, RegressorMixin from sklearn.utils import check_array @@ -238,6 +240,51 @@ def _name_categorical_variables( return new_names +def _parse_formula( + formula: FormulaSpec, include_intercept: bool = True +) -> Tuple[Optional[Formula], Formula]: + """ + Parse and transform the formula for use in a GeneralizedLinearRegressor. + + The left-hand side and right-hand side of the formula are separated. 
If an + intercept is present, it is removed from the right-hand side, and a boolean + flag is returned to indicate whether or not an intercept should be added to + the model. + + Parameters + ---------- + formula : FormulaSpec + The formula to parse. + include_intercept: bool, default True + Whether to include an intercept column if the formula does not + include (``+ 1``) or exclude (``+ 0`` or ``- 1``) it explicitly. + + Returns + ------- + tuple[Formula, Formula] + The left-hand side and right-hand sides of the formula.""" + if isinstance(formula, str): + parser = DefaultFormulaParser(include_intercept=include_intercept) + terms = parser.get_terms(formula) + elif isinstance(formula, Formula): + terms = formula + else: + raise TypeError("formula must be a string or Formula object.") + + if hasattr(terms, "lhs"): + lhs_terms = terms.lhs + if len(lhs_terms) != 1: + raise ValueError( + "formula must have exactly one term on the left-hand side." + ) + rhs_terms = terms.rhs + else: + lhs_terms = None + rhs_terms = terms + + return lhs_terms, rhs_terms + + def check_bounds( bounds: Optional[Union[float, VectorLike]], n_features: int, dtype ) -> Optional[np.ndarray]: @@ -726,6 +773,8 @@ def __init__( drop_first: bool = False, robust: bool = True, expected_information: bool = False, + formula: Optional[FormulaSpec] = None, + interaction_separator: str = ":", categorical_format: str = "{name}[{category}]", ): self.l1_ratio = l1_ratio @@ -759,6 +808,8 @@ def __init__( self.drop_first = drop_first self.robust = robust self.expected_information = expected_information + self.formula = formula + self.interaction_separator = interaction_separator self.categorical_format = categorical_format @property @@ -835,6 +886,10 @@ def _get_start_coef( def _convert_from_pandas(self, df: pd.DataFrame) -> tm.MatrixBase: """Convert a pandas data frame to a tabmat matrix.""" + + if hasattr(self, "X_model_spec_"): + return self.X_model_spec_.get_model_matrix(df) + if hasattr(self, "feature_dtypes_"): df = _align_df_categories(df, self.feature_dtypes_) @@ -1921,14 +1976,18 @@ def covariance_matrix( "matrix will be incorrect." ) + cannot_estimate_cov = X is None or ( + y is None and not hasattr(self, "y_model_spec_") + ) + if not skip_checks: - if (X is None or y is None) and self.covariance_matrix_ is None: + if cannot_estimate_cov and self.covariance_matrix_ is None: raise ValueError( "Either X and y must be provided or the covariance matrix " "must have been previously computed." ) - if (X is None or y is None) and store_covariance_matrix: + if cannot_estimate_cov and store_covariance_matrix: raise ValueError( "X and y must be provided if 'store_covariance_matrix' is True." ) @@ -1956,6 +2015,10 @@ def covariance_matrix( ) return self.covariance_matrix_ + if hasattr(self, "y_model_spec_"): + y = self.y_model_spec_.get_model_matrix(X).A.ravel() + # This has to go first because X is modified in the next line + if isinstance(X, pd.DataFrame): X = self._convert_from_pandas(X) @@ -2253,50 +2316,109 @@ def _set_up_and_check_fit_args( copy_X = self._should_copy_X() if isinstance(X, pd.DataFrame): - self.feature_dtypes_ = X.dtypes.to_dict() - - if any(X.dtypes == "category"): - - def _expand_categorical_penalties(penalty, X, drop_first): - """ - If P1 or P2 has the same shape as X before expanding the - categoricals, we assume that the penalty at the location of - the categorical is the same for all levels. 
- """ - if isinstance(penalty, str): - return penalty - if not sparse.issparse(penalty): - penalty = np.asanyarray(penalty) - - if penalty.shape[0] == X.shape[1]: - if penalty.ndim == 2: - raise ValueError( - "When the penalty is two dimensional, it has " - "to have the same length as the number of " - "columns of X, after the categoricals " - "have been expanded." - ) - return np.array( - list( - chain.from_iterable( - [elmt for _ in dtype.categories[int(drop_first) :]] - if pd.api.types.is_categorical_dtype(dtype) - else [elmt] - for elmt, dtype in zip(penalty, X.dtypes) + if self.formula is not None: + lhs, rhs = _parse_formula( + self.formula, include_intercept=self.fit_intercept + ) + + if lhs is not None: + if y is not None: + raise ValueError( + "`y` is not allowed when using a two-sided formula. " + "Either set `y=None` or use a one-sided formula." + ) + + y = tm.from_formula( + formula=lhs, + data=X, + include_intercept=False, + context=2, + ) + + self.y_model_spec_ = y.model_spec + y = y.A.ravel() + + X = tm.from_formula( + formula=rhs, + data=X, + include_intercept=False, + ensure_full_rank=self.drop_first, + categorical_format=self.categorical_format, + interaction_separator=self.interaction_separator, + add_column_for_intercept=False, + context=2, # where fit/std_errors/etc. is called from + ) + + intercept = "1" in X.model_spec.terms + if intercept != self.fit_intercept: + warnings.warn( + f"The formula explicitly sets the intercept to {intercept}, " + f"overriding fit_intercept={self.fit_intercept}." + ) + self.fit_intercept = intercept + + self.X_model_spec_ = X.model_spec + + self.feature_names_ = list(X.model_spec.column_names) + self.term_names_ = list( + chain.from_iterable( + [term] * len(cols) for term, _, cols in X.model_spec.structure + ) + ) + + else: + # Maybe TODO: expand categorical penalties with formulas + + self.feature_dtypes_ = X.dtypes.to_dict() + + if any(X.dtypes == "category"): + + def _expand_categorical_penalties(penalty, X, drop_first): + """ + If P1 or P2 has the same shape as X before expanding the + categoricals, we assume that the penalty at the location of + the categorical is the same for all levels. + """ + if isinstance(penalty, str): + return penalty + if not sparse.issparse(penalty): + penalty = np.asanyarray(penalty) + + if penalty.shape[0] == X.shape[1]: + if penalty.ndim == 2: + raise ValueError( + "When the penalty is two dimensional, it has " + "to have the same length as the number of " + "columns of X, after the categoricals " + "have been expanded." 
+ ) + return np.array( + list( + chain.from_iterable( + [ + elmt + for _ in dtype.categories[int(drop_first) :] + ] + if pd.api.types.is_categorical_dtype(dtype) + else [elmt] + for elmt, dtype in zip(penalty, X.dtypes) + ) ) ) - ) - else: - return penalty + else: + return penalty - P1 = _expand_categorical_penalties(self.P1, X, self.drop_first) - P2 = _expand_categorical_penalties(self.P2, X, self.drop_first) + P1 = _expand_categorical_penalties(self.P1, X, self.drop_first) + P2 = _expand_categorical_penalties(self.P2, X, self.drop_first) - X = tm.from_pandas( - X, - drop_first=self.drop_first, - categorical_format=self.categorical_format, - ) + X = tm.from_pandas( + X, + drop_first=self.drop_first, + categorical_format=self.categorical_format, + ) + + if y is None: + raise ValueError("y cannot be None when not using a two-sided formula.") if not self._is_contiguous(X): if self.copy_X is not None and not self.copy_X: @@ -2647,8 +2769,10 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): drop_first : bool, optional (default = False) If ``True``, drop the first column when encoding categorical variables. - Set this to True when alpha=0 and solver='auto' to prevent an error due to a singular - feature matrix. + Set this to True when alpha=0 and solver='auto' to prevent an error due to a + singular feature matrix. In the case of using a formula with interactions, + setting this argument to ``True`` ensures structural full-rankness (it is + equivalent to ``ensure_full_rank`` in formulaic and tabmat). robust : bool, optional (default = False) If true, then robust standard errors are computed by default. @@ -2656,6 +2780,18 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): expected_information : bool, optional (default = False) If true, then the expected information matrix is computed by default. Only relevant when computing robust standard errors. + formula : FormulaSpec + A formula accepted by formulaic. It can either be a one-sided formula, in + which case ``y`` must be specified in ``fit``, or a two-sided formula, in + which case ``y`` must be ``None``. + + interaction_separator: str, default ":" + The separator between the names of interacted variables. + + categorical_format: str, default "{name}[T.{category}]" + The format string used to generate the names of categorical variables. + Has to include the placeholders ``{name}`` and ``{category}``. + Only used if ``formula`` is not ``None``. categorical_features : str, optional (default = "{name}[{category}]") Format string for categorical features. 
The format string should @@ -2746,6 +2882,8 @@ def __init__( drop_first: bool = False, robust: bool = True, expected_information: bool = False, + formula: Optional[FormulaSpec] = None, + interaction_separator: str = ":", categorical_format: str = "{name}[{category}]", ): self.alphas = alphas @@ -2782,6 +2920,8 @@ def __init__( drop_first=drop_first, robust=robust, expected_information=expected_information, + formula=formula, + interaction_separator=interaction_separator, categorical_format=categorical_format, ) @@ -2826,7 +2966,7 @@ def _validate_hyperparameters(self) -> None: def fit( self, X: ArrayLike, - y: ArrayLike, + y: Optional[ArrayLike] = None, sample_weight: Optional[ArrayLike] = None, offset: Optional[ArrayLike] = None, store_covariance_matrix: bool = False, diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index 98b0496f..b6e5d28a 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -2,6 +2,7 @@ from typing import Optional, Union import numpy as np +from formulaic import FormulaSpec from joblib import Parallel, delayed from sklearn.model_selection._split import check_cv @@ -243,6 +244,23 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase): drop_first : bool, optional (default = False) If ``True``, drop the first column when encoding categorical variables. + Set this to True when alpha=0 and solver='auto' to prevent an error due to a + singular feature matrix. In the case of using a formula with interactions, + setting this argument to ``True`` ensures structural full-rankness (it is + equivalent to ``ensure_full_rank`` in formulaic and tabmat). + + formula : FormulaSpec + A formula accepted by formulaic. It can either be a one-sided formula, in + which case ``y`` must be specified in ``fit``, or a two-sided formula, in + which case ``y`` must be ``None``. + + interaction_separator: str, default ":" + The separator between the names of interacted variables. + + categorical_format: str, default "{name}[T.{category}]" + The format string used to generate the names of categorical variables. + Has to include the placeholders ``{name}`` and ``{category}``. + Only used if ``formula`` is not ``None``. 
Attributes ---------- @@ -323,6 +341,8 @@ def __init__( drop_first: bool = False, robust: bool = True, expected_information: bool = False, + formula: Optional[FormulaSpec] = None, + interaction_separator: str = ":", categorical_format: str = "{name}[{category}]", ): self.alphas = alphas @@ -359,6 +379,8 @@ def __init__( drop_first=drop_first, robust=robust, expected_information=expected_information, + formula=formula, + interaction_separator=interaction_separator, categorical_format=categorical_format, ) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index e835bcba..8b714d8b 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -5,11 +5,14 @@ import warnings from typing import Any, Dict, List, Optional, Tuple, Union +import formulaic import numpy as np import pandas as pd import pytest import statsmodels.api as sm +import statsmodels.formula.api as smf import tabmat as tm +from formulaic import Formula from numpy.testing import assert_allclose from scipy import optimize, sparse from sklearn.base import clone @@ -33,7 +36,12 @@ TweedieDistribution, guess_intercept, ) -from glum._glm import GeneralizedLinearRegressor, _unstandardize, is_pos_semidef +from glum._glm import ( + GeneralizedLinearRegressor, + _parse_formula, + _unstandardize, + is_pos_semidef, +) from glum._link import IdentityLink, Link, LogitLink, LogLink GLM_SOLVERS = ["irls-ls", "lbfgs", "irls-cd", "trust-constr"] @@ -2531,6 +2539,50 @@ def test_store_covariance_matrix( ) +@pytest.mark.parametrize( + "formula", ["y ~ col_1 + col_2", "col_1 + col_2"], ids=["two-sided", "one-sided"] +) +def test_store_covariance_matrix_formula(regression_data, formula): + X, y = regression_data + df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) + + if "~" in formula: + df["y"] = y + y = None + + regressor = GeneralizedLinearRegressor( + formula=formula, + family="gaussian", + alpha=0, + ) + regressor.fit(df, y, store_covariance_matrix=True) + + np.testing.assert_array_almost_equal( + regressor.covariance_matrix(df, y), + regressor.covariance_matrix(), + ) + + np.testing.assert_array_almost_equal( + regressor.std_errors(df, y), + regressor.std_errors(), + ) + + +def test_store_covariance_matrix_formula_errors(regression_data): + X, y = regression_data + df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) + formula = "col_1 + col_2" + + regressor = GeneralizedLinearRegressor( + formula=formula, + family="gaussian", + alpha=0, + ) + regressor.fit(df, y) + with pytest.raises(ValueError, match="Either X and y must be provided"): + regressor.covariance_matrix(df) + + def test_store_covariance_matrix_errors(regression_data): X, y = regression_data @@ -2627,3 +2679,255 @@ def test_store_covariance_matrix_cv( new_covariance_matrix, stored_covariance_matrix, ) + + +@pytest.mark.parametrize( + "input, expected", + [ + pytest.param( + "y ~ x1 + x2", + (["y"], ["1", "x1", "x2"]), + id="implicit_intercept", + ), + pytest.param( + "y ~ x1 + x2 + 1", + (["y"], ["1", "x1", "x2"]), + id="explicit_intercept", + ), + pytest.param( + "y ~ x1 + x2 - 1", + (["y"], ["x1", "x2"]), + id="no_intercept", + ), + pytest.param( + "y ~ ", + (["y"], ["1"]), + id="empty_rhs", + ), + ], +) +def test_parse_formula(input, expected): + lhs_exp, rhs_exp = expected + lhs, rhs = _parse_formula(input) + assert list(lhs) == lhs_exp + assert list(rhs) == rhs_exp + + formula = Formula(input) + lhs, rhs = _parse_formula(formula) + assert list(lhs) == lhs_exp + assert list(rhs) == rhs_exp + + +@pytest.mark.parametrize( + "input, 
error", + [ + pytest.param("y1 + y2 ~ x1 + x2", ValueError, id="multiple_lhs"), + pytest.param([["y"], ["x1", "x2"]], TypeError, id="wrong_type"), + ], +) +def test_parse_formula_invalid(input, error): + with pytest.raises(error): + _parse_formula(input) + + +@pytest.fixture +def get_mixed_data(): + nrow = 10 + np.random.seed(0) + return pd.DataFrame( + { + "y": np.random.rand(nrow), + "x1": np.random.rand(nrow), + "x2": np.random.rand(nrow), + "c1": np.random.choice(["a", "b", "c"], nrow), + "c2": np.random.choice(["d", "e"], nrow), + } + ) + + +@pytest.mark.parametrize( + "formula", + [ + pytest.param("y ~ x1 + x2", id="implicit_no_intercept"), + pytest.param("y ~ x1 + x2 + 1", id="intercept"), + pytest.param("y ~ x1 + x2 - 1", id="no_intercept"), + pytest.param("y ~ c1", id="categorical"), + pytest.param("y ~ c1 + 1", id="categorical_intercept"), + pytest.param("y ~ x1 * c1 * c2", id="interaction"), + ], +) +@pytest.mark.parametrize( + "drop_first", [True, False], ids=["drop_first", "no_drop_first"] +) +def test_formula(get_mixed_data, formula, drop_first): + data = get_mixed_data + y_pd, X_pd = formulaic.model_matrix( + formula + " - 1", data, ensure_full_rank=drop_first + ) + y_pd = y_pd.iloc[:, 0] + model_formula = GeneralizedLinearRegressor( + family="normal", + drop_first=drop_first, + formula=formula, + fit_intercept=False, + categorical_format="{name}[T.{category}]", + ).fit(data) + + has_intercept = "1" in model_formula.X_model_spec_.terms + model_pandas = GeneralizedLinearRegressor( + family="normal", + drop_first=drop_first, + fit_intercept=has_intercept, + categorical_format="{name}[T.{category}]", + ).fit(X_pd, y_pd) + + np.testing.assert_almost_equal(model_pandas.coef_, model_formula.coef_) + np.testing.assert_array_equal( + model_pandas.feature_names_, model_formula.feature_names_ + ) + + +@pytest.mark.parametrize( + "formula, feature_names, term_names", + [ + pytest.param("y ~ x1 + x2", ["x1", "x2"], ["x1", "x2"], id="numeric"), + pytest.param( + "y ~ c1", ["c1[T.a]", "c1[T.b]", "c1[T.c]"], 3 * ["c1"], id="categorical" + ), + pytest.param( + "y ~ x1 : c1", + ["x1:c1[T.a]", "x1:c1[T.b]", "x1:c1[T.c]"], + 3 * ["x1:c1"], + id="interaction", + ), + pytest.param( + "y ~ poly(x1, 3)", + ["poly(x1, 3)[1]", "poly(x1, 3)[2]", "poly(x1, 3)[3]"], + 3 * ["poly(x1, 3)"], + id="function", + ), + ], +) +def test_formula_names_formulaic_style( + get_mixed_data, formula, feature_names, term_names +): + data = get_mixed_data + model_formula = GeneralizedLinearRegressor( + family="normal", + drop_first=False, + formula=formula, + categorical_format="{name}[T.{category}]", + interaction_separator=":", + ).fit(data) + + np.testing.assert_array_equal(model_formula.feature_names_, feature_names) + np.testing.assert_array_equal(model_formula.term_names_, term_names) + + +@pytest.mark.parametrize( + "formula, feature_names, term_names", + [ + pytest.param("y ~ x1 + x2", ["x1", "x2"], ["x1", "x2"], id="numeric"), + pytest.param( + "y ~ c1", ["c1__a", "c1__b", "c1__c"], 3 * ["c1"], id="categorical" + ), + pytest.param( + "y ~ x1 : c1", + ["x1__x__c1__a", "x1__x__c1__b", "x1__x__c1__c"], + 3 * ["x1:c1"], + id="interaction", + ), + pytest.param( + "y ~ poly(x1, 3)", + ["poly(x1, 3)[1]", "poly(x1, 3)[2]", "poly(x1, 3)[3]"], + 3 * ["poly(x1, 3)"], + id="function", + ), + ], +) +def test_formula_names_old_glum_style( + get_mixed_data, formula, feature_names, term_names +): + data = get_mixed_data + model_formula = GeneralizedLinearRegressor( + family="normal", + drop_first=False, + formula=formula, + 
categorical_format="{name}__{category}", + interaction_separator="__x__", + ).fit(data) + + np.testing.assert_array_equal(model_formula.feature_names_, feature_names) + np.testing.assert_array_equal(model_formula.term_names_, term_names) + + +@pytest.mark.parametrize( + "formula", + [ + pytest.param("y ~ x1 + x2", id="implicit_no_intercept"), + pytest.param("y ~ x1 + x2 + 1", id="intercept"), + pytest.param("y ~ x1 + x2 - 1", id="no_intercept"), + pytest.param("y ~ c1", id="categorical"), + pytest.param("y ~ c1 + 1", id="categorical_intercept"), + pytest.param("y ~ c1 * c2", id="interaction"), + ], +) +def test_formula_against_smf(get_mixed_data, formula): + data = get_mixed_data + model_formula = GeneralizedLinearRegressor( + family="normal", drop_first=True, formula=formula, alpha=0.0 + ).fit(data) + + if model_formula.fit_intercept: + beta_formula = np.concatenate([[model_formula.intercept_], model_formula.coef_]) + else: + beta_formula = model_formula.coef_ + + model_smf = smf.glm(formula, data, family=sm.families.Gaussian()).fit() + + np.testing.assert_almost_equal(beta_formula, model_smf.params) + + +def test_formula_context(get_mixed_data): + data = get_mixed_data + x_context = np.arange(len(data), dtype=float) # noqa: F841 + formula = "y ~ x1 + x2 + x_context" + model_formula = GeneralizedLinearRegressor( + family="normal", drop_first=True, formula=formula, alpha=0.0 + ).fit(data) + + if model_formula.fit_intercept: + beta_formula = np.concatenate([[model_formula.intercept_], model_formula.coef_]) + else: + beta_formula = model_formula.coef_ + + model_smf = smf.glm(formula, data, family=sm.families.Gaussian()).fit() + + np.testing.assert_almost_equal(beta_formula, model_smf.params) + + +@pytest.mark.parametrize( + "formula", + [ + pytest.param("y ~ x1 + x2", id="implicit_no_intercept"), + pytest.param("y ~ x1 + x2 + 1", id="intercept"), + pytest.param("y ~ x1 + x2 - 1", id="no_intercept"), + pytest.param("y ~ c1", id="categorical"), + pytest.param("y ~ c1 + 1", id="categorical_intercept"), + pytest.param("y ~ c1 * c2", id="interaction"), + ], +) +def test_formula_predict(get_mixed_data, formula): + data = get_mixed_data + data_unseen = data.copy() + data_unseen.loc[data_unseen["c1"] == "b", "c1"] = "c" + model_formula = GeneralizedLinearRegressor( + family="normal", drop_first=True, formula=formula, alpha=0.0 + ).fit(data) + + model_smf = smf.glm(formula, data, family=sm.families.Gaussian()).fit() + + yhat_formula = model_formula.predict(data_unseen) + yhat_smf = model_smf.predict(data_unseen) + + np.testing.assert_almost_equal(yhat_formula, yhat_smf) From 003fcec945005693184c594208e1d898ca7fabc6 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 28 Aug 2023 08:16:34 +0200 Subject: [PATCH 13/63] Formula- and term-based Wald-tests (#689) * Add formulaic to dependencies * Add function for transforming the formula * Add tests * First draft of glum formula interface * Fixes and tests * Handle intercept correctly * Add formula functionality to glm_cv * Variables from local context * Test predict with formulas * Add formula tutorial * Fix tutorial * Reformat tutorial * Improve function signatures adn docstrings * Handle two-sided formulas in covariance_matrix * Make mypy happy about module names * Matthias' suggestions * Add back term-based Wald-tests * Tests for term names * Add formula-based Wald-test * Tests for formula-based Wald-test * Add changelog * Fix exception message * Additional test case * make docstrings clearer in the case of terms --- CHANGELOG.rst | 2 + 
 src/glum/_glm.py      | 245 +++++++++++++++++++++++++++++++++++++++++-
 tests/glm/test_glm.py | 216 +++++++++++++++++++++++++++++++++++++
 3 files changed, 460 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index f987f137..f5ef680f 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -15,6 +15,7 @@ Changelog
 - Added a formula interface for specifying models.
 - Improved feature name handling. Feature names are now created for non-pandas input matrices, too. Furthermore, the format of categorical features can be specified by the user.
 - Term names are now stored in the model's attributes. This is useful for categorical features, where they refer to the whole variable, not just single levels.
+- :meth:`GeneralizedLinearRegressor.wald_test` can now perform tests based on a formula string and term names.

 2.6.0 - UNRELEASED
 ------------------
@@ -33,6 +34,7 @@ Changelog
 **Other changes:**

 - When computing the covariance matrix, check for ill-conditionedness for all types of input. Furthermore, do it in a more efficient way.
+- Added the option to specify models using Wilkinson formulas.

 2.5.2 - 2023-06-02
 ------------------
diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index 39085559..771671a6 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -30,6 +30,7 @@
 import tabmat as tm
 from formulaic import Formula, FormulaSpec
 from formulaic.parser import DefaultFormulaParser
+from formulaic.utils.constraints import LinearConstraintParser
 from scipy import linalg, sparse, stats
 from sklearn.base import BaseEstimator, RegressorMixin
 from sklearn.utils import check_array
@@ -1494,6 +1495,8 @@ def wald_test(
         self,
         R: Optional[np.ndarray] = None,
         features: Optional[Union[str, List[str]]] = None,
+        terms: Optional[Union[str, List[str]]] = None,
+        formula: Optional[str] = None,
         r: Optional[Sequence] = None,
         X=None,
         y=None,
@@ -1512,8 +1515,12 @@
         - ``R``: The restriction matrix representing the linear combination of
           coefficients to test.
         - ``features``: The name of a feature or a list of features to test.
+        - ``terms``: The name of a term or a list of terms to test.
+        - ``formula``: A formula string specifying the hypothesis to test.

-        The right hand side of the tested hypothesis is specified by ``r``.
+        The right hand side of the tested hypothesis is specified by ``r``. In the
+        case of a ``terms``-based test, the null hypothesis is that each coefficient
+        relating to a term is equal to the corresponding value in ``r``.

         Parameters
         ----------
@@ -1522,6 +1529,13 @@
             to test.
         features : Union[str, list[str]], optional, default=None
             The name of a feature or a list of features to test.
+        terms : Union[str, list[str]], optional, default=None
+            The name of a term or a list of terms to test. It can cover one or more
+            coefficients. In the case of a model based on a formula, a term is one
+            of the expressions separated by ``+`` signs. Otherwise, a term is one column
+            in the input data. As categorical variables need not be one-hot encoded in
+            glum, in their case, the hypothesis to be tested is that the coefficients
+            for all of their levels are equal to ``r``.
         r : np.ndarray, optional, default=None
             The vector representing the values of the linear combination.
             If None, the test is for whether the linear combinations of the coefficients
@@ -1557,10 +1571,17 @@
             NamedTuple with test statistic, p-value, and degrees of freedom.
""" - num_lhs_specs = sum([R is not None, features is not None]) + num_lhs_specs = sum( + [ + R is not None, + features is not None, + terms is not None, + formula is not None, + ] + ) if num_lhs_specs != 1: raise ValueError( - "Exactly one of R or features must be specified. " + "Exactly one of R, features terms or formula must be specified. " f"Received {num_lhs_specs} specifications." ) @@ -1594,6 +1615,37 @@ def wald_test( expected_information=expected_information, ) + if terms is not None: + return self._wald_test_term_names( + terms=terms, + values=r, + X=X, + y=y, + mu=mu, + offset=offset, + sample_weight=sample_weight, + dispersion=dispersion, + robust=robust, + clusters=clusters, + expected_information=expected_information, + ) + + if formula is not None: + if r is not None: + raise ValueError("Cannot specify both formula and r") + return self._wald_test_formula( + formula=formula, + X=X, + y=y, + mu=mu, + offset=offset, + sample_weight=sample_weight, + dispersion=dispersion, + robust=robust, + clusters=clusters, + expected_information=expected_information, + ) + raise RuntimeError("This should never happen") def _wald_test_matrix( @@ -1796,6 +1848,193 @@ def _wald_test_feature_names( expected_information=expected_information, ) + def _wald_test_formula( + self, + formula: str, + X=None, + y=None, + mu=None, + offset=None, + sample_weight=None, + dispersion=None, + robust=None, + clusters: np.ndarray = None, + expected_information=None, + ) -> WaldTestResult: + """Compute the Wald test statistic and p-value for a linear hypothesis. + + Perform a Wald test for the hypothesis described in ``formula``. + + Parameters + ---------- + formula: str + A formula string describing the linear restrictions. For more information, + see `meth:ModelSpec.get_linear_constraints` in ``formulaic``. + X : {array-like, sparse matrix}, shape (n_samples, n_features), optional + Training data. Can be omitted if a covariance matrix has already + been computed. + y : array-like, shape (n_samples,), optional + Target values. Can be omitted if a covariance matrix has already + been computed. + mu : array-like, optional, default=None + Array with predictions. Estimated if absent. + offset : array-like, optional, default=None + Array with additive offsets. + sample_weight : array-like, shape (n_samples,), optional, default=None + Individual weights for each sample. + dispersion : float, optional, default=None + The dispersion parameter. Estimated if absent. + robust : boolean, optional, default=None + Whether to compute robust standard errors instead of normal ones. + If not specified, the model's ``robust`` attribute is used. + clusters : array-like, optional, default=None + Array with cluster membership. Clustered standard errors are + computed if clusters is not None. + expected_information : boolean, optional, default=None + Whether to use the expected or observed information matrix. + Only relevant when computing robust standard errors. + If not specified, the model's ``expected_information`` attribute is used. + + Returns + ------- + WaldTestResult + NamedTuple with test statistic, p-value, and degrees of freedom. 
+        """
+
+        if self.fit_intercept:
+            names = ["intercept"] + list(self.feature_names_)
+        else:
+            names = self.feature_names_
+
+        parser = LinearConstraintParser(names)
+
+        R, r = parser.get_matrix(formula)
+
+        return self._wald_test_matrix(
+            R=R,
+            r=r,
+            X=X,
+            y=y,
+            mu=mu,
+            offset=offset,
+            sample_weight=sample_weight,
+            dispersion=dispersion,
+            robust=robust,
+            clusters=clusters,
+            expected_information=expected_information,
+        )
+
+    def _wald_test_term_names(
+        self,
+        terms: Union[str, List[str]],
+        values: Optional[Sequence] = None,
+        X=None,
+        y=None,
+        mu=None,
+        offset=None,
+        sample_weight=None,
+        dispersion=None,
+        robust=None,
+        clusters: np.ndarray = None,
+        expected_information=None,
+    ) -> WaldTestResult:
+        """Compute the Wald test statistic and p-value for a linear hypothesis.
+
+        Perform a Wald test for the hypothesis that the coefficients of the
+        features in ``terms`` are equal to the values in ``values``.
+
+        Parameters
+        ----------
+        terms : Union[str, list[str]]
+            The name of a term or a list of terms to test. It can cover one or more
+            coefficients. In the case of a model based on a formula, a term is one
+            of the expressions separated by ``+`` signs. Otherwise, a term is one column
+            in the input data. As categorical variables need not be one-hot encoded in
+            glum, in their case, the hypothesis to be tested is that the coefficients
+            for all of their levels are equal to ``r``.
+        values: Sequence, optional, default=None
+            The values to which coefficients are compared. If None, the test is
+            for whether the coefficients are zero.
+        X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
+            Training data. Can be omitted if a covariance matrix has already
+            been computed.
+        y : array-like, shape (n_samples,), optional
+            Target values. Can be omitted if a covariance matrix has already
+            been computed.
+        mu : array-like, optional, default=None
+            Array with predictions. Estimated if absent.
+        offset : array-like, optional, default=None
+            Array with additive offsets.
+        sample_weight : array-like, shape (n_samples,), optional (default=None)
+            Individual weights for each sample.
+        dispersion : float, optional, default=None
+            The dispersion parameter. Estimated if absent.
+        robust : boolean, optional, default=None
+            Whether to compute robust standard errors instead of normal ones.
+            If not specified, the model's ``robust`` attribute is used.
+        clusters : array-like, optional, default=None
+            Array with cluster membership. Clustered standard errors are
+            computed if clusters is not None.
+        expected_information : boolean, optional, default=None
+            Whether to use the expected or observed information matrix.
+            Only relevant when computing robust std-errors.
+            If not specified, the model's ``expected_information`` attribute is used.
+
+        Returns
+        -------
+        WaldTestResult
+            NamedTuple with test statistic, p-value and degrees of freedom.
+        """
+
+        if isinstance(terms, str):
+            terms = [terms]
+
+        if values is not None:
+            rhs = True
+            if len(terms) != len(values):
+                raise ValueError("terms and values must have the same length")
+        else:
+            rhs = False
+            values = [None] * len(terms)
+
+        if self.fit_intercept:
+            names = np.array(["intercept"] + list(self.term_names_))
+            beta = np.concatenate([[self.intercept_], self.coef_])
+        else:
+            names = np.array(self.term_names_)
+            beta = self.coef_
+
+        R_list = []
+        r_list = []
+        for term, value in zip(terms, values):
+            R_indices, *_ = np.where(names == term)
+            num_restrictions = len(R_indices)
+            if num_restrictions == 0:
+                raise ValueError(f"term {term} is not in the model")
+            R_current = np.zeros((num_restrictions, len(beta)), dtype=np.float64)
+            R_current[np.arange(num_restrictions), R_indices] = 1.0
+            R_list.append(R_current)
+
+            if rhs:
+                r_list.append(np.full(num_restrictions, fill_value=value))
+
+        R = np.vstack(R_list)
+        r = np.concatenate(r_list) if rhs else None
+
+        return self._wald_test_matrix(
+            R=R,
+            r=r,
+            X=X,
+            y=y,
+            mu=mu,
+            offset=offset,
+            sample_weight=sample_weight,
+            dispersion=dispersion,
+            robust=robust,
+            clusters=clusters,
+            expected_information=expected_information,
+        )
+
     def std_errors(
         self,
         X=None,
diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py
index 8b714d8b..e9e97ab7 100644
--- a/tests/glm/test_glm.py
+++ b/tests/glm/test_glm.py
@@ -2364,6 +2364,222 @@ def test_wald_test_feature_names_public(regression_data, names, r):
     assert mdl._wald_test_feature_names(names, r) == mdl.wald_test(features=names, r=r)


+@pytest.mark.parametrize(
+    "names, R, r, r_feat",
+    [
+        pytest.param(["col_1"], np.array([[0, 1] + 5 * [0]]), None, None, id="single"),
+        pytest.param(
+            ["col_1", "col_2"],
+            np.array([[0, 1, 0] + 4 * [0], [0, 0, 1] + 4 * [0]]),
+            None,
+            None,
+            id="multiple",
+        ),
+        pytest.param(
+            ["term_3"],
+            np.hstack(
+                (
+                    np.zeros((4, 3)),
+                    np.eye(4),
+                )
+            ),
+            None,
+            None,
+            id="multifeature",
+        ),
+        pytest.param(
+            ["term_3"],
+            np.hstack(
+                (
+                    np.zeros((4, 3)),
+                    np.eye(4),
+                )
+            ),
+            [1],
+            [1] * 4,
+            id="rhs_not_zero",
+        ),
+        pytest.param(
+            ["intercept", "col_1"],
+            np.array([[1, 0] + 5 * [0], [0, 1] + 5 * [0]]),
+            [1, 2],
+            [1, 2],
+            id="intercept",
+        ),
+    ],
+)
+def test_wald_test_term_names(regression_data, names, R, r, r_feat):
+    X, y = regression_data
+    X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])])
+    X_df = X_df[["col_1", "col_2"]].assign(term_3=pd.cut(X_df["col_3"], bins=5))
+
+    mdl = GeneralizedLinearRegressor(
+        alpha=0, family="gaussian", fit_intercept=True, drop_first=True
+    ).fit(X=X_df, y=y, store_covariance_matrix=True)
+
+    term_names_results = mdl._wald_test_term_names(names, r)
+
+    if r is not None:
+        r_feat = np.array(r_feat)  # wald_test_matrix expects an optional numpy array
+    matrix_results = mdl._wald_test_matrix(R, r_feat)
+
+    np.testing.assert_equal(
+        term_names_results.test_statistic, matrix_results.test_statistic
+    )
+    np.testing.assert_equal(term_names_results.p_value, matrix_results.p_value)
+    assert term_names_results.df == matrix_results.df
+
+
+@pytest.mark.parametrize(
+    "names, R, r, r_feat",
+    [
+        pytest.param(["col_1"], np.array([[0, 1] + 5 * [0]]), None, None, id="single"),
+        pytest.param(
+            ["col_1", "col_2"],
+            np.array([[0, 1, 0] + 4 * [0], [0, 0, 1] + 4 * [0]]),
+            None,
+            None,
+            id="multiple",
+        ),
+        pytest.param(
+            ["term_3"],
+            np.hstack(
+                (
np.zeros((4, 3)), + np.eye(4), + ) + ), + [1], + [1] * 4, + id="rhs_not_zero", + ), + pytest.param( + ["intercept", "col_1"], + np.array([[1, 0] + 5 * [0], [0, 1] + 5 * [0]]), + [1, 2], + [1, 2], + id="intercept", + ), + ], +) +def test_wald_test_term_names_public(regression_data, names, R, r, r_feat): + X, y = regression_data + X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) + X_df = X_df[["col_1", "col_2"]].assign(term_3=pd.cut(X_df["col_3"], bins=5)) + + mdl = GeneralizedLinearRegressor( + alpha=0, family="gaussian", fit_intercept=True, drop_first=True + ).fit(X=X_df, y=y, store_covariance_matrix=True) + + term_names_results = mdl.wald_test(terms=names, r=r) + + if r is not None: + r_feat = np.array(r_feat) # wald_test_matrix expects an optional numpy array + matrix_results = mdl._wald_test_matrix(R, r_feat) + + np.testing.assert_equal( + term_names_results.test_statistic, matrix_results.test_statistic + ) + np.testing.assert_equal(term_names_results.p_value, matrix_results.p_value) + assert term_names_results.df == matrix_results.df + + +@pytest.mark.parametrize( + "formula, R, r_feat", + [ + pytest.param("col_0 = 0", np.array([[0, 1] + 9 * [0]]), None, id="single"), + pytest.param( + "col_0 = 0, col_1 = 0", + np.array([[0, 1, 0] + 8 * [0], [0, 0, 1] + 8 * [0]]), + None, + id="multiple", + ), + pytest.param( + "intercept = 1, col_0 = 2", + np.array([[1, 0] + 9 * [0], [0, 1] + 9 * [0]]), + [1, 2], + id="intercept", + ), + ], +) +def test_wald_test_formula(regression_data, formula, R, r_feat): + X, y = regression_data + X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) + + mdl = GeneralizedLinearRegressor( + alpha=0, family="gaussian", fit_intercept=True, drop_first=True + ).fit(X=X_df, y=y, store_covariance_matrix=True) + + term_names_results = mdl._wald_test_formula(formula) + + if r_feat is not None: + r_feat = np.array(r_feat) # wald_test_matrix expects an optional numpy array + matrix_results = mdl._wald_test_matrix(R, r_feat) + + np.testing.assert_equal( + term_names_results.test_statistic, matrix_results.test_statistic + ) + np.testing.assert_equal(term_names_results.p_value, matrix_results.p_value) + assert term_names_results.df == matrix_results.df + + +@pytest.mark.parametrize( + "formula, R, r_feat", + [ + pytest.param("col_0 = 0", np.array([[0, 1] + 9 * [0]]), None, id="single"), + pytest.param( + "col_0 = 0, col_1 = 0", + np.array([[0, 1, 0] + 8 * [0], [0, 0, 1] + 8 * [0]]), + None, + id="multiple", + ), + pytest.param( + "col_0 + col_1 = 2 * col_2 - 1", + np.array([[0, 1, 1, -2] + 7 * [0]]), + [-1], + id="combination", + ), + pytest.param( + "intercept = 1, col_0 = 2", + np.array([[1, 0] + 9 * [0], [0, 1] + 9 * [0]]), + [1, 2], + id="intercept", + ), + ], +) +def test_wald_test_formula_public(regression_data, formula, R, r_feat): + X, y = regression_data + X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) + + mdl = GeneralizedLinearRegressor( + alpha=0, family="gaussian", fit_intercept=True, drop_first=True + ).fit(X=X_df, y=y, store_covariance_matrix=True) + + term_names_results = mdl.wald_test(formula=formula) + + if r_feat is not None: + r_feat = np.array(r_feat) # wald_test_matrix expects an optional numpy array + matrix_results = mdl._wald_test_matrix(R, r_feat) + + np.testing.assert_equal( + term_names_results.test_statistic, matrix_results.test_statistic + ) + np.testing.assert_equal(term_names_results.p_value, matrix_results.p_value) + assert term_names_results.df == matrix_results.df + + def 
test_wald_test_raise_on_wrong_input(regression_data):
    X, y = regression_data
    mdl = GeneralizedLinearRegressor(alpha=0, family="gaussian", fit_intercept=True)

From 91e0408cac215aebe95b713f15a6c6f92468c50f Mon Sep 17 00:00:00 2001
From: Martin Stancsics
Date: Mon, 28 Aug 2023 08:26:13 +0200
Subject: [PATCH 14/63] Support for missing values in categorical columns (#684)

* Delegate column naming to tabmat
* Add tests
* More tests
* Test for dropping complete categories
* Add docstrings for new argument
* Add changelog entry
* Convert to pandas at the correct place
* Reorganize converting from pandas
* Remove xfail from test
* Implement missing categorical support
* Add test
* Solve adding missing category when predicting
* Apply Matthias' suggestions
* Add changelog entry
---
 CHANGELOG.rst           |  1 +
 src/glum/_glm.py        | 34 ++++++++++++++++-
 src/glum/_glm_cv.py     | 17 ++++++++-
 src/glum/_util.py       | 40 +++++++++++++++++++-
 tests/glm/test_glm.py   | 40 ++++++++++++++++++++
 tests/glm/test_utils.py | 81 ++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 208 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index f5ef680f..30f1dc8b 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -15,6 +15,7 @@ Changelog
 - Added a formula interface for specifying models.
 - Improved feature name handling. Feature names are now created for non-pandas input matrices, too. Furthermore, the format of categorical features can be specified by the user.
 - Term names are now stored in the model's attributes. This is useful for categorical features, where they refer to the whole variable, not just single levels.
+- Added more options for treating missing values in categorical columns. They can either raise a `ValueError` (`"fail"`), be treated as all-zero indicators (`"zero"`) or represented as a new category (`"convert"`).
 - :meth:`GeneralizedLinearRegressor.wald_test` can now perform tests based on a formula string and term names.
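
As an aside, a minimal sketch of the term- and formula-based Wald tests described in the entry above, mirroring the tests added in this patch; the fitted model ``mdl`` and the names ``term_3``, ``col_0``, ``col_1``, ``col_2`` are illustrative, not part of the patch:

    # H0: the coefficients of all levels of the categorical term "term_3" are zero.
    mdl.wald_test(terms=["term_3"])

    # H0: a general linear restriction, written as a formulaic constraint string.
    mdl.wald_test(formula="col_0 + col_1 = 2 * col_2 - 1")
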
2.6.0 - UNRELEASED diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 771671a6..80738121 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -63,7 +63,7 @@ _least_squares_solver, _trust_constr_solver, ) -from ._util import _align_df_categories, _safe_toarray +from ._util import _add_missing_categories, _align_df_categories, _safe_toarray _float_itemsize_to_dtype = {8: np.float64, 4: np.float32, 2: np.float16} @@ -777,6 +777,8 @@ def __init__( formula: Optional[FormulaSpec] = None, interaction_separator: str = ":", categorical_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", ): self.l1_ratio = l1_ratio self.P1 = P1 @@ -812,6 +814,8 @@ def __init__( self.formula = formula self.interaction_separator = interaction_separator self.categorical_format = categorical_format + self.cat_missing_method = cat_missing_method + self.cat_missing_name = cat_missing_name @property def family_instance(self) -> ExponentialDispersionModel: @@ -893,11 +897,20 @@ def _convert_from_pandas(self, df: pd.DataFrame) -> tm.MatrixBase: if hasattr(self, "feature_dtypes_"): df = _align_df_categories(df, self.feature_dtypes_) + if self.cat_missing_method == "convert": + df = _add_missing_categories( + df=df, + dtypes=self.feature_dtypes_, + feature_names=self.feature_names_, + cat_missing_name=self.cat_missing_name, + categorical_format=self.categorical_format, + ) X = tm.from_pandas( df, drop_first=self.drop_first, categorical_format=self.categorical_format, + cat_missing_method=self.cat_missing_method, ) return X @@ -2654,6 +2667,8 @@ def _expand_categorical_penalties(penalty, X, drop_first): X, drop_first=self.drop_first, categorical_format=self.categorical_format, + cat_missing_method=self.cat_missing_method, + cat_missing_name=self.cat_missing_name, ) if y is None: @@ -3032,12 +3047,23 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): Has to include the placeholders ``{name}`` and ``{category}``. Only used if ``formula`` is not ``None``. - categorical_features : str, optional (default = "{name}[{category}]") + categorical_format : str, optional, default='{name}[{category}]' Format string for categorical features. The format string should contain the placeholder ``{name}`` for the feature name and ``{category}`` for the category name. Only used if ``X`` is a pandas DataFrame. + cat_missing_method: str {'fail'|'zero'|'convert'}, default='fail' + How to handle missing values in categorical columns. Only used if ``X`` + is a pandas data frame. + - if 'fail', raise an error if there are missing values + - if 'zero', missing values will represent all-zero indicator columns. + - if 'convert', missing values will be converted to the ``cat_missing_name`` + category. + cat_missing_name: str, default='(MISSING)' + Name of the category to which missing values will be converted if + ``cat_missing_method='convert'``. Only used if ``X`` is a pandas data frame. 
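
A short sketch of the ``cat_missing_method`` options documented above, mirroring the test added below (the data is illustrative):

    X = pd.DataFrame({"c": pd.Categorical(["a", "b", pd.NA, "b", "a"])})
    y = np.arange(5, dtype=float)

    # "convert" represents missing values as an explicit "(MISSING)" level.
    model = GeneralizedLinearRegressor(
        cat_missing_method="convert", drop_first=False, fit_intercept=False
    ).fit(X, y)
    # model.feature_names_ == ["c[a]", "c[b]", "c[(MISSING)]"]

    # The default, "fail", raises a ValueError on missing values instead,
    # while "zero" encodes them as all-zero indicator rows.
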
+ Attributes ---------- coef_ : numpy.array, shape (n_features,) @@ -3124,6 +3150,8 @@ def __init__( formula: Optional[FormulaSpec] = None, interaction_separator: str = ":", categorical_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", ): self.alphas = alphas self.alpha = alpha @@ -3162,6 +3190,8 @@ def __init__( formula=formula, interaction_separator=interaction_separator, categorical_format=categorical_format, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, ) def _validate_hyperparameters(self) -> None: diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index b6e5d28a..f9706088 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -299,11 +299,22 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase): If true, then the expected information matrix is computed by default. Only relevant when computing robust standard errors. - categorical_features : str, optional (default = "{name}[{category}]") + categorical_format : str, optional (default = "{name}[{category}]") Format string for categorical features. The format string should contain the placeholder ``{name}`` for the feature name and ``{category}`` for the category name. Only used if ``X`` is a pandas DataFrame. + + cat_missing_method: str {'fail'|'zero'|'convert'}, default='fail' + How to handle missing values in categorical columns. Only used if ``X`` + is a pandas data frame. + - if 'fail', raise an error if there are missing values + - if 'zero', missing values will represent all-zero indicator columns. + - if 'convert', missing values will be converted to the ``cat_missing_name`` + category. + cat_missing_name: str, default='(MISSING)' + Name of the category to which missing values will be converted if + ``cat_missing_method='convert'``. Only used if ``X`` is a pandas data frame. """ def __init__( @@ -344,6 +355,8 @@ def __init__( formula: Optional[FormulaSpec] = None, interaction_separator: str = ":", categorical_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", ): self.alphas = alphas self.cv = cv @@ -382,6 +395,8 @@ def __init__( formula=formula, interaction_separator=interaction_separator, categorical_format=categorical_format, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, ) def _validate_hyperparameters(self) -> None: diff --git a/src/glum/_util.py b/src/glum/_util.py index aabf7b8a..b8592cbf 100644 --- a/src/glum/_util.py +++ b/src/glum/_util.py @@ -1,5 +1,5 @@ import logging -from typing import Union +from typing import Sequence, Union import numpy as np import pandas as pd @@ -53,6 +53,44 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame: return df +def _add_missing_categories( + df, + dtypes, + feature_names: Sequence[str], + categorical_format: str, + cat_missing_name: str, +) -> pd.DataFrame: + if not isinstance(df, pd.DataFrame): + raise TypeError(f"Expected `pandas.DataFrame'; got {type(df)}.") + + changed_dtypes = {} + + categorical_dtypes = [ + column + for column, dtype in dtypes.items() + if pd.api.types.is_categorical_dtype(dtype) and (column in df) + ] + + for column in categorical_dtypes: + if ( + categorical_format.format(name=column, category=cat_missing_name) + in feature_names + ): + if cat_missing_name in df[column].cat.categories: + raise ValueError( + f"Missing category {cat_missing_name} already exists in {column}." 
+ ) + _logger.info(f"Adding missing category {cat_missing_name} to {column}.") + changed_dtypes[column] = df[column].cat.add_categories(cat_missing_name) + if df[column].isnull().any(): + changed_dtypes[column] = changed_dtypes[column].fillna(cat_missing_name) + + if changed_dtypes: + df = df.assign(**changed_dtypes) + + return df + + def _safe_lin_pred( X: Union[MatrixBase, StandardizedMatrix], coef: np.ndarray, diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index e9e97ab7..19f20b1d 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -3147,3 +3147,43 @@ def test_formula_predict(get_mixed_data, formula): yhat_smf = model_smf.predict(data_unseen) np.testing.assert_almost_equal(yhat_formula, yhat_smf) + + +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_cat_missing(cat_missing_method): + X = pd.DataFrame( + { + "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]), + "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]), + } + ) + X_unseen = pd.DataFrame( + { + "cat_1": pd.Categorical([1, pd.NA]), + "cat_2": pd.Categorical([1, 2]), + } + ) + y = np.array([1, 2, 3, 4, 5]) + + model = GeneralizedLinearRegressor( + family="normal", + cat_missing_method=cat_missing_method, + drop_first=False, + fit_intercept=False, + ) + + if cat_missing_method == "fail": + with pytest.raises(ValueError): + model.fit(X, y) + else: + model.fit(X, y) + feature_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"] + + if cat_missing_method == "convert": + feature_names.insert(2, "cat_1[(MISSING)]") + feature_names.append("cat_2[(MISSING)]") + + np.testing.assert_array_equal(model.feature_names_, feature_names) + assert len(model.coef_) == len(feature_names) + + model.predict(X_unseen) diff --git a/tests/glm/test_utils.py b/tests/glm/test_utils.py index 1960b848..36cf988a 100644 --- a/tests/glm/test_utils.py +++ b/tests/glm/test_utils.py @@ -2,7 +2,7 @@ import pandas as pd import pytest -from glum._util import _align_df_categories +from glum._util import _add_missing_categories, _align_df_categories @pytest.fixture() @@ -96,3 +96,82 @@ def test_align_df_categories_missing_columns(df): def test_align_df_categories_not_df(): with pytest.raises(TypeError): _align_df_categories(np.array([[0], [1]]), {"x0": np.float64}) + + +@pytest.fixture() +def df_na(): + return pd.DataFrame( + { + "num": np.array([0, 1], dtype="float64"), + "cat": pd.Categorical(["a", "b"]), + "cat_na": pd.Categorical(["a", pd.NA]), + "cat2": pd.Categorical(["a", "b"]), + } + ) + + +def test_add_missing_categories(df_na): + categorical_format = "{name}[{category}]" + cat_missing_name = "(M)" + dtypes = df_na.dtypes + feature_names = [ + "num", + "num[(M)]", + "cat[a]", + "cat[b]", + "cat[(M)]", + "cat_na[a]", + "cat_na[(M)]", + "cat2[a]", + "cat2[b]", + ] + + expected = pd.DataFrame( + { + "num": np.array([0, 1], dtype="float64"), + "cat": pd.Categorical(["a", "b"], categories=["a", "b", "(M)"]), + "cat_na": pd.Categorical(["a", "(M)"], categories=["a", "(M)"]), + "cat2": pd.Categorical(["a", "b"], categories=["a", "b"]), + } + ) + + pd.testing.assert_frame_equal( + _add_missing_categories( + df=df_na, + dtypes=dtypes, + feature_names=feature_names, + categorical_format=categorical_format, + cat_missing_name=cat_missing_name, + ), + expected, + ) + + +def test_raise_on_existing_missing(df_na): + categorical_format = "{name}[{category}]" + cat_missing_name = "(M)" + dtypes = df_na.dtypes + feature_names = [ + "num", + "num[(M)]", + "cat[a]", + "cat[b]", + "cat[(M)]", + "cat_na[a]", + "cat_na[(M)]", + 
"cat2[a]", + "cat2[b]", + ] + + df = df_na + df["cat_na"] = df["cat_na"].cat.add_categories("(M)") + df.loc[df.cat_na.isna(), "cat_na"] = "(M)" + + with pytest.raises(ValueError): + _add_missing_categories( + df=df, + dtypes=dtypes, + feature_names=feature_names, + categorical_format=categorical_format, + cat_missing_name=cat_missing_name, + ) From 44c9cd9ef05816047d0a4badcc0548ad9ff75052 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 13 Oct 2023 19:34:06 +0200 Subject: [PATCH 15/63] Fix formula context (#691) * Make tests fail * Propagate context through methods --- src/glum/_glm.py | 209 ++++++++++++++++++++++++++++++++++-------- src/glum/_glm_cv.py | 9 +- tests/glm/test_glm.py | 1 + 3 files changed, 178 insertions(+), 41 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 80738121..b0077b6e 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -21,7 +21,17 @@ import warnings from collections.abc import Iterable from itertools import chain -from typing import Any, List, NamedTuple, Optional, Sequence, Tuple, Union, cast +from typing import ( + Any, + List, + Mapping, + NamedTuple, + Optional, + Sequence, + Tuple, + Union, + cast, +) import numpy as np import pandas as pd @@ -31,6 +41,7 @@ from formulaic import Formula, FormulaSpec from formulaic.parser import DefaultFormulaParser from formulaic.utils.constraints import LinearConstraintParser +from formulaic.utils.context import capture_context from scipy import linalg, sparse, stats from sklearn.base import BaseEstimator, RegressorMixin from sklearn.utils import check_array @@ -227,20 +238,6 @@ def _check_offset( return offset -def _name_categorical_variables( - categories: Tuple[str], column_name: str, drop_first: bool -): - new_names = [ - f"{column_name}__{category}" for category in categories[int(drop_first) :] - ] - if len(new_names) == 0: - raise ValueError( - f"Categorical column: {column_name}, contains only one category. " - + "This should be dropped from the feature matrix." - ) - return new_names - - def _parse_formula( formula: FormulaSpec, include_intercept: bool = True ) -> Tuple[Optional[Formula], Formula]: @@ -889,11 +886,13 @@ def _get_start_coef( return coef - def _convert_from_pandas(self, df: pd.DataFrame) -> tm.MatrixBase: + def _convert_from_pandas( + self, df: pd.DataFrame, context: Optional[Mapping[str, Any]] = None + ) -> tm.MatrixBase: """Convert a pandas data frame to a tabmat matrix.""" if hasattr(self, "X_model_spec_"): - return self.X_model_spec_.get_model_matrix(df) + return self.X_model_spec_.get_model_matrix(df, context=context) if hasattr(self, "feature_dtypes_"): df = _align_df_categories(df, self.feature_dtypes_) @@ -1287,6 +1286,7 @@ def linear_predictor( offset: Optional[ArrayLike] = None, alpha_index: Optional[Union[int, Sequence[int]]] = None, alpha: Optional[Union[float, Sequence[float]]] = None, + context: Optional[Union[int, Mapping[str, Any]]] = 0, ): """Compute the linear predictor, ``X * coef_ + intercept_``. @@ -1311,6 +1311,10 @@ def linear_predictor( Sets the alpha(s) to use in case ``alpha_search`` is ``True``. Incompatible with ``alpha_index`` (see above). + context : Optional[Union[int, Mapping[str, Any]]], default=0 + The context to use for evaluating the formula. If an integer, the + context is taken from the stack frame of the caller at the given + depth. If a dict, it is used as the context directly. 
        Returns
        -------
        array, shape (n_samples, n_alphas)
            The linear predictor.
        """
@@ -1326,7 +1330,10 @@
            alpha_index = [self._find_alpha_index(a) for a in alpha]  # type: ignore

         if isinstance(X, pd.DataFrame):
-            X = self._convert_from_pandas(X)
+            captured_context = capture_context(
+                context + 1 if isinstance(context, int) else context
+            )
+            X = self._convert_from_pandas(X, context=captured_context)

         X = check_array_tabmat_compliant(
             X,
@@ -1366,6 +1373,7 @@ def predict(
         offset: Optional[ArrayLike] = None,
         alpha_index: Optional[Union[int, Sequence[int]]] = None,
         alpha: Optional[Union[float, Sequence[float]]] = None,
+        context: Optional[Union[int, Mapping[str, Any]]] = 0,
     ):
         """Predict using GLM with feature matrix ``X``.
@@ -1393,13 +1401,20 @@
             Sets the alpha(s) to use in case ``alpha_search`` is ``True``.
             Incompatible with ``alpha_index`` (see above).

+        context : Optional[Union[int, Mapping[str, Any]]], default=0
+            The context to use for evaluating the formula. If an integer, the
+            context is taken from the stack frame of the caller at the given
+            depth. If a dict, it is used as the context directly.
         Returns
         -------
         array, shape (n_samples, n_alphas)
             Predicted values times ``sample_weight``.
         """
         if isinstance(X, pd.DataFrame):
-            X = self._convert_from_pandas(X)
+            captured_context = capture_context(
+                context + 1 if isinstance(context, int) else context
+            )
+            X = self._convert_from_pandas(X, context=captured_context)

         eta = self.linear_predictor(
             X, offset=offset, alpha_index=alpha_index, alpha=alpha
@@ -1424,6 +1439,7 @@ def coef_table(
         robust=None,
         clusters: np.ndarray = None,
         expected_information=None,
+        context: Optional[Union[int, Mapping[str, Any]]] = 0,
     ):
         """Get a table of the regression coefficients.
@@ -1458,7 +1474,10 @@
             Whether to use the expected or observed information matrix.
             Only relevant when computing robust standard errors.
             If not specified, the model's ``expected_information`` attribute is used.
-
+        context : Optional[Union[int, Mapping[str, Any]]], default=0
+            The context to use for evaluating the formula. If an integer, the
+            context is taken from the stack frame of the caller at the given
+            depth. If a dict, it is used as the context directly.
         Returns
         -------
         pandas.DataFrame
@@ -1472,6 +1491,10 @@
             names = self.feature_names_
             beta = self.coef_

+        captured_context = capture_context(
+            context + 1 if isinstance(context, int) else context
+        )
+
         covariance_matrix = self.covariance_matrix(
             X=X,
             y=y,
@@ -1482,6 +1505,7 @@
             robust=robust,
             clusters=clusters,
             expected_information=expected_information,
+            context=captured_context,
         )

         significance_level = 1 - confidence_level
@@ -1520,6 +1544,7 @@ def wald_test(
         robust=None,
         clusters: np.ndarray = None,
         expected_information=None,
+        context: Optional[Union[int, Mapping[str, Any]]] = 0,
     ) -> WaldTestResult:
         """Compute the Wald test statistic and p-value for a linear hypothesis.
@@ -1577,7 +1602,10 @@
             Whether to use the expected or observed information matrix.
             Only relevant when computing robust standard errors.
             If not specified, the model's ``expected_information`` attribute is used.
-
+        context : Optional[Union[int, Mapping[str, Any]]], default=0
+            The context to use for evaluating the formula. If an integer, the
+            context is taken from the stack frame of the caller at the given
+            depth. If a dict, it is used as the context directly.
         Returns
         -------
         WaldTestResult
@@ -1598,6 +1626,10 @@
                 f"Received {num_lhs_specs} specifications."
) + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + if R is not None: return self._wald_test_matrix( R=R, @@ -1611,6 +1643,7 @@ def wald_test( robust=robust, clusters=clusters, expected_information=expected_information, + context=captured_context, ) if features is not None: @@ -1626,6 +1659,7 @@ def wald_test( robust=robust, clusters=clusters, expected_information=expected_information, + context=captured_context, ) if terms is not None: @@ -1641,6 +1675,7 @@ def wald_test( robust=robust, clusters=clusters, expected_information=expected_information, + context=captured_context, ) if formula is not None: @@ -1657,6 +1692,7 @@ def wald_test( robust=robust, clusters=clusters, expected_information=expected_information, + context=captured_context, ) raise RuntimeError("This should never happen") @@ -1674,6 +1710,7 @@ def _wald_test_matrix( robust=None, clusters: np.ndarray = None, expected_information=None, + context: Optional[Mapping[str, Any]] = None, ) -> WaldTestResult: """Compute the Wald test statistic and p-value for a linear hypothesis. @@ -1714,7 +1751,8 @@ def _wald_test_matrix( Whether to use the expected or observed information matrix. Only relevant when computing robust standard errors. If not specified, the model's ``expected_information`` attribute is used. - + context : Optional[Mapping[str, Any]], default=None + The context to use for evaluating the formula. Returns ------- WaldTestResult @@ -1731,6 +1769,7 @@ def _wald_test_matrix( robust=robust, clusters=clusters, expected_information=expected_information, + context=context, ) if self.fit_intercept: @@ -1778,6 +1817,7 @@ def _wald_test_feature_names( robust=None, clusters: np.ndarray = None, expected_information=None, + context: Optional[Mapping[str, Any]] = None, ) -> WaldTestResult: """Compute the Wald test statistic and p-value for a linear hypothesis. @@ -1815,7 +1855,8 @@ def _wald_test_feature_names( Whether to use the expected or observed information matrix. Only relevant when computing robust standard errors. If not specified, the model's ``expected_information`` attribute is used. - + context : Optional[Mapping[str, Any]], default=None + The context to use for evaluating the formula. Returns ------- WaldTestResult @@ -1859,6 +1900,7 @@ def _wald_test_feature_names( robust=robust, clusters=clusters, expected_information=expected_information, + context=context, ) def _wald_test_formula( @@ -1873,6 +1915,7 @@ def _wald_test_formula( robust=None, clusters: np.ndarray = None, expected_information=None, + context: Optional[Mapping[str, Any]] = None, ) -> WaldTestResult: """Compute the Wald test statistic and p-value for a linear hypothesis. @@ -1907,7 +1950,8 @@ def _wald_test_formula( Whether to use the expected or observed information matrix. Only relevant when computing robust standard errors. If not specified, the model's ``expected_information`` attribute is used. - + context : Optional[Mapping[str, Any]], default=None + The context to use for evaluating the formula. Returns ------- WaldTestResult @@ -1935,6 +1979,7 @@ def _wald_test_formula( robust=robust, clusters=clusters, expected_information=expected_information, + context=context, ) def _wald_test_term_names( @@ -1950,6 +1995,7 @@ def _wald_test_term_names( robust=None, clusters: np.ndarray = None, expected_information=None, + context: Optional[Mapping[str, Any]] = None, ) -> WaldTestResult: """Compute the Wald test statistic and p-value for a linear hypotheses. 
@@ -1992,7 +2038,8 @@ def _wald_test_term_names( Whether to use the expected or observed information matrix. Only relevant when computing robust std-errors. If not specified, the model's ``expected_information`` attribute is used. - + context : Optional[Mapping[str, Any]], default=None + The context to use for evaluating the formula. Returns ------- WaldTestResult @@ -2046,6 +2093,7 @@ def _wald_test_term_names( robust=robust, clusters=clusters, expected_information=expected_information, + context=context, ) def std_errors( @@ -2060,6 +2108,7 @@ def std_errors( clusters: np.ndarray = None, expected_information=None, store_covariance_matrix=False, + context: Optional[Union[int, Mapping[str, Any]]] = 0, ): """Calculate standard errors for generalized linear models. @@ -2095,7 +2144,14 @@ def std_errors( store_covariance_matrix : boolean, optional, default=False Whether to store the covariance matrix in the model instance. If a covariance matrix has already been stored, it will be overwritten. - """ + context : Optional[Union[int, Mapping[str, Any]]], default=0 + The context to use for evaluating the formula. If an integer, the + context is taken from the stack frame of the caller at the given + depth. If a dict, it is used as the context directly.""" + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + return np.sqrt( self.covariance_matrix( X=X, @@ -2108,6 +2164,7 @@ def std_errors( clusters=clusters, expected_information=expected_information, store_covariance_matrix=store_covariance_matrix, + context=captured_context, ).diagonal() ) @@ -2124,6 +2181,7 @@ def covariance_matrix( expected_information=None, store_covariance_matrix=False, skip_checks=False, + context: Optional[Union[int, Mapping[str, Any]]] = 0, ): """Calculate the covariance matrix for generalized linear models. @@ -2158,7 +2216,10 @@ def covariance_matrix( If a covariance matrix has already been stored, it will be overwritten. skip_checks : boolean, optional, default=False Whether to skip input validation. For internal use only. - + context : Optional[Union[int, Mapping[str, Any]]], default=0 + The context to use for evaluating the formula. If an integer, the + context is taken from the stack frame of the caller at the given + depth. If a dict, it is used as the context directly. Notes ----- We support three types of covariance matrices: @@ -2202,6 +2263,10 @@ def covariance_matrix( """ self.covariance_matrix_: Union[np.ndarray, None] + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + if robust is None: _robust = self.robust else: @@ -2272,7 +2337,7 @@ def covariance_matrix( # This has to go first because X is modified in the next line if isinstance(X, pd.DataFrame): - X = self._convert_from_pandas(X) + X = self._convert_from_pandas(X, context=captured_context) X, y = check_X_y_tabmat_compliant( X, @@ -2388,6 +2453,7 @@ def score( y: ShapedArrayLike, sample_weight: Optional[ArrayLike] = None, offset: Optional[ArrayLike] = None, + context: Optional[Union[int, Mapping[str, Any]]] = 0, ): """Compute :math:`D^2`, the percentage of deviance explained. @@ -2414,6 +2480,10 @@ def score( offset : array-like, shape (n_samples,), optional (default=None) + context : Optional[Union[int, Mapping[str, Any]]], default=0 + The context to use for evaluating the formula. If an integer, the + context is taken from the stack frame of the caller at the given + depth. If a dict, it is used as the context directly. 
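
For reference, the quantity documented below is the deviance analogue of R squared; a sketch of its computation in the notation of the method body, where ``family`` is the fitted family instance:

    # D^2 = 1 - D(y, mu_hat) / D(y, y_bar): the share of null deviance explained.
    dev = family.deviance(y, mu, sample_weight=sample_weight)
    y_mean = np.average(y, weights=sample_weight)
    dev_null = family.deviance(y, y_mean, sample_weight=sample_weight)
    d2 = 1 - dev / dev_null
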
Returns ------- float @@ -2422,8 +2492,12 @@ def score( # Note, default score defined in RegressorMixin is R^2 score. # TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + sample_weight = _check_weights(sample_weight, y.shape[0], y.dtype) - mu = self.predict(X, offset=offset) + mu = self.predict(X, offset=offset, context=captured_context) family = get_family(self.family) dev = family.deviance(y, mu, sample_weight=sample_weight) y_mean = np.average(y, weights=sample_weight) @@ -2547,6 +2621,7 @@ def _set_up_and_check_fit_args( offset: Optional[VectorLike], solver: str, force_all_finite, + context: Optional[Mapping[str, Any]] = None, ) -> Tuple[ tm.MatrixBase, np.ndarray, @@ -2584,7 +2659,7 @@ def _set_up_and_check_fit_args( formula=lhs, data=X, include_intercept=False, - context=2, + context=context, ) self.y_model_spec_ = y.model_spec @@ -2598,7 +2673,7 @@ def _set_up_and_check_fit_args( categorical_format=self.categorical_format, interaction_separator=self.interaction_separator, add_column_for_intercept=False, - context=2, # where fit/std_errors/etc. is called from + context=context, ) intercept = "1" in X.model_spec.terms @@ -3242,6 +3317,7 @@ def fit( clusters: Optional[np.ndarray] = None, # TODO: take out weights_sum (or use it properly) weights_sum: Optional[float] = None, + context: Optional[Union[int, Mapping[str, Any]]] = 0, ): """Fit a Generalized Linear Model. @@ -3284,6 +3360,11 @@ def fit( Array with cluster membership. Clustered standard errors are computed if clusters is not None. + context : Optional[Union[int, Mapping[str, Any]]], default=0 + The context to use for evaluating the formula. If an integer, the + context is taken from the stack frame of the caller at the given + depth. If a dict, it is used as the context directly. + weights_sum: float, optional (default=None) Returns @@ -3293,6 +3374,10 @@ def fit( self._validate_hyperparameters() + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + # NOTE: This function checks if all the entries in X and y are # finite. That can be expensive. But probably worthwhile. ( @@ -3310,6 +3395,7 @@ def fit( offset, solver=self.solver, force_all_finite=self.force_all_finite, + context=captured_context, ) assert isinstance(X, tm.MatrixBase) assert isinstance(y, np.ndarray) @@ -3500,6 +3586,7 @@ def _compute_information_criteria( X: ShapedArrayLike, y: ShapedArrayLike, sample_weight: Optional[ArrayLike] = None, + context: Optional[Mapping[str, Any]] = None, ): """ Computes and stores the model's degrees of freedom, the 'aic', 'aicc' @@ -3552,7 +3639,7 @@ def _compute_information_criteria( + "criteria" ) - mu = self.predict(X) + mu = self.predict(X, context=context) ll = self.family_instance.log_likelihood(y, mu, sample_weight=sample_weight) aic = -2 * ll + 2 * k_params @@ -3567,7 +3654,11 @@ def _compute_information_criteria( return True def aic( - self, X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None + self, + X: ArrayLike, + y: ArrayLike, + sample_weight: Optional[ArrayLike] = None, + context: Optional[Union[int, Mapping[str, Any]]] = 0, ): """ Akaike's information criteria. 
Computed as: @@ -3587,11 +3678,25 @@ def aic( sample_weight : array-like, shape (n_samples,), optional (default=None) Same data as used in 'fit' + + context : Optional[Union[int, Mapping[str, Any]]], default=0 + The context to use for evaluating the formula. If an integer, the + context is taken from the stack frame of the caller at the given + depth. If a dict, it is used as the context directly. """ - return self._get_info_criteria("aic", X, y, sample_weight) + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + return self._get_info_criteria( + "aic", X, y, sample_weight, context=captured_context + ) def aicc( - self, X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None + self, + X: ArrayLike, + y: ArrayLike, + sample_weight: Optional[ArrayLike] = None, + context: Optional[Union[int, Mapping[str, Any]]] = 0, ): """ Second-order Akaike's information criteria (or small sample AIC). @@ -3613,8 +3718,18 @@ def aicc( sample_weight : array-like, shape (n_samples,), optional (default=None) Same data as used in 'fit' + + context : Optional[Union[int, Mapping[str, Any]]], default=0 + The context to use for evaluating the formula. If an integer, the + context is taken from the stack frame of the caller at the given + depth. If a dict, it is used as the context directly. """ - aicc = self._get_info_criteria("aicc", X, y, sample_weight) + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + aicc = self._get_info_criteria( + "aicc", X, y, sample_weight, context=captured_context + ) if not aicc: raise ValueError( "Model degrees of freedom should be more than training datapoints." @@ -3622,7 +3737,11 @@ def aicc( return aicc def bic( - self, X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None + self, + X: ArrayLike, + y: ArrayLike, + sample_weight: Optional[ArrayLike] = None, + context: Optional[Union[int, Mapping[str, Any]]] = 0, ): """ Bayesian information criterion. Computed as: @@ -3643,8 +3762,17 @@ def bic( sample_weight : array-like, shape (n_samples,), optional (default=None) Same data as used in 'fit' - """ - return self._get_info_criteria("bic", X, y, sample_weight) + + context : Optional[Union[int, Mapping[str, Any]]], default=0 + The context to use for evaluating the formula. If an integer, the + context is taken from the stack frame of the caller at the given + depth. 
If a dict, it is used as the context directly.""" + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + return self._get_info_criteria( + "bic", X, y, sample_weight, context=captured_context + ) def _get_info_criteria( self, @@ -3652,11 +3780,12 @@ def _get_info_criteria( X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None, + context: Optional[Mapping[str, Any]] = None, ): check_is_fitted(self, "coef_") if not hasattr(self, "_info_criteria"): - self._compute_information_criteria(X, y, sample_weight) + self._compute_information_criteria(X, y, sample_weight, context=context) if ( self.alpha is None or (self.alpha is not None and self.alpha > 0) diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index f9706088..06a63df9 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -1,8 +1,9 @@ import copy -from typing import Optional, Union +from typing import Any, Mapping, Optional, Union import numpy as np from formulaic import FormulaSpec +from formulaic.utils.context import capture_context from joblib import Parallel, delayed from sklearn.model_selection._split import check_cv @@ -423,6 +424,7 @@ def fit( offset: Optional[ArrayLike] = None, store_covariance_matrix: bool = False, clusters: Optional[np.ndarray] = None, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): r""" Choose the best model along a 'regularization path' by cross-validation. @@ -468,6 +470,10 @@ def fit( """ self._validate_hyperparameters() + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + ( X, y, @@ -483,6 +489,7 @@ def fit( offset, solver=self.solver, force_all_finite=self.force_all_finite, + context=captured_context, ) ######### diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 19f20b1d..b10cc904 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -3120,6 +3120,7 @@ def test_formula_context(get_mixed_data): model_smf = smf.glm(formula, data, family=sm.families.Gaussian()).fit() np.testing.assert_almost_equal(beta_formula, model_smf.params) + np.testing.assert_almost_equal(model_formula.predict(data), model_smf.predict(data)) @pytest.mark.parametrize( From dd1a2e80c4c8931140d1d294661e5e1cfc0d207e Mon Sep 17 00:00:00 2001 From: Marc-Antoine Schmidt Date: Fri, 13 Oct 2023 14:07:37 -0400 Subject: [PATCH 16/63] pyupgrade --- src/glum/_glm_cv.py | 3 ++- src/glum/_util.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index 06a63df9..c3b9f378 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -1,5 +1,6 @@ import copy -from typing import Any, Mapping, Optional, Union +from collections.abc import Mapping +from typing import Any, Optional, Union import numpy as np from formulaic import FormulaSpec diff --git a/src/glum/_util.py b/src/glum/_util.py index b8592cbf..faec164e 100644 --- a/src/glum/_util.py +++ b/src/glum/_util.py @@ -1,5 +1,6 @@ import logging -from typing import Sequence, Union +from collections.abc import Sequence +from typing import Union import numpy as np import pandas as pd From fe30a8e4eaae66ee34330dbd0f341e912aaa3d95 Mon Sep 17 00:00:00 2001 From: Marc-Antoine Schmidt Date: Mon, 16 Oct 2023 09:31:30 -0400 Subject: [PATCH 17/63] ensure_full_rank != drop_first --- tests/glm/test_glm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index e8e0a8e4..3edcfbc6 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -2957,6 
+2957,7 @@ def get_mixed_data(): ) +@pytest.mark.skip(reason="Test is not correct") @pytest.mark.parametrize( "formula", [ From b0b2d3e5d4694b2a96933715bd87ec0e43a6ab5c Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 8 Dec 2023 10:03:08 +0100 Subject: [PATCH 18/63] fix --- src/glum/_glm.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 4cfdf523..3afaa54e 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -1410,8 +1410,7 @@ def predict( ) X = self._convert_from_pandas(X, context=captured_context) -<<<<<<< HEAD -======= + X = check_array_tabmat_compliant( X, accept_sparse=["csr", "csc", "coo"], @@ -1421,7 +1420,7 @@ def predict( allow_nd=False, drop_first=getattr(self, "drop_first", False), ) ->>>>>>> main + eta = self.linear_predictor( X, offset=offset, alpha_index=alpha_index, alpha=alpha ) From 16bd92599b64109126aaa40760e54e545c858479 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 8 Dec 2023 10:43:56 +0100 Subject: [PATCH 19/63] move feature name assignment to right spot --- src/glum/_glm.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 3afaa54e..95e7ad42 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -228,6 +228,20 @@ def _check_offset( return offset +def _name_categorical_variables( + categories: tuple[str], column_name: str, drop_first: bool +): + new_names = [ + f"{column_name}__{category}" for category in categories[int(drop_first) :] + ] + if len(new_names) == 0: + raise ValueError( + f"Categorical column: {column_name}, contains only one category. " + + "This should be dropped from the feature matrix." + ) + return new_names + + def _parse_formula( formula: FormulaSpec, include_intercept: bool = True ) -> tuple[Optional[Formula], Formula]: @@ -2696,16 +2710,6 @@ def _set_up_and_check_fit_args( self.term_names_ = list( chain.from_iterable( [term] * len(cols) for term, _, cols in X.model_spec.structure - - if any(X.dtypes == "category"): - self.feature_names_ = list( - chain.from_iterable( - _name_categorical_variables( - dtype.categories, column, getattr(self, "drop_first", False) - ) - if isinstance(dtype, pd.CategoricalDtype) - else [column] - for column, dtype in zip(X.columns, X.dtypes) ) ) @@ -2715,6 +2719,17 @@ def _set_up_and_check_fit_args( self.feature_dtypes_ = X.dtypes.to_dict() if any(X.dtypes == "category"): + + self.feature_names_ = list( + chain.from_iterable( + _name_categorical_variables( + dtype.categories, column, getattr(self, "drop_first", False) + ) + if isinstance(dtype, pd.CategoricalDtype) + else [column] + for column, dtype in zip(X.columns, X.dtypes) + ) + ) def _expand_categorical_penalties(penalty, X, drop_first): """ From a000baaca43137c5c6c2d38ec342d6f4188b7668 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 8 Dec 2023 10:46:44 +0100 Subject: [PATCH 20/63] fix --- src/glum/_glm.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 95e7ad42..8ba34092 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -1424,7 +1424,6 @@ def predict( ) X = self._convert_from_pandas(X, context=captured_context) - X = check_array_tabmat_compliant( X, accept_sparse=["csr", "csc", "coo"], @@ -2719,11 +2718,13 @@ def _set_up_and_check_fit_args( self.feature_dtypes_ = X.dtypes.to_dict() if any(X.dtypes == "category"): - + self.feature_names_ = list( chain.from_iterable( 
_name_categorical_variables( - dtype.categories, column, getattr(self, "drop_first", False) + dtype.categories, + column, + getattr(self, "drop_first", False), ) if isinstance(dtype, pd.CategoricalDtype) else [column] From 2df83c129ef47640f1bdc7f7af63e5f0dd554c3d Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 8 Dec 2023 11:24:48 +0100 Subject: [PATCH 21/63] remove blank line --- src/glum/_glm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 8ba34092..58244353 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -2718,7 +2718,6 @@ def _set_up_and_check_fit_args( self.feature_dtypes_ = X.dtypes.to_dict() if any(X.dtypes == "category"): - self.feature_names_ = list( chain.from_iterable( _name_categorical_variables( From 447c348ec32ed9019e553a7cf51e6cc996047d20 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 8 Dec 2023 17:11:22 +0100 Subject: [PATCH 22/63] bump minimum formulaic version (stateful transforms) --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 47c6a174..f621d424 100644 --- a/environment.yml +++ b/environment.yml @@ -13,7 +13,7 @@ dependencies: - scikit-learn>=0.23 - scipy - tqdm - - formulaic>=0.4 + - formulaic>=0.6 # development tools - black From bb0a188d3d67346a4af7a642e0fb833bd40c6a77 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 8 Dec 2023 17:13:24 +0100 Subject: [PATCH 23/63] improve backward compatibility --- src/glum/_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 58244353..3f16f9bc 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -2662,7 +2662,7 @@ def _set_up_and_check_fit_args( copy_X = self._should_copy_X() if isinstance(X, pd.DataFrame): - if self.formula is not None: + if hasattr(self, "formula") and self.formula is not None: lhs, rhs = _parse_formula( self.formula, include_intercept=self.fit_intercept ) From 512740d8ecd845c16f3f4420fc209e76498bfc7f Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> Date: Tue, 12 Dec 2023 09:55:10 +0100 Subject: [PATCH 24/63] Remove code that is not needed in tabmat v4 / glum v3 (#741) * Remove check_array from predict() We don't need it here as predict calls linear_redictor, and the latter does this check. We can avoid doing it twice. * Remove _name_categorical_variable parts There is no need for those as Tabmat v4 handles variable names internally. --------- Co-authored-by: Martin Stancsics --- src/glum/_glm.py | 36 ------------------------------------ 1 file changed, 36 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 3f16f9bc..5b2f9b0d 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -228,20 +228,6 @@ def _check_offset( return offset -def _name_categorical_variables( - categories: tuple[str], column_name: str, drop_first: bool -): - new_names = [ - f"{column_name}__{category}" for category in categories[int(drop_first) :] - ] - if len(new_names) == 0: - raise ValueError( - f"Categorical column: {column_name}, contains only one category. " - + "This should be dropped from the feature matrix." 
- ) - return new_names - - def _parse_formula( formula: FormulaSpec, include_intercept: bool = True ) -> tuple[Optional[Formula], Formula]: @@ -1424,16 +1410,6 @@ def predict( ) X = self._convert_from_pandas(X, context=captured_context) - X = check_array_tabmat_compliant( - X, - accept_sparse=["csr", "csc", "coo"], - dtype="numeric", - copy=self._should_copy_X(), - ensure_2d=True, - allow_nd=False, - drop_first=getattr(self, "drop_first", False), - ) - eta = self.linear_predictor( X, offset=offset, alpha_index=alpha_index, alpha=alpha ) @@ -2718,18 +2694,6 @@ def _set_up_and_check_fit_args( self.feature_dtypes_ = X.dtypes.to_dict() if any(X.dtypes == "category"): - self.feature_names_ = list( - chain.from_iterable( - _name_categorical_variables( - dtype.categories, - column, - getattr(self, "drop_first", False), - ) - if isinstance(dtype, pd.CategoricalDtype) - else [column] - for column, dtype in zip(X.columns, X.dtypes) - ) - ) def _expand_categorical_penalties(penalty, X, drop_first): """ From ba5597f31fc2182f00507633c5d74858ec95c23d Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> Date: Tue, 9 Jan 2024 15:30:33 +0100 Subject: [PATCH 25/63] Fix formula test: consider presence of intercept in full rankness check when constructing the model matrix externally (#746) * deal with intercept in formula test correctly * naming [skip ci] --- tests/glm/test_glm.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 9d75aeea..590ca678 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -2957,7 +2957,6 @@ def get_mixed_data(): ) -@pytest.mark.skip(reason="Test is not correct") @pytest.mark.parametrize( "formula", [ @@ -2967,17 +2966,17 @@ def get_mixed_data(): pytest.param("y ~ c1", id="categorical"), pytest.param("y ~ c1 + 1", id="categorical_intercept"), pytest.param("y ~ x1 * c1 * c2", id="interaction"), + pytest.param("y ~ x1 + x2 + c1 + c2", id="numeric_and_categorical"), + pytest.param("y ~ x1 + x2 + c1 + c2 + 1", id="numeric_and_categorical_intercept"), ], ) @pytest.mark.parametrize( "drop_first", [True, False], ids=["drop_first", "no_drop_first"] ) def test_formula(get_mixed_data, formula, drop_first): + """Model with formula and model with externally constructed model matrix should match.""" data = get_mixed_data - y_pd, X_pd = formulaic.model_matrix( - formula + " - 1", data, ensure_full_rank=drop_first - ) - y_pd = y_pd.iloc[:, 0] + model_formula = GeneralizedLinearRegressor( family="normal", drop_first=drop_first, @@ -2985,18 +2984,29 @@ def test_formula(get_mixed_data, formula, drop_first): fit_intercept=False, categorical_format="{name}[T.{category}]", ).fit(data) - has_intercept = "1" in model_formula.X_model_spec_.terms - model_pandas = GeneralizedLinearRegressor( + + if has_intercept: + # full rank check must consider presence of intercept + y_ext, X_ext = formulaic.model_matrix( + formula, data, ensure_full_rank=drop_first + ) + X_ext = X_ext.drop(columns="Intercept") + else: + y_ext, X_ext = formulaic.model_matrix( + formula + "-1", data, ensure_full_rank=drop_first + ) + y_ext = y_ext.iloc[:, 0] + model_ext = GeneralizedLinearRegressor( family="normal", drop_first=drop_first, fit_intercept=has_intercept, categorical_format="{name}[T.{category}]", - ).fit(X_pd, y_pd) + ).fit(X_ext, y_ext) - np.testing.assert_almost_equal(model_pandas.coef_, model_formula.coef_) + 
np.testing.assert_almost_equal(model_ext.coef_, model_formula.coef_) np.testing.assert_array_equal( - model_pandas.feature_names_, model_formula.feature_names_ + model_ext.feature_names_, model_formula.feature_names_ ) From fd943e435089331fac6e71caef77fea4bc31bafb Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> Date: Thu, 11 Jan 2024 19:34:07 +0100 Subject: [PATCH 26/63] test varying significance level in coef table test (#749) --- tests/glm/test_glm.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 590ca678..82028a91 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -2102,8 +2102,8 @@ def test_inputtype_std_errors(regression_data, categorical, split, fit_intercept @pytest.mark.parametrize("fit_intercept", [True, False]) -@pytest.mark.parametrize("significance_level", [0.01, 0.05]) -def test_coef_table(regression_data, fit_intercept, significance_level): +@pytest.mark.parametrize("confidence_level", [0.95, 0.99]) +def test_coef_table(regression_data, fit_intercept, confidence_level): X, y = regression_data colnames = ["dog", "cat", "bat", "cow", "eel", "fox", "bee", "owl", "pig", "rat"] X_df = pd.DataFrame(X, columns=colnames) @@ -2120,7 +2120,7 @@ def test_coef_table(regression_data, fit_intercept, significance_level): # Make the covariance matrices the same to focus on the coefficient table mdl.covariance_matrix_ = mdl_sm.fit(cov_type="nonrobust").cov_params() - our_table = mdl.coef_table() + our_table = mdl.coef_table(confidence_level=confidence_level) if fit_intercept: colnames = ["intercept"] + colnames @@ -2131,7 +2131,9 @@ def test_coef_table(regression_data, fit_intercept, significance_level): np.testing.assert_allclose(our_table["t_value"], fit_sm.tvalues, rtol=1e-8) np.testing.assert_allclose(our_table["p_value"], fit_sm.pvalues, atol=1e-8) np.testing.assert_allclose( - our_table[["ci_lower", "ci_upper"]], fit_sm.conf_int(), rtol=1e-8 + our_table[["ci_lower", "ci_upper"]], + fit_sm.conf_int(alpha=1 - confidence_level), + rtol=1e-8, ) @@ -2967,7 +2969,9 @@ def get_mixed_data(): pytest.param("y ~ c1 + 1", id="categorical_intercept"), pytest.param("y ~ x1 * c1 * c2", id="interaction"), pytest.param("y ~ x1 + x2 + c1 + c2", id="numeric_and_categorical"), - pytest.param("y ~ x1 + x2 + c1 + c2 + 1", id="numeric_and_categorical_intercept"), + pytest.param( + "y ~ x1 + x2 + c1 + c2 + 1", id="numeric_and_categorical_intercept" + ), ], ) @pytest.mark.parametrize( From cff6ec4f9f99bd5f79450eff408a2f0ae074851e Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> Date: Mon, 15 Jan 2024 09:12:17 +0100 Subject: [PATCH 27/63] pin formulaic to 0.6 (#752) --- conda.recipe/meta.yaml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 54523bc8..938db6d9 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -35,7 +35,7 @@ requirements: - pandas - scikit-learn >=0.23 - scipy - - formulaic >=0.4 + - formulaic >=0.6 - tabmat >=4.0.0a test: diff --git a/setup.py b/setup.py index 68e6ff3f..cf21ad77 100644 --- a/setup.py +++ b/setup.py @@ -86,7 +86,7 @@ "pandas", "scikit-learn>=0.23", "scipy", - "formulaic>=0.4", + "formulaic>=0.6", "tabmat>=4.0.0a", ], entry_points=None From f6f5d7cf38ec6f6069f7790aa1486f1814abb23d Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher 
<42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> Date: Mon, 15 Jan 2024 09:13:33 +0100 Subject: [PATCH 28/63] Add illustration of formula interface to example in README (#751) * add illustration of formula to readme * rephrase * spacing * add linear term for illustration --- README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 84c9448c..d7cd00f8 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ Why did we choose the name `glum`? We wanted a name that had the letters GLM and >>> >>> _ = model.fit(X=X, y=y) >>> ->>> # .report_diagnostics shows details about the steps taken by the iterative solver +>>> # .report_diagnostics shows details about the steps taken by the iterative solver. >>> diags = model.get_formatted_diagnostics(full_report=True) >>> diags[['objective_fct']] objective_fct @@ -79,6 +79,15 @@ n_iter 3 0.443681 4 0.443498 5 0.443497 +>>> +>>> # Models can also be built with formulas from formulaic. +>>> model_formula = GeneralizedLinearRegressor( +... family='binomial', +... l1_ratio=1.0, +... alpha=0.001, +... formula="bedrooms + np.log(bathrooms + 1) + bs(sqft_living, 3) + C(waterfront)" +... ) +>>> _ = model_formula.fit(X=house_data.data, y=y) ``` From 7d0b8adcda4c49da4bccd2b90cc5028300deacb1 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> Date: Mon, 15 Jan 2024 09:21:44 +0100 Subject: [PATCH 29/63] Determine presence of intercept only by `fit_intercept` argument (#747) * always use self.fit_intercept; raise if formula conflicts with it * wording [skip ci] * adjust other tests, cosmetics * don't compare specs with singular matrix to smf * fix smf test formula * fix intercept in context test * remove outdated sentence; clean up * fix * adjust tutorial * adjust tutorial --- .../formula_interface/formula_interface.ipynb | 147 ++---------------- src/glum/_glm.py | 13 +- tests/glm/test_glm.py | 97 +++++++----- 3 files changed, 81 insertions(+), 176 deletions(-) diff --git a/docs/tutorials/formula_interface/formula_interface.ipynb b/docs/tutorials/formula_interface/formula_interface.ipynb index 9890d28b..acdf50ea 100644 --- a/docs/tutorials/formula_interface/formula_interface.ipynb +++ b/docs/tutorials/formula_interface/formula_interface.ipynb @@ -23,7 +23,7 @@ "Formulas can provide a concise and convenient way to specify many of the usual pre-processing steps, such as converting to categorical types, creating interactions, applying transformations, or even spline interpolation. As an example, consider the following formula:\n", "\n", "```\n", - "{ClaimAmountCut / Exposure} ~ C(DrivAge, missing_method='convert') * C(VehPower, missing_method=\"zero\") + bs(BonusMalus, 3) + 1\n", + "{ClaimAmountCut / Exposure} ~ C(DrivAge, missing_method='convert') * C(VehPower, missing_method=\"zero\") + bs(BonusMalus, 3)\n", "```\n", "\n", "Despite its brevity, it describes all of the following:\n", @@ -32,7 +32,6 @@ " - If there are missing values in `DrivAge`, they should be treated as a separate category.\n", " - On the other hand, missing values in `VehPower` should be treated as all-zero indicators.\n", " - The predictors should also include a third degree B-spline interpolation of `BonusMalus`.\n", - " - The model should include an intercept.\n", "\n", "The following chapters demonstrate each of these features in some detail, as well as some additional advantages of using the formula interface." 
] @@ -59,6 +58,7 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", + "import pytest\n", "import scipy.optimize as optimize\n", "import scipy.stats\n", "from dask_ml.preprocessing import Categorizer\n", @@ -1261,144 +1261,27 @@ "source": [ "### Intercept Term\n", "\n", - "Just like in the case of the non-formula interface, an intercept term is added by default. This can be disabled by either setting the `fit_intercept` parameter to `False`, or adding `+0` or `-1` to the end of the formula. In the case of conflict, a warning is emitted, and the latter takes precedence." + "Just like in the case of the non-formula interface, the presence of an intercept is determined by the `fit_intercept` argument. In case that the formula specifies a different behavior (e.g., adding `+0` or `-1` while `fit_intercept=True`), an error will be raised." ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/stanmart/work/glum/src/glum/_glm.py:2354: UserWarning: The formula explicitly sets the intercept to False, overriding fit_intercept=True.\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "
…HTML rendering of the fitted coefficients elided (a 1 rows × 56 columns DataFrame: intercept, DrivAge and VehPower categories, and their interactions); the text/plain rendering below carries the same values…
" - ], - "text/plain": [ - " intercept DrivAge__0 DrivAge__1 DrivAge__2 DrivAge__3 \\\n", - "coefficient 0.0 1.713298 0.783505 0.205914 0.016085 \n", - "\n", - " DrivAge__4 DrivAge__5 DrivAge__6 VehPower__4 VehPower__5 \\\n", - "coefficient 0.0 0.000094 0.223685 4.66123 4.736272 \n", - "\n", - " ... DrivAge__4__x__VehPower__8 DrivAge__5__x__VehPower__8 \\\n", - "coefficient ... -0.144927 0.001657 \n", - "\n", - " DrivAge__6__x__VehPower__8 DrivAge__0__x__VehPower__9 \\\n", - "coefficient 0.515373 0.714834 \n", - "\n", - " DrivAge__1__x__VehPower__9 DrivAge__2__x__VehPower__9 \\\n", - "coefficient -0.325666 -0.370935 \n", - "\n", - " DrivAge__3__x__VehPower__9 DrivAge__4__x__VehPower__9 \\\n", - "coefficient 0.20417 0.013222 \n", - "\n", - " DrivAge__5__x__VehPower__9 DrivAge__6__x__VehPower__9 \n", - "coefficient -0.273913 0.115693 \n", - "\n", - "[1 rows x 56 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "formula_noint = \"PurePremium ~ DrivAge * VehPower - 1\"\n", "\n", - "t_glm6 = GeneralizedLinearRegressor(\n", - " family=TweedieDist,\n", - " alpha_search=True,\n", - " l1_ratio=1,\n", - " fit_intercept=True,\n", - " formula=formula_noint,\n", - " interaction_separator=\"__x__\",\n", - " categorical_format=\"{name}__{category}\",\n", - ")\n", - "t_glm6.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n", - "\n", - "pd.DataFrame(\n", - " {\"coefficient\": np.concatenate(([t_glm6.intercept_], t_glm6.coef_))},\n", - " index=[\"intercept\"] + t_glm6.feature_names_,\n", - ").T" + "with pytest.raises(ValueError, match=\"The formula sets the intercept to False\"):\n", + " t_glm6 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula_noint,\n", + " interaction_separator=\"__x__\",\n", + " categorical_format=\"{name}__{category}\",\n", + " )" ] }, { diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 5b2f9b0d..932fc44f 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -244,8 +244,7 @@ def _parse_formula( formula : FormulaSpec The formula to parse. include_intercept: bool, default True - Whether to include an intercept column if the formula does not - include (``+ 1``) or exclude (``+ 0`` or ``- 1``) it explicitly. + Whether to include an intercept column. Returns ------- @@ -2673,11 +2672,11 @@ def _set_up_and_check_fit_args( intercept = "1" in X.model_spec.terms if intercept != self.fit_intercept: - warnings.warn( - f"The formula explicitly sets the intercept to {intercept}, " - f"overriding fit_intercept={self.fit_intercept}." + raise ValueError( + f"The formula sets the intercept to {intercept}, " + f"contradicting fit_intercept={self.fit_intercept}. " + "You should use fit_intercept to specify the intercept." ) - self.fit_intercept = intercept self.X_model_spec_ = X.model_spec @@ -3104,6 +3103,7 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): expected_information : bool, optional (default = False) If true, then the expected information matrix is computed by default. Only relevant when computing robust standard errors. + formula : FormulaSpec A formula accepted by formulaic. It can either be a one-sided formula, in which case ``y`` must be specified in ``fit``, or a two-sided formula, in @@ -3130,6 +3130,7 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): - if 'zero', missing values will represent all-zero indicator columns. 
- if 'convert', missing values will be converted to the ``cat_missing_name`` category. + cat_missing_name: str, default='(MISSING)' Name of the category to which missing values will be converted if ``cat_missing_method='convert'``. Only used if ``X`` is a pandas data frame. diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 82028a91..4f83ffe7 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -2962,22 +2962,20 @@ def get_mixed_data(): @pytest.mark.parametrize( "formula", [ - pytest.param("y ~ x1 + x2", id="implicit_no_intercept"), - pytest.param("y ~ x1 + x2 + 1", id="intercept"), - pytest.param("y ~ x1 + x2 - 1", id="no_intercept"), + pytest.param("y ~ x1 + x2", id="numeric"), pytest.param("y ~ c1", id="categorical"), - pytest.param("y ~ c1 + 1", id="categorical_intercept"), - pytest.param("y ~ x1 * c1 * c2", id="interaction"), - pytest.param("y ~ x1 + x2 + c1 + c2", id="numeric_and_categorical"), - pytest.param( - "y ~ x1 + x2 + c1 + c2 + 1", id="numeric_and_categorical_intercept" - ), + pytest.param("y ~ c1 * c2", id="categorical_interaction"), + pytest.param("y ~ x1 + x2 + c1 + c2", id="numeric_categorical"), + pytest.param("y ~ x1 * c1 * c2", id="numeric_categorical_interaction"), ], ) @pytest.mark.parametrize( "drop_first", [True, False], ids=["drop_first", "no_drop_first"] ) -def test_formula(get_mixed_data, formula, drop_first): +@pytest.mark.parametrize( + "fit_intercept", [True, False], ids=["intercept", "no_intercept"] +) +def test_formula(get_mixed_data, formula, drop_first, fit_intercept): """Model with formula and model with externally constructed model matrix should match.""" data = get_mixed_data @@ -2985,12 +2983,11 @@ def test_formula(get_mixed_data, formula, drop_first): family="normal", drop_first=drop_first, formula=formula, - fit_intercept=False, + fit_intercept=fit_intercept, categorical_format="{name}[T.{category}]", ).fit(data) - has_intercept = "1" in model_formula.X_model_spec_.terms - if has_intercept: + if fit_intercept: # full rank check must consider presence of intercept y_ext, X_ext = formulaic.model_matrix( formula, data, ensure_full_rank=drop_first @@ -3001,10 +2998,11 @@ def test_formula(get_mixed_data, formula, drop_first): formula + "-1", data, ensure_full_rank=drop_first ) y_ext = y_ext.iloc[:, 0] + model_ext = GeneralizedLinearRegressor( family="normal", drop_first=drop_first, - fit_intercept=has_intercept, + fit_intercept=fit_intercept, categorical_format="{name}[T.{category}]", ).fit(X_ext, y_ext) @@ -3014,6 +3012,17 @@ def test_formula(get_mixed_data, formula, drop_first): ) +def test_formula_explicit_intercept(get_mixed_data): + data = get_mixed_data + + with pytest.raises(ValueError, match="The formula sets the intercept to False"): + GeneralizedLinearRegressor( + family="normal", + formula="y ~ x1 - 1", + fit_intercept=True, + ).fit(data) + + @pytest.mark.parametrize( "formula, feature_names, term_names", [ @@ -3091,26 +3100,31 @@ def test_formula_names_old_glum_style( @pytest.mark.parametrize( "formula", [ - pytest.param("y ~ x1 + x2", id="implicit_no_intercept"), - pytest.param("y ~ x1 + x2 + 1", id="intercept"), - pytest.param("y ~ x1 + x2 - 1", id="no_intercept"), + pytest.param("y ~ x1 + x2", id="numeric"), pytest.param("y ~ c1", id="categorical"), - pytest.param("y ~ c1 + 1", id="categorical_intercept"), - pytest.param("y ~ c1 * c2", id="interaction"), + pytest.param("y ~ c1 * c2", id="categorical_interaction"), ], ) -def test_formula_against_smf(get_mixed_data, formula): +@pytest.mark.parametrize( + 
"fit_intercept", [True, False], ids=["intercept", "no_intercept"] +) +def test_formula_against_smf(get_mixed_data, formula, fit_intercept): data = get_mixed_data model_formula = GeneralizedLinearRegressor( - family="normal", drop_first=True, formula=formula, alpha=0.0 + family="normal", + drop_first=True, + formula=formula, + alpha=0.0, + fit_intercept=fit_intercept, ).fit(data) - if model_formula.fit_intercept: + if fit_intercept: beta_formula = np.concatenate([[model_formula.intercept_], model_formula.coef_]) else: beta_formula = model_formula.coef_ - model_smf = smf.glm(formula, data, family=sm.families.Gaussian()).fit() + formula_smf = formula + "- 1" if not fit_intercept else formula + model_smf = smf.glm(formula_smf, data, family=sm.families.Gaussian()).fit() np.testing.assert_almost_equal(beta_formula, model_smf.params) @@ -3120,40 +3134,47 @@ def test_formula_context(get_mixed_data): x_context = np.arange(len(data), dtype=float) # noqa: F841 formula = "y ~ x1 + x2 + x_context" model_formula = GeneralizedLinearRegressor( - family="normal", drop_first=True, formula=formula, alpha=0.0 + family="normal", + drop_first=True, + formula=formula, + alpha=0.0, + fit_intercept=True, ).fit(data) - if model_formula.fit_intercept: - beta_formula = np.concatenate([[model_formula.intercept_], model_formula.coef_]) - else: - beta_formula = model_formula.coef_ - model_smf = smf.glm(formula, data, family=sm.families.Gaussian()).fit() - np.testing.assert_almost_equal(beta_formula, model_smf.params) + np.testing.assert_almost_equal( + np.concatenate([[model_formula.intercept_], model_formula.coef_]), + model_smf.params, + ) np.testing.assert_almost_equal(model_formula.predict(data), model_smf.predict(data)) @pytest.mark.parametrize( "formula", [ - pytest.param("y ~ x1 + x2", id="implicit_no_intercept"), - pytest.param("y ~ x1 + x2 + 1", id="intercept"), - pytest.param("y ~ x1 + x2 - 1", id="no_intercept"), + pytest.param("y ~ x1 + x2", id="numeric"), pytest.param("y ~ c1", id="categorical"), - pytest.param("y ~ c1 + 1", id="categorical_intercept"), - pytest.param("y ~ c1 * c2", id="interaction"), + pytest.param("y ~ c1 * c2", id="categorical_interaction"), ], ) -def test_formula_predict(get_mixed_data, formula): +@pytest.mark.parametrize( + "fit_intercept", [True, False], ids=["intercept", "no_intercept"] +) +def test_formula_predict(get_mixed_data, formula, fit_intercept): data = get_mixed_data data_unseen = data.copy() data_unseen.loc[data_unseen["c1"] == "b", "c1"] = "c" model_formula = GeneralizedLinearRegressor( - family="normal", drop_first=True, formula=formula, alpha=0.0 + family="normal", + drop_first=True, + formula=formula, + alpha=0.0, + fit_intercept=fit_intercept, ).fit(data) - model_smf = smf.glm(formula, data, family=sm.families.Gaussian()).fit() + formula_smf = formula + "- 1" if not fit_intercept else formula + model_smf = smf.glm(formula_smf, data, family=sm.families.Gaussian()).fit() yhat_formula = model_formula.predict(data_unseen) yhat_smf = model_smf.predict(data_unseen) From 72971f44776bfd2a8d9bf3b6daf717b07ead7345 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Mon, 15 Jan 2024 10:02:12 +0100 Subject: [PATCH 30/63] consistent linebreaks in docstring --- src/glum/_glm.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 932fc44f..71bd142b 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -1308,6 +1308,7 @@ def linear_predictor( The context to use for evaluating the formula. 
If an integer, the context is taken from the stack frame of the caller at the given depth. If a dict, it is used as the context directly. + Returns ------- array, shape (n_samples, n_alphas) @@ -1398,6 +1399,7 @@ def predict( The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given depth. If a dict, it is used as the context directly. + Returns ------- array, shape (n_samples, n_alphas) @@ -1471,6 +1473,7 @@ def coef_table( The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given depth. If a dict, it is used as the context directly. + Returns ------- pandas.DataFrame @@ -1600,6 +1603,7 @@ def wald_test( The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given depth. If a dict, it is used as the context directly. + Returns ------- WaldTestResult @@ -1747,6 +1751,7 @@ def _wald_test_matrix( If not specified, the model's ``expected_information`` attribute is used. context : Optional[Mapping[str, Any]], default=None The context to use for evaluating the formula. + Returns ------- WaldTestResult @@ -1851,6 +1856,7 @@ def _wald_test_feature_names( If not specified, the model's ``expected_information`` attribute is used. context : Optional[Mapping[str, Any]], default=None The context to use for evaluating the formula. + Returns ------- WaldTestResult @@ -1946,6 +1952,7 @@ def _wald_test_formula( If not specified, the model's ``expected_information`` attribute is used. context : Optional[Mapping[str, Any]], default=None The context to use for evaluating the formula. + Returns ------- WaldTestResult @@ -2034,6 +2041,7 @@ def _wald_test_term_names( If not specified, the model's ``expected_information`` attribute is used. context : Optional[Mapping[str, Any]], default=None The context to use for evaluating the formula. + Returns ------- WaldTestResult @@ -2478,6 +2486,7 @@ def score( The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given depth. If a dict, it is used as the context directly. + Returns ------- float From 6b2b8444ba7883d6ef08434c31dc1de8ecd1bd7f Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Mon, 22 Jan 2024 16:47:03 +0100 Subject: [PATCH 31/63] remove obsolete arg in docstring --- src/glum/_glm.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 71bd142b..d5134d8b 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -3121,11 +3121,6 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): interaction_separator: str, default ":" The separator between the names of interacted variables. - categorical_format: str, default "{name}[T.{category}]" - The format string used to generate the names of categorical variables. - Has to include the placeholders ``{name}`` and ``{category}``. - Only used if ``formula`` is not ``None``. - categorical_format : str, optional, default='{name}[{category}]' Format string for categorical features. 
The format string should contain the placeholder ``{name}`` for the feature name and From 1ad8be2295a175b845afccef26e8009b993374fa Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> Date: Mon, 29 Jan 2024 15:43:41 +0100 Subject: [PATCH 32/63] Informative error when encountering categories that were not seen in training (#748) * drop missings not seen in training * zero not drop * better (?) name [skip ci] * catch case of unseen missings and fail method * fix * respect categorical missing method with formula; test different categorical missing methods also with formula * shorten the tests * dont allow fitting in case of conversion of categoricals and presence of formula * clearer error msg * also change the error msg in the regex (facepalm) * remove matches * fix * better name * describe more restrictive behavior in tutorial * Raise error on unseen levels when predicting * Allow cat_missing_method='convert' again * Update test * Check for unseen categories * Adapt align_df_categories tests to changes * Make pre-commit happy * Avoid unnecessary work * Correctly expand penalties with categoricals and `cat_missing_method="convert"` (#753) * Correctyl expand penalties when cat_missing_method=convert * Add test * Improve variable names Co-authored-by: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> --------- Co-authored-by: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> * bump tabmat pre-release version --------- Co-authored-by: Martin Stancsics --- conda.recipe/meta.yaml | 2 +- environment.yml | 2 +- setup.py | 2 +- src/glum/_glm.py | 42 ++++++++++++++++---- src/glum/_util.py | 22 ++++++++++- tests/glm/test_glm.py | 68 +++++++++++++++++++++++++++---- tests/glm/test_utils.py | 88 ++++++++++++++++++++++++++++++++++++----- 7 files changed, 197 insertions(+), 29 deletions(-) diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 938db6d9..35218f7c 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -36,7 +36,7 @@ requirements: - scikit-learn >=0.23 - scipy - formulaic >=0.6 - - tabmat >=4.0.0a + - tabmat >=4.0.0a3 test: requires: diff --git a/environment.yml b/environment.yml index f621d424..d0d7d172 100644 --- a/environment.yml +++ b/environment.yml @@ -9,7 +9,7 @@ dependencies: - libblas>=0=*mkl # comment this line out for macOS arm64 - numexpr - pandas>=0.21 - - tabmat>=4.0.0a + - tabmat>=4.0.0a3 - scikit-learn>=0.23 - scipy - tqdm diff --git a/setup.py b/setup.py index cf21ad77..515c68c2 100644 --- a/setup.py +++ b/setup.py @@ -87,7 +87,7 @@ "scikit-learn>=0.23", "scipy", "formulaic>=0.6", - "tabmat>=4.0.0a", + "tabmat>=4.0.0a3", ], entry_points=None if os.environ.get("CONDA_BUILD") diff --git a/src/glum/_glm.py b/src/glum/_glm.py index d5134d8b..ca26e4e6 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -879,12 +879,18 @@ def _convert_from_pandas( self, df: pd.DataFrame, context: Optional[Mapping[str, Any]] = None ) -> tm.MatrixBase: """Convert a pandas data frame to a tabmat matrix.""" - if hasattr(self, "X_model_spec_"): return self.X_model_spec_.get_model_matrix(df, context=context) + cat_missing_method_after_alignment = self.cat_missing_method + if hasattr(self, "feature_dtypes_"): - df = _align_df_categories(df, self.feature_dtypes_) + df = _align_df_categories( + df, + self.feature_dtypes_, + self.has_missing_category_, + self.cat_missing_method, + ) if self.cat_missing_method == "convert": df = 
_add_missing_categories( df=df, @@ -893,12 +899,14 @@ def _convert_from_pandas( cat_missing_name=self.cat_missing_name, categorical_format=self.categorical_format, ) + # there should be no missing categories after this + cat_missing_method_after_alignment = "fail" X = tm.from_pandas( df, drop_first=self.drop_first, categorical_format=self.categorical_format, - cat_missing_method=self.cat_missing_method, + cat_missing_method=cat_missing_method_after_alignment, ) return X @@ -2674,6 +2682,7 @@ def _set_up_and_check_fit_args( include_intercept=False, ensure_full_rank=self.drop_first, categorical_format=self.categorical_format, + cat_missing_method=self.cat_missing_method, interaction_separator=self.interaction_separator, add_column_for_intercept=False, context=context, @@ -2700,10 +2709,17 @@ def _set_up_and_check_fit_args( # Maybe TODO: expand categorical penalties with formulas self.feature_dtypes_ = X.dtypes.to_dict() + self.has_missing_category_ = { + col: (self.cat_missing_method == "convert") and X[col].isna().any() + for col, dtype in self.feature_dtypes_.items() + if isinstance(dtype, pd.CategoricalDtype) + } if any(X.dtypes == "category"): - def _expand_categorical_penalties(penalty, X, drop_first): + def _expand_categorical_penalties( + penalty, X, drop_first, has_missing_category + ): """ If P1 or P2 has the same shape as X before expanding the categoricals, we assume that the penalty at the location of @@ -2727,19 +2743,29 @@ def _expand_categorical_penalties(penalty, X, drop_first): chain.from_iterable( [ elmt - for _ in dtype.categories[int(drop_first) :] + for _ in range( + len(dtype.categories) + + has_missing_category[col] + - drop_first + ) ] if pd.api.types.is_categorical_dtype(dtype) else [elmt] - for elmt, dtype in zip(penalty, X.dtypes) + for elmt, (col, dtype) in zip( + penalty, X.dtypes.items() + ) ) ) ) else: return penalty - P1 = _expand_categorical_penalties(self.P1, X, self.drop_first) - P2 = _expand_categorical_penalties(self.P2, X, self.drop_first) + P1 = _expand_categorical_penalties( + self.P1, X, self.drop_first, self.has_missing_category_ + ) + P2 = _expand_categorical_penalties( + self.P2, X, self.drop_first, self.has_missing_category_ + ) X = tm.from_pandas( X, diff --git a/src/glum/_util.py b/src/glum/_util.py index 24b08f40..f5c463ff 100644 --- a/src/glum/_util.py +++ b/src/glum/_util.py @@ -15,7 +15,9 @@ def _asanyarray(x, **kwargs): return x if pd.api.types.is_scalar(x) else np.asanyarray(x, **kwargs) -def _align_df_categories(df, dtypes) -> pd.DataFrame: +def _align_df_categories( + df, dtypes, has_missing_category, cat_missing_method +) -> pd.DataFrame: """Align data types for prediction. 
This function checks that categorical columns have same categories in the @@ -26,6 +28,8 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame: ---------- df : pandas.DataFrame dtypes : Dict[str, Union[str, type, pandas.core.dtypes.base.ExtensionDtype]] + has_missing_category : Dict[str, bool] + missing_method : str """ if not isinstance(df, pd.DataFrame): raise TypeError(f"Expected `pandas.DataFrame'; got {type(df)}.") @@ -47,6 +51,22 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame: changed_dtypes[column] = df[column].cat.set_categories( dtypes[column].categories ) + else: + continue + + if cat_missing_method == "convert" and not has_missing_category[column]: + unseen_categories = set(df[column].unique()) - set( + dtypes[column].categories + ) + else: + unseen_categories = set(df[column].dropna().unique()) - set( + dtypes[column].categories + ) + + if unseen_categories: + raise ValueError( + f"Column {column} contains unseen categories: {unseen_categories}." + ) if changed_dtypes: df = df.assign(**changed_dtypes) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 4f83ffe7..469f464e 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -53,7 +53,7 @@ def get_small_x_y( - estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV] + estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV], ) -> tuple[np.ndarray, np.ndarray]: if isinstance(estimator, GeneralizedLinearRegressor): n_rows = 1 @@ -362,6 +362,43 @@ def test_P1_P2_expansion_with_categoricals(): np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) +def test_P1_P2_expansion_with_categoricals_missings(): + rng = np.random.default_rng(42) + X = pd.DataFrame( + data={ + "dense": np.linspace(0, 10, 60), + "cat": pd.Categorical(rng.integers(5, size=60)).remove_categories(0), + } + ) + y = rng.normal(size=60) + + mdl1 = GeneralizedLinearRegressor( + l1_ratio=0.01, + P1=[1, 2, 2, 2, 2, 2], + P2=[2, 1, 1, 1, 1, 1], + cat_missing_method="convert", + ) + mdl1.fit(X, y) + + mdl2 = GeneralizedLinearRegressor( + l1_ratio=0.01, + P1=[1, 2], + P2=[2, 1], + cat_missing_method="convert", + ) + mdl2.fit(X, y) + np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) + + mdl3 = GeneralizedLinearRegressor( + l1_ratio=0.01, + P1=[1, 2], + P2=sparse.diags([2, 1, 1, 1, 1, 1]), + cat_missing_method="convert", + ) + mdl3.fit(X, y) + np.testing.assert_allclose(mdl1.coef_, mdl3.coef_) + + @pytest.mark.parametrize( "estimator", [GeneralizedLinearRegressor, GeneralizedLinearRegressorCV] ) @@ -3183,40 +3220,55 @@ def test_formula_predict(get_mixed_data, formula, fit_intercept): @pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) -def test_cat_missing(cat_missing_method): +@pytest.mark.parametrize("unseen_missing", [False, True]) +@pytest.mark.parametrize("formula", [None, "cat_1 + cat_2"]) +def test_cat_missing(cat_missing_method, unseen_missing, formula): X = pd.DataFrame( { "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]), "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]), } ) + if unseen_missing: + X = X.dropna() X_unseen = pd.DataFrame( { "cat_1": pd.Categorical([1, pd.NA]), "cat_2": pd.Categorical([1, 2]), } ) - y = np.array([1, 2, 3, 4, 5]) + y = np.array(X.index) model = GeneralizedLinearRegressor( family="normal", cat_missing_method=cat_missing_method, drop_first=False, + formula=formula, fit_intercept=False, ) - - if cat_missing_method == "fail": - with pytest.raises(ValueError): + if cat_missing_method == "fail" and not unseen_missing: + with pytest.raises( + ValueError, 
match="Categorical data can't have missing values" + ): model.fit(X, y) else: model.fit(X, y) feature_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"] - if cat_missing_method == "convert": + if cat_missing_method == "convert" and not unseen_missing: feature_names.insert(2, "cat_1[(MISSING)]") feature_names.append("cat_2[(MISSING)]") np.testing.assert_array_equal(model.feature_names_, feature_names) assert len(model.coef_) == len(feature_names) - model.predict(X_unseen) + if cat_missing_method == "fail" and unseen_missing: + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + model.predict(X_unseen) + elif cat_missing_method == "convert" and unseen_missing: + with pytest.raises(ValueError, match="contains unseen categories"): + model.predict(X_unseen) + else: + model.predict(X_unseen) diff --git a/tests/glm/test_utils.py b/tests/glm/test_utils.py index 36cf988a..61471750 100644 --- a/tests/glm/test_utils.py +++ b/tests/glm/test_utils.py @@ -16,12 +16,15 @@ def df(): "x5": ["a", "b"], "x6": pd.Categorical(["a", "b"]), "x7": pd.Categorical(["a", "b"], categories=["b", "a"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), } ) def test_align_df_categories_numeric(df): dtypes = {column: np.float64 for column in df} + has_missing_category = {column: False for column in df} + missing_method = "fail" expected = pd.DataFrame( { @@ -32,33 +35,41 @@ def test_align_df_categories_numeric(df): "x5": ["a", "b"], "x6": pd.Categorical(["a", "b"]), "x7": pd.Categorical(["a", "b"], categories=["b", "a"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), } ) - pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected) + pd.testing.assert_frame_equal( + _align_df_categories(df, dtypes, has_missing_category, missing_method), expected + ) def test_align_df_categories_categorical(df): + df = df[["x5", "x6", "x7", "x8"]] dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df} + has_missing_category = {column: False for column in df} + missing_method = "fail" expected = pd.DataFrame( { - "x1": [np.nan, np.nan], - "x2": [np.nan, np.nan], - "x3": [np.nan, np.nan], - "x4": [np.nan, np.nan], "x5": pd.Categorical(["a", "b"]), "x6": pd.Categorical(["a", "b"]), "x7": pd.Categorical(["a", "b"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), }, dtype=pd.CategoricalDtype(["a", "b"]), ) - pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected) + pd.testing.assert_frame_equal( + _align_df_categories(df, dtypes, has_missing_category, missing_method), + expected, + ) def test_align_df_categories_excess_columns(df): dtypes = {"x1": np.float64} + has_missing_category = {column: False for column in df} + missing_method = "fail" expected = pd.DataFrame( { @@ -69,14 +80,19 @@ def test_align_df_categories_excess_columns(df): "x5": ["a", "b"], "x6": pd.Categorical(["a", "b"]), "x7": pd.Categorical(["a", "b"], categories=["b", "a"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), } ) - pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected) + pd.testing.assert_frame_equal( + _align_df_categories(df, dtypes, has_missing_category, missing_method), expected + ) def test_align_df_categories_missing_columns(df): dtypes = {"x0": np.float64} + has_missing_category = {column: False for column in df} + missing_method = "fail" expected = pd.DataFrame( { @@ -87,15 +103,69 @@ def test_align_df_categories_missing_columns(df): "x5": ["a", "b"], "x6": pd.Categorical(["a", "b"]), "x7": 
pd.Categorical(["a", "b"], categories=["b", "a"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), } ) - pd.testing.assert_frame_equal(_align_df_categories(df, dtypes), expected) + pd.testing.assert_frame_equal( + _align_df_categories(df, dtypes, has_missing_category, missing_method), expected + ) + + +@pytest.mark.parametrize("has_missings", [False, True]) +def test_align_df_categories_convert(df, has_missings): + df = df[["x5", "x6", "x7", "x8"]] + dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df} + has_missing_category = {column: has_missings for column in df} + missing_method = "convert" + + expected = pd.DataFrame( + { + "x5": pd.Categorical(["a", "b"]), + "x6": pd.Categorical(["a", "b"]), + "x7": pd.Categorical(["a", "b"]), + "x8": pd.Categorical(["a", pd.NA], categories=["b", "a"]), + }, + dtype=pd.CategoricalDtype(["a", "b"]), + ) + + if has_missings: + pd.testing.assert_frame_equal( + _align_df_categories( + df[["x5", "x6", "x7", "x8"]], + dtypes, + has_missing_category, + missing_method, + ), + expected, + ) + else: + with pytest.raises(ValueError, match="contains unseen categories"): + _align_df_categories( + df[["x5", "x6", "x7", "x8"]], + dtypes, + has_missing_category, + missing_method, + ) + + +def test_align_df_categories_raise_on_unseen(df): + dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df} + has_missing_category = {column: False for column in df} + missing_method = "fail" + + with pytest.raises(ValueError, match="contains unseen categories"): + _align_df_categories( + df, + dtypes, + has_missing_category, + missing_method, + ) def test_align_df_categories_not_df(): with pytest.raises(TypeError): - _align_df_categories(np.array([[0], [1]]), {"x0": np.float64}) + _align_df_categories(np.array([[0], [1]]), {"x0": np.float64}, {}, "fail") @pytest.fixture() From 64f2b98ec5e98a5338b587add0d3c57e9af0820a Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Mon, 29 Jan 2024 18:51:04 +0100 Subject: [PATCH 33/63] docstring cosmetics --- src/glum/_glm.py | 50 +++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index ca26e4e6..80366d53 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -232,12 +232,12 @@ def _parse_formula( formula: FormulaSpec, include_intercept: bool = True ) -> tuple[Optional[Formula], Formula]: """ - Parse and transform the formula for use in a GeneralizedLinearRegressor. + Parse and transform the formula for use in a GeneralizedLinearRegressor. The left-hand side and right-hand side of the formula are separated. If an - intercept is present, it is removed from the right-hand side, and a boolean - flag is returned to indicate whether or not an intercept should be added to - the model. + intercept is present, it wil be removed from the right-hand side, and a + boolean flag to indicate whether or not an intercept should be added to + the model will be returned. Parameters ---------- @@ -1315,7 +1315,7 @@ def linear_predictor( context : Optional[Union[int, Mapping[str, Any]]], default=0 The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given - depth. If a dict, it is used as the context directly. + depth. If a dict, it is directly used as the context. Returns ------- @@ -1406,7 +1406,7 @@ def predict( context : Optional[Union[int, Mapping[str, Any]]], default=0 The context to use for evaluating the formula. 
If an integer, the context is taken from the stack frame of the caller at the given - depth. If a dict, it is used as the context directly. + depth. If a dict, it is directly used as the context. Returns ------- @@ -1480,7 +1480,7 @@ def coef_table( context : Optional[Union[int, Mapping[str, Any]]], default=0 The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given - depth. If a dict, it is used as the context directly. + depth. If a dict, it is directly used as the context. Returns ------- @@ -1563,7 +1563,7 @@ def wald_test( The right hand side of the tested hypothesis is specified by ``r``. In the case of a ``terms``-based test, the null hypothesis is that each coefficient - relating to a term is equal to the corresponding value in ``r``. + relating to a term equals the corresponding value in ``r``. Parameters ---------- @@ -1578,7 +1578,7 @@ def wald_test( of the expressions separated by ``+`` signs. Otherwise, a term is one column in the input data. As categorical variables need not be one-hot encoded in glum, in their case, the hypothesis to be tested is that the coefficients - for all of their levels are equal to ``r``. + of all categories are equal to ``r``. r : np.ndarray, optional, default=None The vector representing the values of the linear combination. If None, the test is for whether the linear combinations of the coefficients @@ -1610,7 +1610,7 @@ def wald_test( context : Optional[Union[int, Mapping[str, Any]]], default=0 The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given - depth. If a dict, it is used as the context directly. + depth. If a dict, it is directly used as the context. Returns ------- @@ -2019,7 +2019,7 @@ def _wald_test_term_names( of the expressions separated by ``+`` signs. Otherwise, a term is one column in the input data. As categorical variables need not be one-hot encoded in glum, in their case, the hypothesis to be tested is that the coefficients - for all of their levels are equal to ``r``. + of all categories are equal to ``r``. values: Sequence, optional, default=None The values to which coefficients are compared. If None, the test is for whether the coefficients are zero. @@ -2157,7 +2157,8 @@ def std_errors( context : Optional[Union[int, Mapping[str, Any]]], default=0 The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given - depth. If a dict, it is used as the context directly.""" + depth. If a dict, it is directly used as the context. + """ captured_context = capture_context( context + 1 if isinstance(context, int) else context ) @@ -2200,36 +2201,48 @@ def covariance_matrix( X : {array-like, sparse matrix}, shape (n_samples, n_features), optional Training data. Can be omitted if a covariance matrix has already been computed. + y : array-like, shape (n_samples,), optional Target values. Can be omitted if a covariance matrix has already been computed. + mu : array-like, optional, default=None Array with predictions. Estimated if absent. offset : array-like, optional, default=None Array with additive offsets. + sample_weight : array-like, shape (n_samples,), optional, default=None Individual weights for each sample. + dispersion : float, optional, default=None The dispersion parameter. Estimated if absent. + robust : boolean, optional, default=None + Whether to compute robust standard errors instead of normal ones. 
If not specified, the model's ``robust`` attribute is used. clusters : array-like, optional, default=None + Array with cluster membership. Clustered standard errors are computed if clusters is not None. + expected_information : boolean, optional, default=None Whether to use the expected or observed information matrix. Only relevant when computing robust standard errors. + If not specified, the model's ``expected_information`` attribute is used. store_covariance_matrix : boolean, optional, default=False Whether to store the covariance matrix in the model instance. If a covariance matrix has already been stored, it will be overwritten. + skip_checks : boolean, optional, default=False Whether to skip input validation. For internal use only. + context : Optional[Union[int, Mapping[str, Any]]], default=0 The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given - depth. If a dict, it is used as the context directly. + depth. If a dict, it is directly used as the context. + Notes ----- We support three types of covariance matrices: @@ -2493,7 +2506,7 @@ def score( context : Optional[Union[int, Mapping[str, Any]]], default=0 The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given - depth. If a dict, it is used as the context directly. + depth. If a dict, it is directly used as the context. Returns ------- @@ -3389,7 +3402,7 @@ def fit( context : Optional[Union[int, Mapping[str, Any]]], default=0 The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given - depth. If a dict, it is used as the context directly. + depth. If a dict, it is directly used as the context. weights_sum: float, optional (default=None) @@ -3708,7 +3721,7 @@ def aic( context : Optional[Union[int, Mapping[str, Any]]], default=0 The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given - depth. If a dict, it is used as the context directly. + depth. If a dict, it is directly used as the context. """ captured_context = capture_context( context + 1 if isinstance(context, int) else context @@ -3748,7 +3761,7 @@ def aicc( context : Optional[Union[int, Mapping[str, Any]]], default=0 The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given - depth. If a dict, it is used as the context directly. + depth. If a dict, it is directly used as the context. """ captured_context = capture_context( context + 1 if isinstance(context, int) else context @@ -3792,7 +3805,8 @@ def bic( context : Optional[Union[int, Mapping[str, Any]]], default=0 The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given - depth. If a dict, it is used as the context directly.""" + depth. If a dict, it is directly used as the context. 
+ """ captured_context = capture_context( context + 1 if isinstance(context, int) else context ) From b185fe4830791bef0bf305c4b97a728f15ce2405 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Mon, 29 Jan 2024 18:54:54 +0100 Subject: [PATCH 34/63] even more docstring cosmetics --- src/glum/_glm.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 80366d53..0c1d8b8c 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -235,7 +235,7 @@ def _parse_formula( Parse and transform the formula for use in a GeneralizedLinearRegressor. The left-hand side and right-hand side of the formula are separated. If an - intercept is present, it wil be removed from the right-hand side, and a + intercept is present, it will be removed from the right-hand side, and a boolean flag to indicate whether or not an intercept should be added to the model will be returned. @@ -249,7 +249,8 @@ def _parse_formula( Returns ------- tuple[Formula, Formula] - The left-hand side and right-hand sides of the formula.""" + The left-hand side and right-hand sides of the formula. + """ if isinstance(formula, str): parser = DefaultFormulaParser(include_intercept=include_intercept) terms = parser.get_terms(formula) @@ -2208,6 +2209,7 @@ def covariance_matrix( mu : array-like, optional, default=None Array with predictions. Estimated if absent. + offset : array-like, optional, default=None Array with additive offsets. @@ -2218,19 +2220,18 @@ def covariance_matrix( The dispersion parameter. Estimated if absent. robust : boolean, optional, default=None - Whether to compute robust standard errors instead of normal ones. If not specified, the model's ``robust`` attribute is used. - clusters : array-like, optional, default=None + clusters : array-like, optional, default=None Array with cluster membership. Clustered standard errors are computed if clusters is not None. expected_information : boolean, optional, default=None Whether to use the expected or observed information matrix. Only relevant when computing robust standard errors. - If not specified, the model's ``expected_information`` attribute is used. + store_covariance_matrix : boolean, optional, default=False Whether to store the covariance matrix in the model instance. If a covariance matrix has already been stored, it will be overwritten. 
From 7e86e3f4943f2d86b7dfc9e13d1f840043010a4e Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> Date: Wed, 31 Jan 2024 13:41:52 +0100 Subject: [PATCH 35/63] Do not fail when an estimator misses class members that are new in v3 (#757) * do not fail on missing class members that are new in v3 * simplify * convert * shorten the comment * simplify * don't use getattr unnecessarily * cosmetics * fix unrelated typo --- src/glum/_glm.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 0c1d8b8c..312c243d 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -883,16 +883,16 @@ def _convert_from_pandas( if hasattr(self, "X_model_spec_"): return self.X_model_spec_.get_model_matrix(df, context=context) - cat_missing_method_after_alignment = self.cat_missing_method + cat_missing_method_after_alignment = getattr(self, "cat_missing_method", "fail") if hasattr(self, "feature_dtypes_"): df = _align_df_categories( df, self.feature_dtypes_, - self.has_missing_category_, - self.cat_missing_method, + getattr(self, "has_missing_category_", {}), + cat_missing_method_after_alignment, ) - if self.cat_missing_method == "convert": + if cat_missing_method_after_alignment == "convert": df = _add_missing_categories( df=df, dtypes=self.feature_dtypes_, @@ -906,7 +906,9 @@ def _convert_from_pandas( X = tm.from_pandas( df, drop_first=self.drop_first, - categorical_format=self.categorical_format, + categorical_format=getattr( # convention prior to v3 + self, "categorical_format", "{name}__{category}" + ), cat_missing_method=cat_missing_method_after_alignment, ) @@ -1629,7 +1631,7 @@ def wald_test( ) if num_lhs_specs != 1: raise ValueError( - "Exactly one of R, features terms or formula must be specified. " + "Exactly one of R, features, terms or formula must be specified. " f"Received {num_lhs_specs} specifications." ) @@ -2724,7 +2726,8 @@ def _set_up_and_check_fit_args( self.feature_dtypes_ = X.dtypes.to_dict() self.has_missing_category_ = { - col: (self.cat_missing_method == "convert") and X[col].isna().any() + col: (getattr(self, "cat_missing_method", "fail") == "convert") + and X[col].isna().any() for col, dtype in self.feature_dtypes_.items() if isinstance(dtype, pd.CategoricalDtype) } @@ -2784,9 +2787,11 @@ def _expand_categorical_penalties( X = tm.from_pandas( X, drop_first=self.drop_first, - categorical_format=self.categorical_format, - cat_missing_method=self.cat_missing_method, - cat_missing_name=self.cat_missing_name, + categorical_format=getattr( # convention prior to v3 + self, "categorical_format", "{name}__{category}" + ), + cat_missing_method=getattr(self, "cat_missing_method", "fail"), + cat_missing_name=getattr(self, "cat_missing_name", "(MISSING)"), ) if y is None: From 6816dad7923e17ad2639c4e255d3f0685c68fce3 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Wed, 31 Jan 2024 16:59:25 +0100 Subject: [PATCH 36/63] tiny cosmetics [skip ci] --- src/glum/_glm_cv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index a76bf479..31fd58df 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -314,6 +314,7 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase): - if 'zero', missing values will represent all-zero indicator columns. - if 'convert', missing values will be converted to the ``cat_missing_name`` category. 
+ cat_missing_name: str, default='(MISSING)' Name of the category to which missing values will be converted if ``cat_missing_method='convert'``. Only used if ``X`` is a pandas data frame. From 137d9fb0f5c815558e2ab53af4591b1a4c65f905 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> Date: Thu, 1 Feb 2024 09:40:39 +0100 Subject: [PATCH 37/63] No regularization as default (#758) * set alpha=0 as default * fix docstring * add alpha where needed to avoid LinAlgError * add changelog entry * also set alpha in golden master * change name in persisted file too * set alpha in model_parameters again * don't modify case of no alpha attribute, which is RegressorCV * remove invalid alpha argument * wording --- CHANGELOG.rst | 4 + src/glum/_glm.py | 18 +-- tests/glm/golden_master/simulation_gm.json | 40 +++---- tests/glm/test_distribution.py | 6 - tests/glm/test_glm.py | 122 ++++++++++----------- tests/glm/test_golden_master.py | 34 +++--- 6 files changed, 114 insertions(+), 110 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 72c6933d..0f7b3090 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,6 +10,10 @@ Changelog 3.0.0 - UNRELEASED ------------------ +**Breaking change:** + +- :class:`~glum.GeneralizedLinearRegressor`'s default value for `alpha` is now `0`, i.e. no regularization. + **New features:** - Added a formula interface for specifying models. diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 312c243d..9c9fb53d 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -2304,8 +2304,7 @@ def covariance_matrix( _expected_information = expected_information if ( - (hasattr(self, "alpha") and self.alpha is None) - or ( + ( hasattr(self, "alpha") and isinstance(self.alpha, (int, float)) and self.alpha > 0 @@ -2914,11 +2913,11 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): alpha : {float, array-like}, optional (default=None) Constant that multiplies the penalty terms and thus determines the regularization strength. If ``alpha_search`` is ``False`` (the default), - then ``alpha`` must be a scalar or None (equivalent to ``alpha=1.0``). + then ``alpha`` must be a scalar or None (equivalent to ``alpha=0``). If ``alpha_search`` is ``True``, then ``alpha`` must be an iterable or ``None``. See ``alpha_search`` to find how the regularization path is set if ``alpha`` is ``None``. See the notes for the exact mathematical - meaning of this parameter. ``alpha = 0`` is equivalent to unpenalized + meaning of this parameter. ``alpha=0`` is equivalent to unpenalized GLMs. In this case, the design matrix ``X`` must have full column rank (no collinearities). @@ -3146,10 +3145,11 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): drop_first : bool, optional (default = False) If ``True``, drop the first column when encoding categorical variables. - Set this to True when alpha=0 and solver='auto' to prevent an error due to a - singular feature matrix. In the case of using a formula with interactions, - setting this argument to ``True`` ensures structural full-rankness (it is - equivalent to ``ensure_full_rank`` in formulaic and tabmat). + Set this to True when ``alpha=0`` and ``solver='auto'`` to prevent an error + due to a singular feature matrix. In the case of using a formula with + interactions, setting this argument to ``True`` ensures structural + full-rankness (it is equivalent to ``ensure_full_rank`` in formulaic and + tabmat). 
robust : bool, optional (default = False) If true, then robust standard errors are computed by default. @@ -3573,7 +3573,7 @@ def fit( self.coef_ = self.coef_path_[-1] else: if self.alpha is None: - _alpha = 1.0 + _alpha = 0.0 else: _alpha = self.alpha if _alpha > 0 and self.l1_ratio > 0 and self._solver != "irls-cd": diff --git a/tests/glm/golden_master/simulation_gm.json b/tests/glm/golden_master/simulation_gm.json index 3c69f59b..660413de 100644 --- a/tests/glm/golden_master/simulation_gm.json +++ b/tests/glm/golden_master/simulation_gm.json @@ -1,6 +1,6 @@ { "normal": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.5027665204024282, 0.23449539956055546, @@ -36,7 +36,7 @@ "intercept_": 3.026490229054092, "n_iter_": 1 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.5012056522046088, 0.23528722263235485, @@ -72,7 +72,7 @@ "intercept_": 2.0279948791150764, "n_iter_": 1 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.49784759015593427, 0.23166926058137094, @@ -108,7 +108,7 @@ "intercept_": 2.981778440705444, "n_iter_": 1 }, - "default": { + "regularization": { "coef_": [ 0.4985676422254175, 0.22818569911229844, @@ -1478,7 +1478,7 @@ } }, "poisson": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.9604408672344522, 0.4432562524921413, @@ -1514,7 +1514,7 @@ "intercept_": 1.8189178943867188, "n_iter_": 6 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.9817372866211753, 0.49117907395980553, @@ -1550,7 +1550,7 @@ "intercept_": 1.157828764208921, "n_iter_": 6 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.9693196874148616, 0.46707910961062293, @@ -1586,7 +1586,7 @@ "intercept_": 1.8396971485658087, "n_iter_": 6 }, - "default": { + "regularization": { "coef_": [ 0.9821298947770232, 0.4937841900606277, @@ -2812,7 +2812,7 @@ } }, "gamma": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.4866808417045077, 0.1370793228217412, @@ -2848,7 +2848,7 @@ "intercept_": 5.268950639816242, "n_iter_": 4 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.48972345202083134, 0.24707128799109493, @@ -2884,7 +2884,7 @@ "intercept_": 2.512993119536852, "n_iter_": 4 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.5107634971640694, 0.1783139942111257, @@ -2920,7 +2920,7 @@ "intercept_": 5.272870219406924, "n_iter_": 4 }, - "default": { + "regularization": { "coef_": [ 0.4966531683982075, 0.24896254652599858, @@ -4146,7 +4146,7 @@ } }, "tweedie_p=1.5": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.8740584736837378, 0.39026903329437757, @@ -4182,7 +4182,7 @@ "intercept_": 2.8380327257627473, "n_iter_": 4 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.8592854961617753, 0.42694459825027725, @@ -4218,7 +4218,7 @@ "intercept_": 1.6496674803774887, "n_iter_": 4 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.8763610403720393, 0.4023951463085115, @@ -4254,7 +4254,7 @@ "intercept_": 2.7855262434295343, "n_iter_": 4 }, - "default": { + "regularization": { "coef_": [ 0.860178238544325, 0.43000049156945763, @@ -5480,7 +5480,7 @@ } }, "binomial": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.0645115293284631, 0.03563706184469416, @@ -5516,7 +5516,7 @@ "intercept_": 3.3761974509366994, "n_iter_": 3 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.06396142685405831, 0.03544619397195947, @@ 
-5552,7 +5552,7 @@ "intercept_": 2.007458821879875, "n_iter_": 2 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.059850128940604715, 0.029620907232596274, @@ -5588,7 +5588,7 @@ "intercept_": 3.4202998674202676, "n_iter_": 3 }, - "default": { + "regularization": { "coef_": [ 0.05979957149348005, 0.03233408720147587, diff --git a/tests/glm/test_distribution.py b/tests/glm/test_distribution.py index d241ff07..be2a694a 100644 --- a/tests/glm/test_distribution.py +++ b/tests/glm/test_distribution.py @@ -296,7 +296,6 @@ def test_poisson_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -7.390977 (df=1) regressor = GeneralizedLinearRegressor( - alpha=0, family="poisson", fit_intercept=False, gradient_tol=1e-8, @@ -345,7 +344,6 @@ def test_gamma_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -7.057068 (df=2) regressor = GeneralizedLinearRegressor( - alpha=0, family="gamma", fit_intercept=False, gradient_tol=1e-8, @@ -393,7 +391,6 @@ def test_gaussian_deviance_dispersion_loglihood(family, weighted): # logLik(glm_model) # -7.863404 (df=2) regressor = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=False, gradient_tol=1e-8, @@ -441,7 +438,6 @@ def test_tweedie_deviance_dispersion_loglihood(weighted): # logLiktweedie(glm_model) # -8.35485 regressor = GeneralizedLinearRegressor( - alpha=0, family=TweedieDistribution(1.5), fit_intercept=False, gradient_tol=1e-8, @@ -490,7 +486,6 @@ def test_binomial_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -3.365058 (df=1) regressor = GeneralizedLinearRegressor( - alpha=0, family="binomial", fit_intercept=False, gradient_tol=1e-8, @@ -535,7 +530,6 @@ def test_negative_binomial_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -4.187887 (df=1) regressor = GeneralizedLinearRegressor( - alpha=0, family="negative.binomial", fit_intercept=False, gradient_tol=1e-8, diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 469f464e..08080212 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -203,7 +203,7 @@ def test_gradient_tol_setting(estimator, kwargs, solver, gradient_tol): ) def test_glm_family_argument(f, fam, y, X): """Test GLM family argument set as string.""" - glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + glm = GeneralizedLinearRegressor(family=f).fit(X, y) assert isinstance(glm._family_instance, fam.__class__) @@ -373,6 +373,7 @@ def test_P1_P2_expansion_with_categoricals_missings(): y = rng.normal(size=60) mdl1 = GeneralizedLinearRegressor( + alpha=1.0, l1_ratio=0.01, P1=[1, 2, 2, 2, 2, 2], P2=[2, 1, 1, 1, 1, 1], @@ -381,6 +382,7 @@ def test_P1_P2_expansion_with_categoricals_missings(): mdl1.fit(X, y) mdl2 = GeneralizedLinearRegressor( + alpha=1.0, l1_ratio=0.01, P1=[1, 2], P2=[2, 1], @@ -390,6 +392,7 @@ def test_P1_P2_expansion_with_categoricals_missings(): np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) mdl3 = GeneralizedLinearRegressor( + alpha=1.0, l1_ratio=0.01, P1=[1, 2], P2=sparse.diags([2, 1, 1, 1, 1, 1]), @@ -427,7 +430,10 @@ def test_glm_fit_intercept_argument(estimator, fit_intercept): ) def test_glm_solver_argument(estimator, solver, l1_ratio, y, X): """Test GLM for invalid solver argument.""" - glm = estimator(solver=solver, l1_ratio=l1_ratio) + kwargs = {"solver": solver, "l1_ratio": l1_ratio} + if estimator == GeneralizedLinearRegressor: + kwargs["alpha"] = 1.0 + glm = estimator(**kwargs) with pytest.raises(ValueError): glm.fit(X, y) @@ -476,7 +482,10 @@ def test_glm_warm_start_argument(estimator, 
warm_start): def test_glm_warm_start_with_constant_column(estimator): X, y = make_regression() X[:, 0] = 0 - glm = estimator(warm_start=True) + kwargs = {"warm_start": True} + if estimator == GeneralizedLinearRegressor: + kwargs["alpha"] = 1.0 + glm = estimator(**kwargs) glm.fit(X, y) glm.fit(X, y) @@ -575,7 +584,6 @@ def test_glm_identity_regression(solver, fit_intercept, offset, convert_x_fn): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) + (0 if offset is None else offset) glm = GeneralizedLinearRegressor( - alpha=0, family="normal", link="identity", fit_intercept=fit_intercept, @@ -695,7 +703,6 @@ def test_x_not_modified_inplace(solver, fit_intercept, offset, convert_x_fn): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) + (0 if offset is None else offset) glm = GeneralizedLinearRegressor( - alpha=0, family="normal", link="identity", fit_intercept=fit_intercept, @@ -737,7 +744,6 @@ def test_glm_identity_regression_categorical_data(solver, offset, convert_x_fn): y = np.dot(x_mat, coef) + (0 if offset is None else offset) glm = GeneralizedLinearRegressor( - alpha=0, family="normal", link="identity", fit_intercept=False, @@ -776,7 +782,6 @@ def test_glm_log_regression(family, solver, tol, fit_intercept, offset): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef) + (0 if offset is None else offset)) glm = GeneralizedLinearRegressor( - alpha=0, family=family, link="log", fit_intercept=fit_intercept, @@ -1250,7 +1255,6 @@ def test_binomial_cloglog_unregularized(solver): sm_fit = sm_glm.fit() glum_glm = GeneralizedLinearRegressor( - alpha=0, family="binomial", link="cloglog", solver=solver, @@ -1312,11 +1316,11 @@ def test_binomial_enet(alpha): @pytest.mark.parametrize( "params", [ - {"solver": "irls-ls"}, - {"solver": "lbfgs"}, - {"solver": "trust-constr"}, - {"solver": "irls-cd", "selection": "cyclic"}, - {"solver": "irls-cd", "selection": "random"}, + {"solver": "irls-ls", "alpha": 1.0}, + {"solver": "lbfgs", "alpha": 1.0}, + {"solver": "trust-constr", "alpha": 1.0}, + {"solver": "irls-cd", "selection": "cyclic", "alpha": 1.0}, + {"solver": "irls-cd", "selection": "random", "alpha": 1.0}, ], ids=lambda params: ", ".join(f"{key}={val}" for key, val in params.items()), ) @@ -1328,7 +1332,7 @@ def test_solver_equivalence(params, use_offset, regression_data): offset = np.random.random(len(y)) else: offset = None - est_ref = GeneralizedLinearRegressor(random_state=2) + est_ref = GeneralizedLinearRegressor(random_state=2, alpha=1.0) est_ref.fit(X, y, offset=offset) est_2 = GeneralizedLinearRegressor(**params) @@ -1803,7 +1807,7 @@ def test_passing_noncontiguous_as_X(): ) def test_feature_names_underscores(X, feature_names): model = GeneralizedLinearRegressor( - family="poisson", categorical_format="{name}__{category}" + family="poisson", categorical_format="{name}__{category}", alpha=1.0 ).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, "feature_names_", None), feature_names) @@ -1850,7 +1854,7 @@ def test_feature_names_underscores(X, feature_names): ) def test_feature_names_brackets(X, feature_names): model = GeneralizedLinearRegressor( - family="poisson", categorical_format="{name}[{category}]" + family="poisson", categorical_format="{name}[{category}]", alpha=1.0 ).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, "feature_names_", None), feature_names) @@ -1891,7 +1895,7 @@ def test_feature_names_brackets(X, feature_names): ], ) def test_term_names(X, term_names): - model = 
GeneralizedLinearRegressor(family="poisson").fit(X, np.arange(5)) + model = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, "term_names_", None), term_names) @@ -1907,7 +1911,7 @@ def test_term_names(X, term_names): ], ) def test_feature_dtypes(X, dtypes): - model = GeneralizedLinearRegressor(family="poisson").fit(X, np.arange(5)) + model = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, "feature_dtypes_", None), dtypes) @@ -1930,12 +1934,12 @@ def test_categorical_types(k, n): # use categorical types X_cat = pd.DataFrame({"group": pd.Categorical(group, categories=categories)}) - model_cat = GeneralizedLinearRegressor(family="poisson").fit(X_cat, y) + model_cat = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X_cat, y) pred_cat = model_cat.predict(X_cat) # use one-hot encoding X_oh = pd.get_dummies(X_cat, dtype=float) - model_oh = GeneralizedLinearRegressor(family="poisson").fit(X_oh, y) + model_oh = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X_oh, y) pred_oh = model_oh.predict(X_oh) # check predictions @@ -1986,7 +1990,7 @@ def test_verbose(regression_data, capsys): def test_ols_std_errors(regression_data): X, y = regression_data - mdl = GeneralizedLinearRegressor(alpha=0, family="normal") + mdl = GeneralizedLinearRegressor(family="normal") mdl.fit(X=X, y=y) mdl_sm = sm.OLS(endog=y, exog=sm.add_constant(X)) @@ -2029,9 +2033,9 @@ def test_array_std_errors(regression_data, family, fit_intercept): sm_family = sm.families.Gaussian() dispersion = None - mdl = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=fit_intercept - ).fit(X=X, y=y) + mdl = GeneralizedLinearRegressor(family=family, fit_intercept=fit_intercept).fit( + X=X, y=y + ) if fit_intercept: mdl_sm = sm.GLM(endog=y, exog=sm.add_constant(X), family=sm_family) @@ -2063,7 +2067,7 @@ def test_array_std_errors(regression_data, family, fit_intercept): def test_sparse_std_errors(regression_data): X, y = regression_data sp_X = sparse.csc_matrix(X) - mdl = GeneralizedLinearRegressor(alpha=0, family="normal") + mdl = GeneralizedLinearRegressor(family="normal") mdl.fit(X=X, y=y) actual1 = mdl.std_errors(X=sp_X, y=y, robust=False) @@ -2105,9 +2109,7 @@ def test_inputtype_std_errors(regression_data, categorical, split, fit_intercept tm.CategoricalMatrix(pd.Categorical(group, categories=categories)), ] ) - mdl = GeneralizedLinearRegressor( - alpha=0, family="normal", fit_intercept=fit_intercept - ) + mdl = GeneralizedLinearRegressor(family="normal", fit_intercept=fit_intercept) mdl.fit(X=X, y=y) if isinstance(X, tm.MatrixBase): X_sm = X.toarray() @@ -2146,7 +2148,7 @@ def test_coef_table(regression_data, fit_intercept, confidence_level): X_df = pd.DataFrame(X, columns=colnames) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=fit_intercept + family="gaussian", fit_intercept=fit_intercept ).fit(X=X_df, y=y) if fit_intercept: @@ -2209,9 +2211,9 @@ def test_wald_test_matrix(regression_data, family, fit_intercept, R, r): sm_family = sm.families.Gaussian() dispersion = None - mdl = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=fit_intercept - ).fit(X=X, y=y) + mdl = GeneralizedLinearRegressor(family=family, fit_intercept=fit_intercept).fit( + X=X, y=y + ) if fit_intercept: mdl_sm = sm.GLM(endog=y, exog=sm.add_constant(X), family=sm_family) @@ -2283,9 +2285,9 @@ def test_wald_test_matrix(regression_data, 
family, fit_intercept, R, r): def test_wald_test_matrix_public(regression_data, R, r): X, y = regression_data - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True - ).fit(X=X, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True).fit( + X=X, y=y, store_covariance_matrix=True + ) assert mdl._wald_test_matrix(R, r) == mdl.wald_test(R=R, r=r) @@ -2306,9 +2308,9 @@ def test_wald_test_matrix_public(regression_data, R, r): def test_wald_test_matrix_fixed_cov(regression_data, R, r): X, y = regression_data - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=False - ).fit(X=X, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=False).fit( + X=X, y=y, store_covariance_matrix=True + ) mdl_sm = sm.GLM(endog=y, exog=X, family=sm.families.Gaussian()) # Use the same covariance matrix for both so that we can use tighter tolerances @@ -2351,9 +2353,9 @@ def test_wald_test_feature_names(regression_data, names, R, r): X, y = regression_data X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True - ).fit(X=X_df, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True).fit( + X=X_df, y=y, store_covariance_matrix=True + ) feature_names_results = mdl._wald_test_feature_names(names, r) if r is not None: @@ -2392,9 +2394,9 @@ def test_wald_test_feature_names_public(regression_data, names, r): X, y = regression_data X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True - ).fit(X=X_df, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True).fit( + X=X_df, y=y, store_covariance_matrix=True + ) assert mdl._wald_test_feature_names(names, r) == mdl.wald_test(features=names, r=r) @@ -2449,7 +2451,7 @@ def test_wald_test_term_names(regression_data, names, R, r, r_feat): X_df = X_df[["col_1", "col_2"]].assign(term_3=pd.cut(X_df["col_3"], bins=5)) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True, drop_first=True + family="gaussian", fit_intercept=True, drop_first=True ).fit(X=X_df, y=y, store_covariance_matrix=True) term_names_results = mdl._wald_test_term_names(names, r) @@ -2515,7 +2517,7 @@ def test_wald_test_term_names_public(regression_data, names, R, r, r_feat): X_df = X_df[["col_1", "col_2"]].assign(term_3=pd.cut(X_df["col_3"], bins=5)) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True, drop_first=True + family="gaussian", fit_intercept=True, drop_first=True ).fit(X=X_df, y=y, store_covariance_matrix=True) term_names_results = mdl.wald_test(terms=names, r=r) @@ -2554,7 +2556,7 @@ def test_wald_test_formula(regression_data, formula, R, r_feat): X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True, drop_first=True + family="gaussian", fit_intercept=True, drop_first=True ).fit(X=X_df, y=y, store_covariance_matrix=True) term_names_results = mdl._wald_test_formula(formula) @@ -2599,7 +2601,7 @@ def test_wald_test_formula_public(regression_data, formula, R, r_feat): X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) mdl = GeneralizedLinearRegressor( - alpha=0, 
family="gaussian", fit_intercept=True, drop_first=True + family="gaussian", fit_intercept=True, drop_first=True ).fit(X=X_df, y=y, store_covariance_matrix=True) term_names_results = mdl.wald_test(formula=formula) @@ -2617,7 +2619,7 @@ def test_wald_test_formula_public(regression_data, formula, R, r_feat): def test_wald_test_raise_on_wrong_input(regression_data): X, y = regression_data - mdl = GeneralizedLinearRegressor(alpha=0, family="gaussian", fit_intercept=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True) mdl.fit(X=X, y=y) with pytest.raises(ValueError): @@ -2632,7 +2634,6 @@ def test_wald_test_raise_on_wrong_input(regression_data): @pytest.mark.parametrize("weighted", [False, True]) def test_score_method(as_data_frame, offset, weighted): regressor = GeneralizedLinearRegressor( - alpha=0, family="normal", fit_intercept=False, gradient_tol=1e-8, @@ -2666,7 +2667,7 @@ def test_score_method(as_data_frame, offset, weighted): def test_information_criteria(regression_data): X, y = regression_data - regressor = GeneralizedLinearRegressor(family="gaussian", alpha=0) + regressor = GeneralizedLinearRegressor(family="gaussian") regressor.fit(X, y) llf = regressor.family_instance.log_likelihood(y, regressor.predict(X)) @@ -2721,10 +2722,10 @@ def test_drop_first_allows_alpha_equals_0(): rng = np.random.default_rng(42) y = np.random.normal(size=10) X = pd.DataFrame(data={"cat": pd.Categorical(rng.integers(2, size=10))}) - regressor = GeneralizedLinearRegressor(alpha=0, drop_first=True) + regressor = GeneralizedLinearRegressor(drop_first=True) regressor.fit(X, y) - regressor = GeneralizedLinearRegressor(alpha=0) # default is False + regressor = GeneralizedLinearRegressor() # default is False with pytest.raises(np.linalg.LinAlgError): regressor.fit(X, y) @@ -2732,7 +2733,7 @@ def test_drop_first_allows_alpha_equals_0(): def test_dropping_distinct_categorical_column(): y = np.random.normal(size=10) X = pd.DataFrame(data={"cat": pd.Categorical(np.ones(10)), "num": np.ones(10)}) - regressor = GeneralizedLinearRegressor(alpha=0, drop_first=True) + regressor = GeneralizedLinearRegressor(drop_first=True) regressor.fit(X, y) assert regressor.coef_.shape == (1,) assert regressor.feature_names_ == ["num"] @@ -2769,7 +2770,6 @@ def test_store_covariance_matrix( regressor = GeneralizedLinearRegressor( family="gaussian", - alpha=0, robust=robust, expected_information=expected_information, ) @@ -2804,7 +2804,6 @@ def test_store_covariance_matrix_formula(regression_data, formula): regressor = GeneralizedLinearRegressor( formula=formula, family="gaussian", - alpha=0, ) regressor.fit(df, y, store_covariance_matrix=True) @@ -2827,7 +2826,6 @@ def test_store_covariance_matrix_formula_errors(regression_data): regressor = GeneralizedLinearRegressor( formula=formula, family="gaussian", - alpha=0, ) regressor.fit(df, y) with pytest.raises(ValueError, match="Either X and y must be provided"): @@ -2837,7 +2835,7 @@ def test_store_covariance_matrix_formula_errors(regression_data): def test_store_covariance_matrix_errors(regression_data): X, y = regression_data - regressor = GeneralizedLinearRegressor(family="gaussian", alpha=0) + regressor = GeneralizedLinearRegressor(family="gaussian") regressor.fit(X, y, store_covariance_matrix=False) with pytest.raises(ValueError, match="Either X and y must be provided"): @@ -3022,6 +3020,7 @@ def test_formula(get_mixed_data, formula, drop_first, fit_intercept): formula=formula, fit_intercept=fit_intercept, categorical_format="{name}[T.{category}]", + 
alpha=1.0, ).fit(data) if fit_intercept: @@ -3041,6 +3040,7 @@ def test_formula(get_mixed_data, formula, drop_first, fit_intercept): drop_first=drop_first, fit_intercept=fit_intercept, categorical_format="{name}[T.{category}]", + alpha=1.0, ).fit(X_ext, y_ext) np.testing.assert_almost_equal(model_ext.coef_, model_formula.coef_) @@ -3091,6 +3091,7 @@ def test_formula_names_formulaic_style( formula=formula, categorical_format="{name}[T.{category}]", interaction_separator=":", + alpha=1.0, ).fit(data) np.testing.assert_array_equal(model_formula.feature_names_, feature_names) @@ -3128,6 +3129,7 @@ def test_formula_names_old_glum_style( formula=formula, categorical_format="{name}__{category}", interaction_separator="__x__", + alpha=1.0, ).fit(data) np.testing.assert_array_equal(model_formula.feature_names_, feature_names) @@ -3151,7 +3153,6 @@ def test_formula_against_smf(get_mixed_data, formula, fit_intercept): family="normal", drop_first=True, formula=formula, - alpha=0.0, fit_intercept=fit_intercept, ).fit(data) @@ -3174,7 +3175,6 @@ def test_formula_context(get_mixed_data): family="normal", drop_first=True, formula=formula, - alpha=0.0, fit_intercept=True, ).fit(data) @@ -3206,7 +3206,6 @@ def test_formula_predict(get_mixed_data, formula, fit_intercept): family="normal", drop_first=True, formula=formula, - alpha=0.0, fit_intercept=fit_intercept, ).fit(data) @@ -3245,6 +3244,7 @@ def test_cat_missing(cat_missing_method, unseen_missing, formula): drop_first=False, formula=formula, fit_intercept=False, + alpha=1.0, ) if cat_missing_method == "fail" and not unseen_missing: with pytest.raises( diff --git a/tests/glm/test_golden_master.py b/tests/glm/test_golden_master.py index ac00f91b..4f70c6a7 100644 --- a/tests/glm/test_golden_master.py +++ b/tests/glm/test_golden_master.py @@ -100,25 +100,33 @@ def expected_all(): gm_model_parameters = { - "default": {}, # default params - "half-regularization": {"alpha": 0.5}, # regularization (other than alpha = 1) - "elastic-net": {"l1_ratio": 0.5}, # elastic-net - "lasso": {"l1_ratio": 1}, # lasso + # TODO add an unregularized case + "regularization": {"alpha": 1.0}, # default prior to v3 + "half-regularization": {"alpha": 0.5}, # regularization with alpha = 0.5 + "elastic-net": {"l1_ratio": 0.5, "alpha": 1.0}, # elastic-net + "lasso": {"l1_ratio": 1, "alpha": 1.0}, # lasso "variable_p1": { "l1_ratio": 1, "P1": np.arange(30) / 10, + "alpha": 1.0, }, # lasso with variable penalty "variable_p2": { "l1_ratio": 0, "P2": _make_P2(), + "alpha": 1.0, }, # ridge with Tikhonov regularization "variable_p1_p2": { "l1_ratio": 0.5, "P1": np.arange(30) / 10, "P2": _make_P2(), + "alpha": 1.0, }, # elastic net with P1 and P2 variable penalty - "fit_intercept": {"fit_intercept": False}, # do not fit the intercept - "bounds": {"lower_bounds": np.full(30, 0), "upper_bounds": np.full(30, 0.4)}, + "fit_intercept": {"fit_intercept": False, "alpha": 1.0}, # do not fit the intercept + "bounds": { + "lower_bounds": np.full(30, 0), + "upper_bounds": np.full(30, 0.4), + "alpha": 1.0, + }, } @@ -207,13 +215,13 @@ def test_gm_storage(distribution, data_all_storage, expected_all): model = fit_model( data=data, family=distribution, - model_parameters={}, + model_parameters={"alpha": 1.0}, use_weights=False, use_offset=False, cv=False, ) - run_name = "default" + run_name = "regularization" expected = expected_all[distribution][run_name] assert_gm_allclose(model, expected) @@ -226,7 +234,7 @@ def test_gm_custom_link(family_link, use_weights, use_offset, data_all, expected """Currently 
only testing log-linear model.""" distribution, link = family_link data = data_all[distribution] - model_parameters = {"link": link} + model_parameters = {"link": link, "alpha": 1.0} model = fit_model( data=data, family=distribution, @@ -257,9 +265,7 @@ def test_gm_approx_hessian( distribution, use_weights, use_offset, data_all, expected_all ): data = data_all[distribution] - model_parameters = { - "hessian_approx": 0.1, - } + model_parameters = {"hessian_approx": 0.1, "alpha": 1.0} model = fit_model( data=data, family=distribution, @@ -269,7 +275,7 @@ def test_gm_approx_hessian( cv=False, ) - run_name = "default" + run_name = "regularization" if use_weights: run_name = f"{run_name}_weights" if use_offset: @@ -445,7 +451,7 @@ def run_and_store_golden_master( for use_offset in [True, False]: gm_dict = run_and_store_golden_master( distribution=dist, - model_parameters={"link": link}, + model_parameters={"link": link, "alpha": 1.0}, run_name=f"custom-{dist}-{link}", use_weights=use_weights, use_offset=use_offset, From 4af7de69106155161407dbac048b419619cdcb89 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 1 Feb 2024 14:51:01 +0100 Subject: [PATCH 38/63] Improve code readability --- src/glum/_glm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 9c9fb53d..e490550d 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -940,7 +940,9 @@ def _set_up_for_fit(self, y: np.ndarray) -> None: elif (self.lower_bounds is None) and (self.upper_bounds is None): if np.all(np.asarray(self.l1_ratio) == 0): self._solver = "irls-ls" - elif getattr(self, "alpha", 1) == 0 and not self.alpha_search: + elif ( + hasattr(self, "alpha") and self.alpha == 0 and not self.alpha_search + ): self._solver = "irls-ls" else: self._solver = "irls-cd" From 3ce7fc0e4186fdf92a46f7a29471b23a8266404c Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> Date: Tue, 20 Feb 2024 19:20:16 +0100 Subject: [PATCH 39/63] Make arguments to public methods except `X`, `y`, `sample_weight` and `offset` keyword-only and make initialization keyword-only (#764) * make all args except X, y, sample_weight, offset keyword only; make initialization keyword only * add changelog [skip ci] * mention that also RegressorBase was changed [skip ci] --- CHANGELOG.rst | 4 +- src/glum/_glm.py | 157 +++++++++++++++++++++++------------------- src/glum/_glm_cv.py | 6 +- tests/glm/test_glm.py | 6 +- 4 files changed, 97 insertions(+), 76 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0f7b3090..58d9e4f7 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,8 +10,10 @@ Changelog 3.0.0 - UNRELEASED ------------------ -**Breaking change:** +**Breaking changes:** +- All arguments to :class:`~glum.GeneralizedLinearRegressorBase`, :class:`~glum.GeneralizedLinearRegressor`, and :class:`GeneralizedLinearRegressorCV` are now keyword-only. +- All arguments to public methods of :class:`~glum.GeneralizedLinearRegressorBase`, :class:`~glum.GeneralizedLinearRegressor` or :class:`GeneralizedLinearRegressorCV` except `X`, `y`, `sample_weight`, and `offset` are now keyword-only. - :class:`~glum.GeneralizedLinearRegressor`'s default value for `alpha` is now `0`, i.e. no regularization. 
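Taken together, the breaking changes above imply usage like the following
hypothetical sketch (the Poisson family and penalty strength are arbitrary):

    from glum import GeneralizedLinearRegressor

    # v3: constructor arguments are keyword-only and alpha defaults to 0,
    # so any regularization must now be requested explicitly.
    model = GeneralizedLinearRegressor(family="poisson", alpha=1.0)

    # v2-style positional calls such as GeneralizedLinearRegressor("poisson")
    # raise a TypeError in v3.
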
**New features:** diff --git a/src/glum/_glm.py b/src/glum/_glm.py index bb9b6ae9..67eb8332 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -730,6 +730,7 @@ class GeneralizedLinearRegressorBase(BaseEstimator, RegressorMixin): def __init__( self, + *, l1_ratio: float = 0, P1="identity", P2: Union[str, np.ndarray, sparse.spmatrix] = "identity", @@ -1204,7 +1205,7 @@ def _solve_regularization_path( return self.coef_path_ def report_diagnostics( - self, full_report: bool = False, custom_columns: Optional[Iterable] = None + self, *, full_report: bool = False, custom_columns: Optional[Iterable] = None ) -> None: """Print diagnostics to ``stdout``. @@ -1218,7 +1219,9 @@ def report_diagnostics( custom_columns : iterable, optional (default=None) Print only the specified columns. """ - diagnostics = self.get_formatted_diagnostics(full_report, custom_columns) + diagnostics = self.get_formatted_diagnostics( + full_report=full_report, custom_columns=custom_columns + ) if isinstance(diagnostics, str): print(diagnostics) return @@ -1230,9 +1233,9 @@ def report_diagnostics( print(diagnostics) def get_formatted_diagnostics( - self, full_report: bool = False, custom_columns: Optional[Iterable] = None + self, *, full_report: bool = False, custom_columns: Optional[Iterable] = None ) -> Union[str, pd.DataFrame]: - """Get formatted diagnostics; can be printed with _report_diagnostics. + """Get formatted diagnostics which can be printed with report_diagnostics. Parameters ---------- @@ -1290,6 +1293,7 @@ def linear_predictor( self, X: ArrayLike, offset: Optional[ArrayLike] = None, + *, alpha_index: Optional[Union[int, Sequence[int]]] = None, alpha: Optional[Union[float, Sequence[float]]] = None, context: Optional[Union[int, Mapping[str, Any]]] = 0, @@ -1378,6 +1382,7 @@ def predict( X: ShapedArrayLike, sample_weight: Optional[ArrayLike] = None, offset: Optional[ArrayLike] = None, + *, alpha_index: Optional[Union[int, Sequence[int]]] = None, alpha: Optional[Union[float, Sequence[float]]] = None, context: Optional[Union[int, Mapping[str, Any]]] = 0, @@ -1437,12 +1442,13 @@ def predict( def coef_table( self, - confidence_level=0.95, X=None, y=None, - mu=None, - offset=None, sample_weight=None, + offset=None, + *, + confidence_level=0.95, + mu=None, dispersion=None, robust=None, clusters: np.ndarray = None, @@ -1540,16 +1546,17 @@ def coef_table( def wald_test( self, + X=None, + y=None, + sample_weight=None, + offset=None, + *, R: Optional[np.ndarray] = None, features: Optional[Union[str, list[str]]] = None, terms: Optional[Union[str, list[str]]] = None, formula: Optional[str] = None, r: Optional[Sequence] = None, - X=None, - y=None, mu=None, - offset=None, - sample_weight=None, dispersion=None, robust=None, clusters: np.ndarray = None, @@ -1572,6 +1579,16 @@ def wald_test( Parameters ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features), optional + Training data. Can be omitted if a covariance matrix has already + been computed. + y : array-like, shape (n_samples,), optional + Target values. Can be omitted if a covariance matrix has already + been computed. + sample_weight : array-like, shape (n_samples,), optional, default=None + Individual weights for each sample. + offset : array-like, optional, default=None + Array with additive offsets. R : np.ndarray, optional, default=None The restriction matrix representing the linear combination of coefficients to test. @@ -1588,18 +1605,8 @@ def wald_test( The vector representing the values of the linear combination. 
If None, the test is for whether the linear combinations of the coefficients are zero. - X : {array-like, sparse matrix}, shape (n_samples, n_features), optional - Training data. Can be omitted if a covariance matrix has already - been computed. - y : array-like, shape (n_samples,), optional - Target values. Can be omitted if a covariance matrix has already - been computed. mu : array-like, optional, default=None Array with predictions. Estimated if absent. - offset : array-like, optional, default=None - Array with additive offsets. - sample_weight : array-like, shape (n_samples,), optional, default=None - Individual weights for each sample. dispersion : float, optional, default=None The dispersion parameter. Estimated if absent. robust : boolean, optional, default=None @@ -1647,9 +1654,9 @@ def wald_test( r=r, X=X, y=y, - mu=mu, - offset=offset, sample_weight=sample_weight, + offset=offset, + mu=mu, dispersion=dispersion, robust=robust, clusters=clusters, @@ -1663,9 +1670,9 @@ def wald_test( values=r, X=X, y=y, - mu=mu, - offset=offset, sample_weight=sample_weight, + offset=offset, + mu=mu, dispersion=dispersion, robust=robust, clusters=clusters, @@ -1679,9 +1686,9 @@ def wald_test( values=r, X=X, y=y, - mu=mu, - offset=offset, sample_weight=sample_weight, + offset=offset, + mu=mu, dispersion=dispersion, robust=robust, clusters=clusters, @@ -1696,9 +1703,9 @@ def wald_test( formula=formula, X=X, y=y, - mu=mu, - offset=offset, sample_weight=sample_weight, + offset=offset, + mu=mu, dispersion=dispersion, robust=robust, clusters=clusters, @@ -1714,9 +1721,9 @@ def _wald_test_matrix( r: Optional[np.ndarray] = None, X=None, y=None, - mu=None, - offset=None, sample_weight=None, + offset=None, + mu=None, dispersion=None, robust=None, clusters: np.ndarray = None, @@ -1744,12 +1751,12 @@ def _wald_test_matrix( y : array-like, shape (n_samples,), optional Target values. Can be omitted if a covariance matrix has already been computed. - mu : array-like, optional, default=None - Array with predictions. Estimated if absent. - offset : array-like, optional, default=None - Array with additive offsets. sample_weight : array-like, shape (n_samples,), optional, default=None Individual weights for each sample. + offset : array-like, optional, default=None + Array with additive offsets. + mu : array-like, optional, default=None + Array with predictions. Estimated if absent. dispersion : float, optional, default=None The dispersion parameter. Estimated if absent. robust : boolean, optional, default=None @@ -1774,9 +1781,9 @@ def _wald_test_matrix( covariance_matrix = self.covariance_matrix( X=X, y=y, - mu=mu, - offset=offset, sample_weight=sample_weight, + offset=offset, + mu=mu, dispersion=dispersion, robust=robust, clusters=clusters, @@ -1822,9 +1829,9 @@ def _wald_test_feature_names( values: Optional[Sequence] = None, X=None, y=None, - mu=None, - offset=None, sample_weight=None, + offset=None, + mu=None, dispersion=None, robust=None, clusters: np.ndarray = None, @@ -1849,12 +1856,12 @@ def _wald_test_feature_names( y : array-like, shape (n_samples,), optional Target values. Can be omitted if a covariance matrix has already been computed. - mu : array-like, optional, default=None - Array with predictions. Estimated if absent. - offset : array-like, optional, default=None - Array with additive offsets. sample_weight : array-like, shape (n_samples,), optional, default=None Individual weights for each sample. + offset : array-like, optional, default=None + Array with additive offsets. 
+ mu : array-like, optional, default=None + Array with predictions. Estimated if absent. dispersion : float, optional, default=None The dispersion parameter. Estimated if absent. robust : boolean, optional, default=None @@ -1906,9 +1913,9 @@ def _wald_test_feature_names( r=r, X=X, y=y, - mu=mu, - offset=offset, sample_weight=sample_weight, + offset=offset, + mu=mu, dispersion=dispersion, robust=robust, clusters=clusters, @@ -1921,9 +1928,9 @@ def _wald_test_formula( formula: str, X=None, y=None, - mu=None, - offset=None, sample_weight=None, + offset=None, + mu=None, dispersion=None, robust=None, clusters: np.ndarray = None, @@ -1945,12 +1952,12 @@ def _wald_test_formula( y : array-like, shape (n_samples,), optional Target values. Can be omitted if a covariance matrix has already been computed. - mu : array-like, optional, default=None - Array with predictions. Estimated if absent. - offset : array-like, optional, default=None - Array with additive offsets. sample_weight : array-like, shape (n_samples,), optional, default=None Individual weights for each sample. + offset : array-like, optional, default=None + Array with additive offsets. + mu : array-like, optional, default=None + Array with predictions. Estimated if absent. dispersion : float, optional, default=None The dispersion parameter. Estimated if absent. robust : boolean, optional, default=None @@ -1986,9 +1993,9 @@ def _wald_test_formula( r=r, X=X, y=y, - mu=mu, - offset=offset, sample_weight=sample_weight, + offset=offset, + mu=mu, dispersion=dispersion, robust=robust, clusters=clusters, @@ -2002,9 +2009,9 @@ def _wald_test_term_names( values: Optional[Sequence] = None, X=None, y=None, - mu=None, - offset=None, sample_weight=None, + offset=None, + mu=None, dispersion=None, robust=None, clusters: np.ndarray = None, @@ -2034,12 +2041,12 @@ def _wald_test_term_names( y : array-like, shape (n_samples,), optional Target values. Can be omitted if a covariance matrix has already been computed. - mu : array-like, optional, default=None - Array with predictions. Estimated if absent. - offset : array-like, optional, default=None - Array with additive offsets. sample_weight : array-like, shape (n_samples,), optional (default=None) Individual weights for each sample. + offset : array-like, optional, default=None + Array with additive offsets. + mu : array-like, optional, default=None + Array with predictions. Estimated if absent. dispersion : float, optional, default=None The dispersion parameter. Estimated if absent. robust : boolean, optional, default=None @@ -2101,9 +2108,9 @@ def _wald_test_term_names( r=r, X=X, y=y, - mu=mu, - offset=offset, sample_weight=sample_weight, + offset=offset, + mu=mu, dispersion=dispersion, robust=robust, clusters=clusters, @@ -2115,9 +2122,10 @@ def std_errors( self, X=None, y=None, - mu=None, - offset=None, sample_weight=None, + offset=None, + *, + mu=None, dispersion=None, robust=None, clusters: np.ndarray = None, @@ -2138,12 +2146,12 @@ def std_errors( y : array-like, shape (n_samples,), optional Target values. Can be omitted if a covariance matrix has already been computed. - mu : array-like, optional, default=None - Array with predictions. Estimated if absent. - offset : array-like, optional, default=None - Array with additive offsets. sample_weight : array-like, shape (n_samples,), optional, default=None Individual weights for each sample. + offset : array-like, optional, default=None + Array with additive offsets. + mu : array-like, optional, default=None + Array with predictions. Estimated if absent. 
dispersion : float, optional, default=None The dispersion parameter. Estimated if absent. robust : boolean, optional, default=None @@ -2172,9 +2180,9 @@ def std_errors( self.covariance_matrix( X=X, y=y, - mu=mu, - offset=offset, sample_weight=sample_weight, + offset=offset, + mu=mu, dispersion=dispersion, robust=robust, clusters=clusters, @@ -2188,9 +2196,10 @@ def covariance_matrix( self, X=None, y=None, - mu=None, - offset=None, sample_weight=None, + offset=None, + *, + mu=None, dispersion=None, robust=None, clusters: Optional[np.ndarray] = None, @@ -2480,6 +2489,7 @@ def score( y: ShapedArrayLike, sample_weight: Optional[ArrayLike] = None, offset: Optional[ArrayLike] = None, + *, context: Optional[Union[int, Mapping[str, Any]]] = 0, ): """Compute :math:`D^2`, the percentage of deviance explained. @@ -3238,6 +3248,7 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): def __init__( self, + *, alpha=None, l1_ratio=0, P1="identity", @@ -3362,6 +3373,7 @@ def fit( y: Optional[ArrayLike] = None, sample_weight: Optional[ArrayLike] = None, offset: Optional[ArrayLike] = None, + *, store_covariance_matrix: bool = False, clusters: Optional[np.ndarray] = None, # TODO: take out weights_sum (or use it properly) @@ -3707,6 +3719,7 @@ def aic( X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None, + *, context: Optional[Union[int, Mapping[str, Any]]] = 0, ): """ @@ -3745,6 +3758,7 @@ def aicc( X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None, + *, context: Optional[Union[int, Mapping[str, Any]]] = 0, ): """ @@ -3790,6 +3804,7 @@ def bic( X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None, + *, context: Optional[Union[int, Mapping[str, Any]]] = 0, ): """ diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index 31fd58df..c84fb6f3 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -322,6 +322,7 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase): def __init__( self, + *, l1_ratio=0, P1="identity", P2="identity", @@ -424,6 +425,7 @@ def fit( y: ArrayLike, sample_weight: Optional[ArrayLike] = None, offset: Optional[ArrayLike] = None, + *, store_covariance_matrix: bool = False, clusters: Optional[np.ndarray] = None, context: Optional[Union[int, Mapping[str, Any]]] = None, @@ -531,7 +533,7 @@ def fit( else: _stype = ["csc", "csr"] - def fit_path( + def _fit_path( self, train_idx, test_idx, @@ -665,7 +667,7 @@ def _get_deviance(coef): return intercept_path_, coef_path_, deviance_path_ jobs = ( - delayed(fit_path)( + delayed(_fit_path)( self, train_idx=train_idx, test_idx=test_idx, diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 08080212..4645f2c2 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -616,7 +616,9 @@ def test_get_diagnostics( glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept, solver=solver) res = glm.fit(X, y) - diagnostics = res.get_formatted_diagnostics(full_report, custom_columns) + diagnostics = res.get_formatted_diagnostics( + full_report=full_report, custom_columns=custom_columns + ) if solver in ("lbfgs", "trust-constr"): assert diagnostics == "solver does not report diagnostics" else: @@ -678,7 +680,7 @@ def test_report_diagnostics( f = io.StringIO() with redirect_stdout(f): - res.report_diagnostics(full_report, custom_columns) + res.report_diagnostics(full_report=full_report, custom_columns=custom_columns) printed = f.getvalue() # Something should be printed assert len(printed) > 0 From b72379aa1be1c7ecdaca0c606bcbdad8726ac0d1 Mon Sep 17 
00:00:00 2001 From: Matthias Schmidtblaicher Date: Tue, 20 Feb 2024 19:35:33 +0100 Subject: [PATCH 40/63] fix import --- src/glum/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glum/_util.py b/src/glum/_util.py index 451dd9e8..b27d4fd8 100644 --- a/src/glum/_util.py +++ b/src/glum/_util.py @@ -1,6 +1,6 @@ import logging -from collections.abc import Sequence import warnings +from collections.abc import Sequence from functools import wraps from typing import Union From 1978e122e074cdbc5c3886293a8c2dbf551c57e6 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Tue, 20 Feb 2024 19:47:34 +0100 Subject: [PATCH 41/63] clean up changelog --- CHANGELOG.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5a6532d5..81599838 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -24,6 +24,7 @@ Changelog - Added more options for treating missing values in categorical columns. They can either raise a `ValueError` (`"fail"`), be treated as all-zero indicators (`"zero"`) or represented as a new category (`"convert"`). - `meth:GeneralizedLinearRegressor.wald_test` can now perform tests based on a formula string and term names. + 2.7.0 - 2024-02-19 ------------------ @@ -57,7 +58,7 @@ Changelog - When computing the covariance matrix, check whether the design matrix is ill-conditioned for all types of input. Furthermore, do it in a more efficient way. - Pin ``tabmat<4.0.0`` (the new release will bring breaking changes). -- Added the option to specify models using Wilkinson-formulas. + 2.5.2 - 2023-06-02 ------------------ @@ -90,6 +91,7 @@ Changelog :class:`~glum.GeneralizedLinearRegressor` and :class:`~glum.GeneralizedLinearRegressorCV` to ``'negative.binomial'``. + 2.4.1 - 2023-03-14 ------------------ From e948e6a5adb8c5b611ac34d8699e664e6b62e6d5 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Mon, 26 Feb 2024 11:58:31 +0000 Subject: [PATCH 42/63] Restructure distributions (#768) --- CHANGELOG.rst | 6 +- pyproject.toml | 2 +- src/glum/_distribution.py | 1114 +++++++++++++++++++------------- src/glum/_functions.pyx | 92 ++- src/glum/_glm.py | 73 +-- src/glum/_link.py | 506 +++++---------- tests/glm/test_distribution.py | 34 +- tests/glm/test_link.py | 14 +- 8 files changed, 963 insertions(+), 878 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 81599838..916cd007 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -15,14 +15,18 @@ Changelog - All arguments to :class:`~glum.GeneralizedLinearRegressorBase`, :class:`~glum.GeneralizedLinearRegressor`, and :class:`GeneralizedLinearRegressorCV` are now keyword-only. - All arguments to public methods of :class:`~glum.GeneralizedLinearRegressorBase`, :class:`~glum.GeneralizedLinearRegressor` or :class:`GeneralizedLinearRegressorCV` except `X`, `y`, `sample_weight`, and `offset` are now keyword-only. - :class:`~glum.GeneralizedLinearRegressor`'s default value for `alpha` is now `0`, i.e. no regularization. +- :class:`~glum.GammaDistribution`, :class:`~glum.InverseGaussianDistribution`, :class:`~glum.NormalDistribution` and :class:`~glum.PoissonDistribution` no longer inherit from :class:`~glum.TweedieDistribution`. +- The power parameter of :class:`~glum.TweedieLink` has been renamed from ``p`` to ``power``, in line with :class:`~glum.TweedieDistribution`. +- :class:`~glum.TweedieLink` no longer instantiates :class:`~glum.IdentityLink` or :class:`~glum.LogLink` for ``power=0`` and ``power=1``, respectively. 
On the other hand, :class:`~glum.TweedieLink` is now compatible with ``power=0`` and ``power=1``. **New features:** - Added a formula interface for specifying models. -- Improved feature name handling. Feature names are now created for non-pandas input matrices, too. Furthermore, the format of categorical features can be specified by the user. +- Improved feature name handling. Feature names are now created for non-pandas input matrices too. Furthermore, the format of categorical features can be specified by the user. - Term names are now stored in the model's attributes. This is useful for categorical features, where they refer to the whole variable, not just single levels. - Added more options for treating missing values in categorical columns. They can either raise a `ValueError` (`"fail"`), be treated as all-zero indicators (`"zero"`) or represented as a new category (`"convert"`). - `meth:GeneralizedLinearRegressor.wald_test` can now perform tests based on a formula string and term names. +- :class:`~glum.InverseGaussianDistribution` gains a :meth:`~glum.InverseGaussianDistribution.log_likelihood` method. 2.7.0 - 2024-02-19 diff --git a/pyproject.toml b/pyproject.toml index a69a904a..fc6dc0a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ default_section = 'THIRDPARTY' skip = [ "*-win32", "*-manylinux_i686", - "pp*", + "pp*", "*-musllinux_*", "cp36*", "cp37*", diff --git a/src/glum/_distribution.py b/src/glum/_distribution.py index edfd9136..f7facbd5 100644 --- a/src/glum/_distribution.py +++ b/src/glum/_distribution.py @@ -14,6 +14,10 @@ gamma_log_eta_mu_deviance, gamma_log_likelihood, gamma_log_rowwise_gradient_hessian, + inv_gaussian_deviance, + inv_gaussian_log_eta_mu_deviance, + inv_gaussian_log_likelihood, + inv_gaussian_log_rowwise_gradient_hessian, negative_binomial_deviance, negative_binomial_log_eta_mu_deviance, negative_binomial_log_likelihood, @@ -38,45 +42,23 @@ class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). - The PDF of :math:`Y \sim \mathrm{EDM}(\mu, \phi)` is given by + The PDF of :math:`Y \sim \mathrm{EDM}(\theta, \phi)` is given by .. math:: p(y \mid \theta, \phi) - &= c(y, \phi) \exp((\theta y - A(\theta)_ / \phi) \\ - &= \tilde{c}(y, \phi) \exp(-d(y, \mu) / (2\phi)) + &= \exp \left(\frac{y \theta - b(\theta)}{\phi / w} + c(y; w / \phi) \right), - with mean :math:`\mathrm{E}(Y) = A'(\theta) = \mu`, variance - :math:`\mathrm{var}(Y) = \phi \cdot v(\mu)`, unit variance - :math:`v(\mu)` and unit deviance :math:`d(y, \mu)`. + where :math:`\theta` is the scale parameter, :math:`\phi` is the dispersion + parameter, :math:`w` is a given weight, :math:`b` is the cumulant function + and :math:`c` is a normalization term. - Properties - ---------- - lower_bound - upper_bound - include_lower_bound - include_upper_bound - - Methods - ------- - in_y_range - unit_variance - unit_variance_derivative - variance - variance_derivative - unit_deviance - unit_deviance_derivative - deviance - deviance_derivative - starting_mu - - _mu_deviance_derivative - eta_mu_deviance - gradient_hessian + It can be shown that :math:`\mathrm{E}(Y) = b'(\theta)` and + :math:`\mathrm{var}(Y) = b''(\theta) \times \phi / w`. References ---------- - https://en.wikipedia.org/wiki/Exponential_dispersion_model. + < https://en.wikipedia.org/wiki/Exponential_dispersion_model >. 
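The rewritten docstring defines the EDM through its cumulant function :math:`b`. As a concrete check of the quoted identities (not part of the patch), the Poisson case with :math:`w = \phi = 1` works out as follows::

    \begin{aligned}
    p(y \mid \theta, \phi = 1)
      &= \exp\bigl(y\,\theta - b(\theta) + c(y)\bigr),
      \qquad b(\theta) = e^{\theta},\quad c(y) = -\log(y!), \\
    \mathrm{E}(Y) &= b'(\theta) = e^{\theta} = \mu, \qquad
    \mathrm{var}(Y) = b''(\theta) = e^{\theta} = \mu = v(\mu).
    \end{aligned}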
""" @property @@ -104,51 +86,41 @@ def include_upper_bound(self) -> bool: pass def in_y_range(self, x) -> np.ndarray: - """Return ``True`` if ``x`` is in the valid range of the EDM. - - Parameters - ---------- - x : array-like, shape (n_samples,) - Target values. - - Returns - ------- - np.ndarray - """ + """Return ``True`` if ``x`` is in the valid range of the EDM.""" if self.include_lower_bound: - if self.include_upper_bound: - return np.logical_and( - np.greater_equal(x, self.lower_bound), - np.less_equal(x, self.upper_bound), - ) - else: - return np.logical_and( - np.greater_equal(x, self.lower_bound), np.less(x, self.upper_bound) - ) + lb_op = np.greater_equal else: - if self.include_upper_bound: - return np.logical_and( - np.greater(x, self.lower_bound), np.less_equal(x, self.upper_bound) - ) - else: - return np.logical_and( - np.greater(x, self.lower_bound), np.less(x, self.upper_bound) - ) + lb_op = np.greater + + if self.include_upper_bound: + ub_op = np.less_equal + else: + ub_op = np.less + + return lb_op(x, self.lower_bound) & ub_op(x, self.upper_bound) + + def to_tweedie(self, safe=True): + """Return the Tweedie representation of a distribution if it exists.""" + if hasattr(self, "__tweedie_repr__"): + return self.__tweedie_repr__() + if safe: + raise ValueError("This distribution has no Tweedie representation.") + return None @abstractmethod def unit_variance(self, mu): - r"""Compute the unit variance function. + r"""Compute the unit variance. - The unit variance :math:`v(\mu)` determines the variance as a function - of the mean :math:`\mu` by - :math:`\mathrm{var}(y_i) = (\phi / s_i) \times v(\mu_i)`. It can - also be derived from the unit deviance :math:`d(y, \mu)` as + The unit variance, :math:`v(\mu) \equiv b''((b')^{-1} (\mu))`, + determines the variance as a function of the mean :math:`\mu` by + :math:`\mathrm{var}(y_i) = v(\mu_i) \times \phi / w_i`. It can also be + derived from the unit deviance :math:`d(y, \mu)` as .. math:: - v(\mu) = \frac{2}{\frac{\partial^2 d(y, \mu)}{\partial\mu^2}}\big|_{y=\mu}. + v(\mu) = 2 \div \frac{\partial^2 d(y, \mu)}{\partial\mu^2} \big| _{y=\mu}. - See also :func:`variance`. + See also :meth:`~ExponentialDispersionModel.variance`. Parameters ---------- @@ -161,8 +133,6 @@ def unit_variance(self, mu): def unit_variance_derivative(self, mu): r"""Compute the derivative of the unit variance with respect to ``mu``. - Return :math:`v'(\mu)`. - Parameters ---------- mu : array-like, shape (n_samples,) @@ -170,12 +140,12 @@ def unit_variance_derivative(self, mu): """ pass - def variance(self, mu: np.ndarray, dispersion=1, sample_weight=1) -> np.ndarray: + def variance(self, mu, dispersion=1, sample_weight=1) -> np.ndarray: r"""Compute the variance function. - The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i, \phi / s_i)` is - :math:`\mathrm{var}(Y_i) = (\phi / s_i) * v(\mu_i)`, with unit variance - :math:`v(\mu)` and weights :math:`s_i`. + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i, \phi / w_i)` takes + the form :math:`v(\mu_i) \times \phi / w_i`, where :math:`v(\mu)` is the + unit variance and :math:`w_i` are weights. Parameters ---------- @@ -198,8 +168,8 @@ def variance_derivative(self, mu, dispersion=1, sample_weight=1): r"""Compute the derivative of the variance with respect to ``mu``. The derivative of the variance is equal to - :math:`(\phi / s_i) * v'(\mu_i)`, where :math:`v(\mu)` is the unit - variance and :math:`s_i` are weights. 
+ :math:`v'(\mu_i) \times \phi / w_i`, where :math:`v(\mu)` is the unit + variance and :math:`w_i` are weights. Parameters ---------- @@ -222,8 +192,10 @@ def variance_derivative(self, mu, dispersion=1, sample_weight=1): def unit_deviance(self, y, mu): r"""Compute the unit deviance. - In terms of the log likelihood :math:`L`, the unit deviance is - :math:`-2\phi\times [L(y, \mu, \phi) - L(y, y, \phi)].` + In terms of the unit log likelihood :math:`\ell`, the unit deviance is + :math:`2 [\ell(y_i, y_i, \phi) - \ell(y_i, \mu, \phi)]`, i.e. twice the + difference between the log likelihood of a saturated model (with one + parameter per observation) and the model at hand. Parameters ---------- @@ -239,7 +211,7 @@ def unit_deviance_derivative(self, y, mu): r"""Compute the derivative of the unit deviance with respect to ``mu``. The derivative of the unit deviance is given by - :math:`-2 \times (y - \mu) / v(\mu)`, where :math:`v(\mu)` is the unit + :math:`2 \times (\mu - y) / v(\mu)`, where :math:`v(\mu)` is the unit variance. Parameters @@ -256,13 +228,14 @@ def unit_deviance_derivative(self, y, mu): """ return -2 * (y - mu) / self.unit_variance(mu) - def deviance(self, y, mu, sample_weight=1): + def deviance(self, y, mu, sample_weight=1) -> float: r"""Compute the deviance. - The deviance is a weighted sum of the unit deviances, - :math:`\sum_i s_i \times d(y_i, \mu_i)`, where :math:`d(y, \mu)` is the - unit deviance and :math:`s` are weights. In terms of the log likelihood, - it is :math:`-2\phi \times [L(y, \mu, \phi / s) - L(y, y, \phi / s)]`. + The deviance is a weighted sum of the unit deviances. In terms of the + unit log likelihood :math:`\ell`, it equals + :math:`2 \sum_i [\ell(y_i, y_i, \phi) - \ell(y_i, \mu, \phi)]`, + i.e. twice the difference between the log likelihood of a saturated + model (with one parameter per observation) and the model at hand. Parameters ---------- @@ -273,11 +246,7 @@ def deviance(self, y, mu, sample_weight=1): Predicted mean. sample_weight : array-like, shape (n_samples,), optional (default=1) - Weights or exposure to which variance is inversely proportional. - - Returns - ------- - float + Weights or exposure to which the variance is inversely proportional. """ if sample_weight is None: return np.sum(self.unit_deviance(y, mu)) @@ -306,46 +275,41 @@ def deviance_derivative(self, y, mu, sample_weight=1): def _mu_deviance_derivative( self, - coef: np.ndarray, - X, - y: np.ndarray, - sample_weight: np.ndarray, + coef, + X: Union[MatrixBase, StandardizedMatrix], + y, + sample_weight, link: Link, - offset: np.ndarray = None, - ) -> tuple[np.ndarray, np.ndarray]: - """Compute ``mu`` and the derivative of the deviance \ - with respect to coefficients.""" + offset=None, + ): + """Compute ``mu`` and the derivative of the deviance with respect to coefficients.""" lin_pred = _safe_lin_pred(X, coef, offset) mu = link.inverse(lin_pred) d1 = link.inverse_derivative(lin_pred) temp = d1 * self.deviance_derivative(y, mu, sample_weight) + if coef.size == X.shape[1] + 1: devp = np.concatenate(([temp.sum()], temp @ X)) else: devp = temp @ X # same as X.T @ temp + return mu, devp def eta_mu_deviance( self, link: Link, factor: float, - cur_eta: np.ndarray, - X_dot_d: np.ndarray, - y: np.ndarray, - sample_weight: np.ndarray, - ): - """ - Compute ``eta``, ``mu`` and the deviance. - - Compute: - * the linear predictor, ``eta``, as ``cur_eta + factor * X_dot_d``; - * the link-function-transformed prediction, ``mu``; - * the deviance.
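The ``unit_deviance_derivative`` identity above, :math:`2 (\mu - y) / v(\mu)`, can be verified numerically. A sketch for the normal case, where :math:`d(y, \mu) = (y - \mu)^2` and :math:`v(\mu) = 1` (illustrative only)::

    import numpy as np
    from glum import NormalDistribution

    dist = NormalDistribution()
    y, mu, eps = np.array([1.0, 2.0]), np.array([0.5, 3.0]), 1e-6

    # central finite difference of the unit deviance with respect to mu
    numeric = (
        dist.unit_deviance(y, mu + eps) - dist.unit_deviance(y, mu - eps)
    ) / (2 * eps)
    analytic = dist.unit_deviance_derivative(y, mu)  # -2 * (y - mu) / v(mu)

    assert np.allclose(numeric, analytic, atol=1e-5)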
+ cur_eta, + X_dot_d, + y, + sample_weight, + ) -> tuple[np.ndarray, np.ndarray, float]: + """Compute ``eta``, ``mu`` and the deviance. Returns ------- numpy.ndarray, shape (X.shape[0],) - The linear predictor, ``eta``. + The linear predictor, ``eta``, as ``cur_eta + factor * X_dot_d``. numpy.ndarray, shape (X.shape[0],) The link-function-transformed prediction, ``mu``. float @@ -355,24 +319,25 @@ def eta_mu_deviance( # avoiding allocating new arrays for every line search loop eta_out = np.empty_like(cur_eta) mu_out = np.empty_like(cur_eta) + deviance = self._eta_mu_deviance( link, factor, cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out ) + return eta_out, mu_out, deviance def _eta_mu_deviance( self, link: Link, factor: float, - cur_eta: np.ndarray, - X_dot_d: np.ndarray, - y: np.ndarray, - sample_weight: np.ndarray, - eta_out: np.ndarray, - mu_out: np.ndarray, - ): - """ - Update ``eta`` and ``mu`` and compute the deviance. + cur_eta, + X_dot_d, + y, + sample_weight, + eta_out, + mu_out, + ) -> float: + """Update ``eta`` and ``mu`` and compute the deviance. This is a default implementation that should work for all valid distributions and link functions. To implement a custom optimized @@ -382,6 +347,7 @@ def _eta_mu_deviance( Returns ------- float + The deviance. """ eta_out[:] = cur_eta + factor * X_dot_d mu_out[:] = link.inverse(eta_out) @@ -390,17 +356,16 @@ def _eta_mu_deviance( def rowwise_gradient_hessian( self, link: Link, - coef: np.ndarray, + coef, dispersion, X: Union[MatrixBase, StandardizedMatrix], - y: np.ndarray, - sample_weight: np.ndarray, - eta: np.ndarray, - mu: np.ndarray, - offset: np.ndarray = None, + y, + sample_weight, + eta, + mu, + offset=None, ): - """ - Compute the gradient and negative Hessian of the log likelihood row-wise. + """Compute the gradient and negative Hessian of the log likelihood row-wise. Returns ------- @@ -411,6 +376,7 @@ def rowwise_gradient_hessian( """ gradient_rows = np.empty_like(mu) hessian_rows = np.empty_like(mu) + self._rowwise_gradient_hessian( link, y, sample_weight, eta, mu, gradient_rows, hessian_rows ) @@ -422,8 +388,7 @@ def rowwise_gradient_hessian( def _rowwise_gradient_hessian( self, link, y, sample_weight, eta, mu, gradient_rows, hessian_rows ): - """ - Update ``gradient_rows`` and ``hessian_rows`` in place. + """Update ``gradient_rows`` and ``hessian_rows`` in place. This is a default implementation that should work for all valid distributions and link functions. To implement a custom optimized @@ -551,10 +516,6 @@ def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> flo method = {'pearson', 'deviance'}, optional (default='pearson') Whether to base the estimate on the Pearson residuals or the deviance. - - Returns - ------- - float """ y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) @@ -580,24 +541,27 @@ class TweedieDistribution(ExponentialDispersionModel): A Tweedie distribution with mean :math:`\mu = \mathrm{E}(Y)` is uniquely defined by its mean-variance relationship - :math:`\mathrm{var}(Y) \propto \mu^{\mathrm{power}}`. + :math:`\mathrm{var}(Y) \propto \mu^{\mathrm{p}}`. 
Special cases are: - ====== ================ - Power Distribution - ====== ================ - 0 Normal - 1 Poisson - (1, 2) Compound Poisson - 2 Gamma - 3 Inverse Gaussian - ====== ================ + ====== ================ ============ + Power Distribution Support + ====== ================ ============ + 0 Normal ``(-∞, +∞)`` + 1 Poisson ``[0, +∞)`` + (1, 2) Compound Poisson ``[0, +∞)`` + 2 Gamma ``(0, +∞)`` + 3 Inverse Gaussian ``(0, +∞)`` + ====== ================ ============ + + See the documentation of the superclass, + :class:`~glum.ExponentialDispersionModel`, for details. Parameters ---------- power : float, optional (default=0) - The variance power of the `unit_variance` + The variance power of the ``unit_variance`` :math:`v(\mu) = \mu^{\mathrm{power}}`. For :math:`0 < \mathrm{power} < 1`, no distribution exists. """ @@ -612,9 +576,11 @@ def __init__(self, power=0): def __eq__(self, other): # noqa D return isinstance(other, TweedieDistribution) and (self.power == other.power) + def __tweedie_repr__(self): # noqa D + return self.__class__(self.power) + @property - def lower_bound(self) -> float: - """Return the lowest value of ``y`` allowed.""" + def lower_bound(self) -> float: # noqa D if self.power <= 0: return -np.inf if self.power >= 1: @@ -622,8 +588,7 @@ def lower_bound(self) -> float: raise ValueError @property - def include_lower_bound(self) -> bool: - """Return whether ``lower_bound`` is allowed as a value of ``y``.""" + def include_lower_bound(self) -> bool: # noqa D if self.power <= 0: return False if (self.power >= 1) and (self.power < 2): @@ -639,97 +604,66 @@ def power(self) -> float: @power.setter def power(self, power): - if not isinstance(power, (int, float)): - raise TypeError(f"power must be an int or float, input was {power}") + + if not isinstance(power, (int, float, np.number)): + raise TypeError(f"The power parameter must be numeric; got {power}.") if (power > 0) and (power < 1): - raise ValueError("For 0 < p < 1, no distribution exists.") + raise ValueError("For `0 < power < 1`, no distribution exists.") # Prevents upcasting when working with 32-bit data - self._power = np.float32(power) + self._power = power if isinstance(power, int) else np.float32(power) - def unit_variance(self, mu: np.ndarray) -> np.ndarray: - """Compute the unit variance of a Tweedie distribution ``v(mu) = mu^power``. - - Parameters - ---------- - mu : array-like, shape (n_samples,) - Predicted mean. - - Returns - ------- - numpy.ndarray, shape (n_samples,) - """ + def unit_variance(self, mu): # noqa D p = self.power # noqa: F841 return numexpr.evaluate("mu ** p") - def unit_variance_derivative(self, mu: np.ndarray) -> np.ndarray: - r"""Compute the derivative of the unit variance of a Tweedie distribution. - - Equation: :math:`v(\mu) = p \times \mu^{(p-1)}`. - - Parameters - ---------- - mu : array-like, shape (n_samples,) - Predicted mean. - - Returns - ------- - numpy.ndarray, shape (n_samples,) - """ + def unit_variance_derivative(self, mu): # noqa D p = self.power # noqa: F841 return numexpr.evaluate("p * mu ** (p - 1)") - def deviance(self, y, mu, sample_weight=None) -> float: - """Compute the deviance. - - Parameters - ---------- - y : array-like, shape (n_samples,) - Target values.
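Assuming the reconstructed setter hunk above, the stricter validation behaves as follows (an illustrative sketch, not part of the patch)::

    import numpy as np
    from glum import TweedieDistribution

    TweedieDistribution(1.5)              # compound Poisson: fine
    TweedieDistribution(np.float32(2.0))  # np.number values are now accepted

    try:
        TweedieDistribution(0.5)  # no Tweedie distribution exists for 0 < power < 1
    except ValueError as err:
        print(err)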
- """ - p = self.power y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) sample_weight = np.ones_like(y) if sample_weight is None else sample_weight # NOTE: the dispersion parameter is only necessary to convey # type information on account of a bug in Cython - if p == 0: + if self.power == 0: return normal_deviance(y, sample_weight, mu, dispersion=1.0) - if p == 1: + if self.power == 1: return poisson_deviance(y, sample_weight, mu, dispersion=1.0) - elif p == 2: + elif self.power == 2: return gamma_deviance(y, sample_weight, mu, dispersion=1.0) + elif self.power == 3: + return inv_gaussian_deviance(y, sample_weight, mu, dispersion=1.0) else: - return tweedie_deviance(y, sample_weight, mu, p=float(p)) + return tweedie_deviance(y, sample_weight, mu, p=float(self.power)) - def unit_deviance(self, y, mu): - """Get the deviance of each observation.""" - p = self.power - if p == 0: # Normal distribution + def unit_deviance(self, y, mu): # noqa D + + if self.power == 0: # normal distribution return (y - mu) ** 2 - if p == 1: # Poisson distribution + if self.power == 1: # Poisson distribution return 2 * (special.xlogy(y, y / mu) - y + mu) - elif p == 2: # Gamma distribution + elif self.power == 2: # Gamma distribution return 2 * (np.log(mu / y) + y / mu - 1) + elif self.power == 3: # inverse Gaussian distribution + return ((y / mu - 1) ** 2) / y else: - mu1mp = mu ** (1 - p) + mu1mp = mu ** (1 - self.power) return 2 * ( - (np.maximum(y, 0) ** (2 - p)) / ((1 - p) * (2 - p)) - - y * mu1mp / (1 - p) - + mu * mu1mp / (2 - p) + (np.maximum(y, 0) ** (2 - self.power)) + / ((1 - self.power) * (2 - self.power)) + - y * mu1mp / (1 - self.power) + + mu * mu1mp / (2 - self.power) ) def _rowwise_gradient_hessian( self, link, y, sample_weight, eta, mu, gradient_rows, hessian_rows ): f = None + if self.power == 0 and isinstance(link, IdentityLink): f = normal_identity_rowwise_gradient_hessian elif self.power == 1 and isinstance(link, LogLink): @@ -738,6 +672,8 @@ def _rowwise_gradient_hessian( f = gamma_log_rowwise_gradient_hessian elif 1 < self.power < 2 and isinstance(link, LogLink): f = partial(tweedie_log_rowwise_gradient_hessian, p=self.power) + elif self.power == 3: + f = partial(inv_gaussian_log_rowwise_gradient_hessian, p=self.power) if f is not None: return f(y, sample_weight, eta, mu, gradient_rows, hessian_rows) @@ -758,6 +694,7 @@ def _eta_mu_deviance( mu_out: np.ndarray, ): f = None + if self.power == 0 and isinstance(link, IdentityLink): f = normal_identity_eta_mu_deviance elif self.power == 1 and isinstance(link, LogLink): @@ -766,6 +703,8 @@ def _eta_mu_deviance( f = gamma_log_eta_mu_deviance elif 1 < self.power < 2 and isinstance(link, LogLink): f = partial(tweedie_log_eta_mu_deviance, p=self.power) + elif self.power == 3 and isinstance(link, LogLink): + f = partial(inv_gaussian_log_eta_mu_deviance, p=self.power) if f is not None: return f(cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out, factor) @@ -777,7 +716,7 @@ def _eta_mu_deviance( def log_likelihood(self, y, mu, sample_weight=None, dispersion=None) -> float: r"""Compute the log likelihood. - For ``1 < power < 2``, we use the series approximation by Dunn and Smyth + For ``1 < p < 2``, we use the series approximation by Dunn and Smyth (2005) to compute the normalization term. 
Parameters @@ -813,33 +752,14 @@ def log_likelihood(self, y, mu, sample_weight=None, dispersion=None) -> float: return tweedie_log_likelihood( y, sample_weight, mu, float(p), float(dispersion) ) + elif p == 3: + return inv_gaussian_log_likelihood(y, sample_weight, mu, float(dispersion)) else: raise NotImplementedError - def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> float: - r"""Estimate the dispersion parameter :math:`\phi`. - - Parameters - ---------- - y : array-like, shape (n_samples,) - Target values. - - mu : array-like, shape (n_samples,) - Predicted mean. - - sample_weight : array-like, shape (n_samples,), optional (default=None) - Weights or exposure to which variance is inversely proportional. - - ddof : int, optional (default=1) - Degrees of freedom consumed by the model for ``mu``. - - method = {'pearson', 'deviance'}, optional (default='pearson') - Whether to base the estimate on the Pearson residuals or the deviance. - - Returns - ------- - float - """ + def dispersion( # noqa D + self, y, mu, sample_weight=None, ddof=1, method="pearson" + ) -> float: p = self.power # noqa: F841 y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) @@ -856,163 +776,167 @@ def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> flo ) -class NormalDistribution(TweedieDistribution): - """Class for the Normal (a.k.a. Gaussian) distribution.""" +class NormalDistribution(ExponentialDispersionModel): + """Class for the normal (a.k.a. Gaussian) distribution. - def __init__(self): - super().__init__(power=0) + The normal distribution models outcomes ``y`` in ``(-∞, +∞)``. + See the documentation of the superclass, + :class:`~glum.ExponentialDispersionModel`, for details. + """ -class PoissonDistribution(TweedieDistribution): - """Class for the scaled Poisson distribution.""" + lower_bound = -np.inf + upper_bound = np.inf + include_lower_bound = False + include_upper_bound = False - def __init__(self): - super().__init__(power=1) + def __eq__(self, other): # noqa D + return isinstance(other, self.__class__) + def __tweedie_repr__(self): # noqa D + return TweedieDistribution(0) -class GammaDistribution(TweedieDistribution): - """Class for the Gamma distribution.""" + def unit_variance(self, mu) -> np.ndarray: # noqa D + return 1 if np.isscalar(mu) else np.ones_like(mu) - def __init__(self): - super().__init__(power=2) + def unit_variance_derivative(self, mu) -> np.ndarray: # noqa D + return 0 if np.isscalar(mu) else np.zeros_like(mu) + def deviance(self, y, mu, sample_weight=None) -> float: # noqa D -class InverseGaussianDistribution(TweedieDistribution): - """Class for the scaled Inverse Gaussian distribution.""" + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight - def __init__(self): - super().__init__(power=3) + # NOTE: the dispersion parameter is only necessary to convey + # type information on account of a bug in Cython + return normal_deviance(y, sample_weight, mu, dispersion=1.0) -class GeneralizedHyperbolicSecant(ExponentialDispersionModel): - """A class for the Generalized Hyperbolic Secant (GHS) distribution. + def unit_deviance(self, y, mu): # noqa D + return (y - mu) ** 2 - The GHS distribution is for targets ``y`` in ``(-∞, +∞)``. 
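The Pearson estimate in the condensed ``dispersion`` method is just the (weighted) sum of squared residuals over ``n - ddof``. Spelled out with plain numpy for the normal case and unit weights (illustrative only)::

    import numpy as np
    from glum import NormalDistribution

    y = np.array([1.0, 3.0, 2.0, 4.0])
    mu = np.array([2.0, 2.5, 2.0, 3.5])

    ddof = 1  # degrees of freedom consumed by the model for mu
    by_hand = ((y - mu) ** 2).sum() / (len(y) - ddof)

    assert np.isclose(NormalDistribution().dispersion(y, mu, ddof=ddof), by_hand)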
- """ + def _rowwise_gradient_hessian( + self, link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ): + if isinstance(link, IdentityLink): + return normal_identity_rowwise_gradient_hessian( + y, sample_weight, eta, mu, gradient_rows, hessian_rows + ) - lower_bound = -np.inf - upper_bound = np.inf - include_lower_bound = False - include_upper_bound = False + return super()._rowwise_gradient_hessian( + link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ) - def __eq__(self, other): # noqa D - return isinstance(other, self.__class__) + def _eta_mu_deviance( + self, + link: Link, + factor: float, + cur_eta, + X_dot_d, + y, + sample_weight, + eta_out, + mu_out, + ): + if isinstance(link, IdentityLink): + return normal_identity_eta_mu_deviance( + cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out, factor + ) - def unit_variance(self, mu: np.ndarray) -> np.ndarray: - """Get the unit-level expected variance. + return super()._eta_mu_deviance( + link, factor, cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out + ) - See superclass documentation. + def log_likelihood(self, y, mu, sample_weight=None, dispersion=None) -> float: + r"""Compute the log likelihood. Parameters ---------- - mu : array-like or float + y : array-like, shape (n_samples,) + Target values. - Returns - ------- - array-like - """ - return 1 + mu**2 + mu : array-like, shape (n_samples,) + Predicted mean. - def unit_variance_derivative(self, mu: np.ndarray) -> np.ndarray: - """Get the derivative of the unit variance. + sample_weight : array-like, shape (n_samples,), optional (default=1) + Sample weights. - See superclass documentation. + dispersion : float, optional (default=None) + Dispersion parameter :math:`\phi`. Estimated if ``None``. + """ + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight - Parameters - ---------- - mu : array-like or float + if dispersion is None: + dispersion = self.dispersion(y, mu, sample_weight) - Returns - ------- - array-like - """ - return 2 * mu + return normal_log_likelihood(y, sample_weight, mu, float(dispersion)) - def unit_deviance(self, y: np.ndarray, mu: np.ndarray) -> np.ndarray: - """Get the unit-level deviance. + def dispersion( # noqa D + self, y, mu, sample_weight=None, ddof=1, method="pearson" + ) -> float: + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) - See superclass documentation. + if method == "pearson": + formula = "(y - mu) ** 2" + if sample_weight is None: + return numexpr.evaluate(formula).sum() / (len(y) - ddof) + else: + formula = f"sample_weight * {formula}" + return numexpr.evaluate(formula).sum() / (sample_weight.sum() - ddof) - Parameters - ---------- - y : array-like - mu : array-like + return super().dispersion( + y, mu, sample_weight=sample_weight, ddof=ddof, method=method + ) - Returns - ------- - array-like - """ - return 2 * y * (np.arctan(y) - np.arctan(mu)) + np.log((1 + mu**2) / (1 + y**2)) +class PoissonDistribution(ExponentialDispersionModel): + """Class for the scaled Poisson distribution. -class BinomialDistribution(ExponentialDispersionModel): - """A class for the Binomial distribution. + The Poisson distribution models discrete outcomes ``y`` in ``[0, +∞)``. - The Binomial distribution is for targets ``y`` in ``[0, 1]``. + See the documentation of the superclass, + :class:`~glum.ExponentialDispersionModel`, for details. 
""" lower_bound = 0 - upper_bound = 1 + upper_bound = np.inf include_lower_bound = True - include_upper_bound = True + include_upper_bound = False def __eq__(self, other): # noqa D return isinstance(other, self.__class__) - def unit_variance(self, mu: np.ndarray) -> np.ndarray: - """Get the unit-level expected variance. + def __tweedie_repr__(self): # noqa D + return TweedieDistribution(1) - See superclass documentation. + def unit_variance(self, mu) -> np.ndarray: # noqa D + return mu - Parameters - ---------- - mu : array-like - - Returns - ------- - array-like - """ - return mu * (1 - mu) - - def unit_variance_derivative(self, mu): - """Get the derivative of the unit variance. - - See superclass documentation. - - Parameters - ---------- - mu : array-like or float - - Returns - ------- - array-like - """ - return 1 - 2 * mu + def unit_variance_derivative(self, mu) -> np.ndarray: # noqa D + return 1.0 if np.isscalar(mu) else np.ones_like(mu) - def unit_deviance(self, y: np.ndarray, mu: np.ndarray) -> np.ndarray: - """Get the unit-level deviance. + def deviance(self, y, mu, sample_weight=None) -> float: # noqa D + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight - See superclass documentation. + # NOTE: the dispersion parameter is only necessary to convey + # type information on account of a bug in Cython - Parameters - ---------- - y : array-like - mu : array-like + return poisson_deviance(y, sample_weight, mu, dispersion=1.0) - Returns - ------- - array-like - """ - # see Wooldridge and Papke (1996) for the fractional case - return -2 * (special.xlogy(y, mu) + special.xlogy(1 - y, 1 - mu)) + def unit_deviance(self, y, mu): + """Compute the unit deviance.""" + return 2 * (special.xlogy(y, y / mu) - y + mu) def _rowwise_gradient_hessian( self, link, y, sample_weight, eta, mu, gradient_rows, hessian_rows ): - if isinstance(link, LogitLink): - return binomial_logit_rowwise_gradient_hessian( + if isinstance(link, LogLink): + return poisson_log_rowwise_gradient_hessian( y, sample_weight, eta, mu, gradient_rows, hessian_rows ) + return super()._rowwise_gradient_hessian( link, y, sample_weight, eta, mu, gradient_rows, hessian_rows ) @@ -1021,23 +945,24 @@ def _eta_mu_deviance( self, link: Link, factor: float, - cur_eta: np.ndarray, - X_dot_d: np.ndarray, - y: np.ndarray, - sample_weight: np.ndarray, - eta_out: np.ndarray, - mu_out: np.ndarray, + cur_eta, + X_dot_d, + y, + sample_weight, + eta_out, + mu_out, ): - if isinstance(link, LogitLink): - return binomial_logit_eta_mu_deviance( + if isinstance(link, LogLink): + return poisson_log_eta_mu_deviance( cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out, factor ) + return super()._eta_mu_deviance( link, factor, cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out ) - def log_likelihood(self, y, mu, sample_weight=None, dispersion=1) -> float: - """Compute the log likelihood. + def log_likelihood(self, y, mu, sample_weight=None, dispersion=None) -> float: + r"""Compute the log likelihood. Parameters ---------- @@ -1050,14 +975,108 @@ def log_likelihood(self, y, mu, sample_weight=None, dispersion=1) -> float: sample_weight : array-like, shape (n_samples,), optional (default=1) Sample weights. - dispersion : float, optional (default=1) - Ignored. + dispersion : float, optional (default=None) + Dispersion parameter :math:`\phi`. Estimated if ``None``. 
""" - ll = special.xlogy(y, mu) + special.xlogy(1 - y, 1 - mu) - return np.sum(ll) if sample_weight is None else np.dot(ll, sample_weight) + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight - def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> float: - r"""Estimate the dispersion parameter :math:`\phi`. + # NOTE: the dispersion parameter is only necessary to convey + # type information on account of a bug in Cython + + return poisson_log_likelihood(y, sample_weight, mu, 1.0) + + def dispersion( # noqa D + self, y, mu, sample_weight=None, ddof=1, method="pearson" + ) -> float: + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + + if method == "pearson": + formula = "((y - mu) ** 2) / mu" + if sample_weight is None: + return numexpr.evaluate(formula).sum() / (len(y) - ddof) + else: + formula = f"sample_weight * {formula}" + return numexpr.evaluate(formula).sum() / (sample_weight.sum() - ddof) + + return super().dispersion( + y, mu, sample_weight=sample_weight, ddof=ddof, method=method + ) + + +class GammaDistribution(ExponentialDispersionModel): + """Class for the gamma distribution. + + The gamma distribution models outcomes ``y`` in ``(0, +∞)``. + + See the documentation of the superclass, + :class:`~glum.ExponentialDispersionModel`, for details. + """ + + lower_bound = 0 + upper_bound = np.inf + include_lower_bound = False + include_upper_bound = False + + def __eq__(self, other): # noqa D + return isinstance(other, self.__class__) + + def __tweedie_repr__(self): # noqa D + return TweedieDistribution(2) + + def unit_variance(self, mu) -> np.ndarray: # noqa D + return mu**2 + + def unit_variance_derivative(self, mu) -> np.ndarray: # noqa D + return 2 * mu + + def deviance(self, y, mu, sample_weight=None) -> float: # noqa D + + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight + + # NOTE: the dispersion parameter is only necessary to convey + # type information on account of a bug in Cython + + return gamma_deviance(y, sample_weight, mu, dispersion=1.0) + + def unit_deviance(self, y, mu): # noqa D + return 2 * (np.log(mu / y) + y / mu - 1) + + def _rowwise_gradient_hessian( + self, link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ): + if isinstance(link, LogLink): + return gamma_log_rowwise_gradient_hessian( + y, sample_weight, eta, mu, gradient_rows, hessian_rows + ) + + return super()._rowwise_gradient_hessian( + link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ) + + def _eta_mu_deviance( + self, + link: Link, + factor: float, + cur_eta, + X_dot_d, + y, + sample_weight, + eta_out, + mu_out, + ): + if isinstance(link, LogLink): + return gamma_log_eta_mu_deviance( + cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out, factor + ) + + return super()._eta_mu_deviance( + link, factor, cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out + ) + + def log_likelihood(self, y, mu, sample_weight=None, dispersion=None) -> float: + r"""Compute the log likelihood. Parameters ---------- @@ -1067,19 +1086,251 @@ def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> flo mu : array-like, shape (n_samples,) Predicted mean. - sample_weight : array-like, shape (n_samples,), optional (default=None) - Weights or exposure to which variance is inversely proportional. 
+ sample_weight : array-like, shape (n_samples,), optional (default=1) + Sample weights. - ddof : int, optional (default=1) - Degrees of freedom consumed by the model for ``mu``. + dispersion : float, optional (default=None) + Dispersion parameter :math:`\phi`. Estimated if ``None``. + """ + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight - method = {'pearson', 'deviance'}, optional (default='pearson') - Whether to base the estimate on the Pearson residuals or the deviance. + if dispersion is None: + dispersion = self.dispersion(y, mu, sample_weight) - Returns - ------- - float + return gamma_log_likelihood(y, sample_weight, mu, float(dispersion)) + + def dispersion( # noqa D + self, y, mu, sample_weight=None, ddof=1, method="pearson" + ) -> float: + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + + if method == "pearson": + formula = "((y - mu) ** 2) / (mu ** 2)" + if sample_weight is None: + return numexpr.evaluate(formula).sum() / (len(y) - ddof) + else: + formula = f"sample_weight * {formula}" + return numexpr.evaluate(formula).sum() / (sample_weight.sum() - ddof) + + return super().dispersion( + y, mu, sample_weight=sample_weight, ddof=ddof, method=method + ) + + +class InverseGaussianDistribution(ExponentialDispersionModel): + """Class for the inverse Gaussian distribution. + + The inverse Gaussian distribution models outcomes ``y`` in ``(0, +∞)``. + + See the documentation of the superclass, + :class:`~glum.ExponentialDispersionModel`, for details. + """ + + lower_bound = 0 + upper_bound = np.inf + include_lower_bound = False + include_upper_bound = False + + def __eq__(self, other): # noqa D + return isinstance(other, self.__class__) + + def __tweedie_repr__(self): # noqa D + return TweedieDistribution(3) + + def unit_variance(self, mu) -> np.ndarray: # noqa D + return mu**3 + + def unit_variance_derivative(self, mu) -> np.ndarray: # noqa D + return 3 * (mu**2) + + def deviance(self, y, mu, sample_weight=None) -> float: # noqa D + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight + + return tweedie_deviance(y, sample_weight, mu, p=3.0) + + def unit_deviance(self, y, mu): # noqa D + return numexpr.evaluate("y / (mu**2) + 1 / y - 2 / mu") + + def _rowwise_gradient_hessian( + self, link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ): + return super()._rowwise_gradient_hessian( + link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ) + + def _eta_mu_deviance( + self, + link: Link, + factor: float, + cur_eta, + X_dot_d, + y, + sample_weight, + eta_out, + mu_out, + ): + if isinstance(link, LogLink): + return tweedie_log_eta_mu_deviance( + cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out, factor, p=3.0 + ) + + return super()._eta_mu_deviance( + link, factor, cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out + ) + + def log_likelihood(self, y, mu, sample_weight=None, dispersion=None) -> float: + r"""Compute the log likelihood. + + Parameters + ---------- + y : array-like, shape (n_samples,) + Target values. + + mu : array-like, shape (n_samples,) + Predicted mean. + + sample_weight : array-like, shape (n_samples,), optional (default=1) + Sample weights. + + dispersion : float, optional (default=None) + Dispersion parameter :math:`\phi`. Estimated if ``None``. 
+ """ + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight + + if dispersion is None: + dispersion = self.dispersion(y, mu, sample_weight) + + return tweedie_log_likelihood(y, sample_weight, mu, 3.0, float(dispersion)) + + def dispersion( # noqa D + self, y, mu, sample_weight=None, ddof=1, method="pearson" + ) -> float: + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + + if method == "pearson": + formula = "((y - mu) ** 2) / (mu ** 3)" + if sample_weight is None: + return numexpr.evaluate(formula).sum() / (len(y) - ddof) + else: + formula = f"sample_weight * {formula}" + return numexpr.evaluate(formula).sum() / (sample_weight.sum() - ddof) + + return super().dispersion( + y, mu, sample_weight=sample_weight, ddof=ddof, method=method + ) + + +class GeneralizedHyperbolicSecant(ExponentialDispersionModel): + """A class for the Generalized Hyperbolic Secant (GHS) distribution. + + The GHS distribution models outcomes ``y`` in ``(-∞, +∞)``. + + See the documentation of the superclass, + :class:`~glum.ExponentialDispersionModel`, for details. + """ + + lower_bound = -np.inf + upper_bound = np.inf + include_lower_bound = False + include_upper_bound = False + + def __eq__(self, other): # noqa D + return isinstance(other, self.__class__) + + def unit_variance(self, mu) -> np.ndarray: # noqa D + return 1 + mu**2 + + def unit_variance_derivative(self, mu) -> np.ndarray: # noqa D + return 2 * mu + + def unit_deviance(self, y, mu) -> np.ndarray: # noqa D + return 2 * y * (np.arctan(y) - np.arctan(mu)) + np.log((1 + mu**2) / (1 + y**2)) + + +class BinomialDistribution(ExponentialDispersionModel): + """A class for the Binomial distribution. + + The Binomial distribution models outcomes ``y`` in ``[0, 1]``. + + See the documentation of the superclass, + :class:`~glum.ExponentialDispersionModel`, for details. + """ + + lower_bound = 0 + upper_bound = 1 + include_lower_bound = True + include_upper_bound = True + + def __eq__(self, other): # noqa D + return isinstance(other, self.__class__) + + def unit_variance(self, mu): # noqa D + return mu * (1 - mu) + + def unit_variance_derivative(self, mu): # noqa D + return 1 - 2 * mu + + def unit_deviance(self, y, mu): # noqa D + # see Wooldridge and Papke (1996) for the fractional case + return -2 * (special.xlogy(y, mu) + special.xlogy(1 - y, 1 - mu)) + + def _rowwise_gradient_hessian( + self, link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ): + if isinstance(link, LogitLink): + return binomial_logit_rowwise_gradient_hessian( + y, sample_weight, eta, mu, gradient_rows, hessian_rows + ) + return super()._rowwise_gradient_hessian( + link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ) + + def _eta_mu_deviance( + self, + link: Link, + factor: float, + cur_eta, + X_dot_d, + y, + sample_weight, + eta_out, + mu_out, + ): + if isinstance(link, LogitLink): + return binomial_logit_eta_mu_deviance( + cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out, factor + ) + + return super()._eta_mu_deviance( + link, factor, cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out + ) + + def log_likelihood(self, y, mu, sample_weight=None, dispersion=1) -> float: + """Compute the log likelihood. + + Parameters + ---------- + y : array-like, shape (n_samples,) + Target values. + + mu : array-like, shape (n_samples,) + Predicted mean. + + sample_weight : array-like, shape (n_samples,), optional (default=1) + Sample weights. 
+ + dispersion : float, optional (default=1) + Ignored. """ + ll = special.xlogy(y, mu) + special.xlogy(1 - y, 1 - mu) + return np.sum(ll) if sample_weight is None else np.dot(ll, sample_weight) + + def dispersion( # noqa D + self, y, mu, sample_weight=None, ddof=1, method="pearson" + ) -> float: y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) if method == "pearson": @@ -1098,14 +1349,14 @@ def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> flo class NegativeBinomialDistribution(ExponentialDispersionModel): r"""A class for the Negative Binomial distribution. - A Negative Binomial distribution with mean :math:`\mu = \mathrm{E}(Y)` is uniquely - defined by its mean-variance relationship + A negative binomial distribution with mean :math:`\mu = \mathrm{E}(Y)` is + uniquely defined by its mean-variance relationship :math:`\mathrm{var}(Y) \propto \mu + \theta * \mu^2`. Parameters ---------- theta : float, optional (default=1.0) - The dispersion parameter from `unit_variance` + The dispersion parameter from the ``unit_variance`` :math:`v(\mu) = \mu + \theta * \mu^2`. For :math:`\theta <= 0`, no distribution exists. @@ -1132,74 +1383,34 @@ def __eq__(self, other): # noqa D @property def theta(self) -> float: - """Return the Negative Binomial theta parameter.""" + """Return the negative binomial theta parameter.""" return self._theta @theta.setter def theta(self, theta): + if not isinstance(theta, (int, float)): - raise TypeError(f"theta must be an int or float, input was {theta}") + raise TypeError(f"Theta must be numeric; got {theta}.") if not theta > 0: - raise ValueError( - f"theta must be strictly positive number, input was {theta}" - ) + raise ValueError(f"Theta must be strictly positive; got {theta}.") # Prevents upcasting when working with 32-bit data - self._theta = np.float32(theta) + self._theta = theta if isinstance(theta, int) else np.float32(theta) - def unit_variance(self, mu: np.ndarray) -> np.ndarray: - """Compute the unit variance of a Negative Binomial distribution - ``v(mu) = mu + theta * mu^2``. - - Parameters - ---------- - mu : array-like, shape (n_samples,) - Predicted mean. - - Returns - ------- - numpy.ndarray, shape (n_samples,) - """ + def unit_variance(self, mu): # noqa D return mu + self.theta * mu**2 - def unit_variance_derivative(self, mu: np.ndarray) -> np.ndarray: - r"""Compute the derivative of the unit variance of a Negative Binomial distribution. - - Equation: :math:`v(\mu) = 1 + 2 \times \theta \times \mu`. - - Parameters - ---------- - mu : array-like, shape (n_samples,) - Predicted mean. - - Returns - ------- - numpy.ndarray, shape (n_samples,) - """ + def unit_variance_derivative(self, mu): # noqa D return 1 + 2 * self.theta * mu - def deviance(self, y, mu, sample_weight=None) -> float: - """Compute the deviance. - - Parameters - ---------- - y : array-like, shape (n_samples,) - Target values. - - mu : array-like, shape (n_samples,) - Predicted mean. - - sample_weight : array-like, shape (n_samples,), optional (default=1) - Sample weights.
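A sketch of the negative binomial mean-variance relationship and the stricter ``theta`` validation in the setter above (illustrative only)::

    import numpy as np
    from glum import NegativeBinomialDistribution

    nb = NegativeBinomialDistribution(theta=0.5)
    mu = np.array([1.0, 2.0])

    # v(mu) = mu + theta * mu**2
    assert np.allclose(nb.unit_variance(mu), mu + 0.5 * mu**2)

    try:
        NegativeBinomialDistribution(theta=-1.0)  # theta must be strictly positive
    except ValueError as err:
        print(err)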
- """ + def deviance(self, y, mu, sample_weight=None) -> float: # noqa D theta = self.theta y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) sample_weight = np.ones_like(y) if sample_weight is None else sample_weight return negative_binomial_deviance(y, sample_weight, mu, theta=float(theta)) - def unit_deviance(self, y: np.ndarray, mu: np.ndarray) -> np.ndarray: - """Get the deviance of each observation.""" + def unit_deviance(self, y, mu): # noqa D theta = self.theta r = 1.0 / theta @@ -1221,12 +1432,12 @@ def _eta_mu_deviance( self, link: Link, factor: float, - cur_eta: np.ndarray, - X_dot_d: np.ndarray, - y: np.ndarray, - sample_weight: np.ndarray, - eta_out: np.ndarray, - mu_out: np.ndarray, + cur_eta, + X_dot_d, + y, + sample_weight, + eta_out, + mu_out, ): if isinstance(link, LogLink): return negative_binomial_log_eta_mu_deviance( @@ -1266,30 +1477,9 @@ def log_likelihood(self, y, mu, sample_weight=None, dispersion=1) -> float: return negative_binomial_log_likelihood(y, sample_weight, mu, float(theta), 1.0) - def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> float: - r"""Estimate the dispersion parameter :math:`\phi`. - - Parameters - ---------- - y : array-like, shape (n_samples,) - Target values. - - mu : array-like, shape (n_samples,) - Predicted mean. - - sample_weight : array-like, shape (n_samples,), optional (default=None) - Weights or exposure to which variance is inversely proportional. - - ddof : int, optional (default=1) - Degrees of freedom consumed by the model for ``mu``. - - method = {'pearson', 'deviance'}, optional (default='pearson') - Whether to base the estimate on the Pearson residuals or the deviance. - - Returns - ------- - float - """ + def dispersion( # noqa D + self, y, mu, sample_weight=None, ddof=1, method="pearson" + ) -> float: theta = self.theta # noqa: F841 y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) @@ -1307,8 +1497,8 @@ def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> flo def guess_intercept( - y: np.ndarray, - sample_weight: np.ndarray, + y, + sample_weight, link: Link, distribution: ExponentialDispersionModel, eta: Union[np.ndarray, float] = None, @@ -1327,51 +1517,77 @@ def guess_intercept( avg_y = np.average(y, weights=sample_weight) if isinstance(link, IdentityLink): - # This is only correct for normal. For other distributions, answer is unknown, - # but assume that we want sum(y) = sum(mu) + + # This is only correct for the normal. 
For other distributions, the + # answer is unknown, but we assume that we want `sum(y) = sum(mu)` + if eta is None: return avg_y + avg_eta = eta if np.isscalar(eta) else np.average(eta, weights=sample_weight) + return avg_y - avg_eta + elif isinstance(link, LogLink): + # This is only correct for Tweedie + log_avg_y = np.log(avg_y) + assert np.isfinite(log_avg_y).all() if eta is None: return log_avg_y + mu = np.exp(eta) + if isinstance(distribution, TweedieDistribution): p = distribution.power + elif isinstance(distribution, NormalDistribution): + p = 0 + elif isinstance(distribution, PoissonDistribution): + p = 1 + elif isinstance(distribution, GammaDistribution): + p = 2 + elif isinstance(distribution, InverseGaussianDistribution): + p = 3 else: p = 1 # Like Poisson + if np.isscalar(mu): first = np.log(y.dot(sample_weight) * mu ** (1 - p)) second = np.log(sample_weight.sum() * mu ** (2 - p)) else: first = np.log((y * mu ** (1 - p)).dot(sample_weight)) second = np.log((mu ** (2 - p)).dot(sample_weight)) + return first - second + elif isinstance(link, LogitLink): + log_odds = np.log(avg_y) - np.log(1 - avg_y) + if eta is None: return log_odds + avg_eta = eta if np.isscalar(eta) else np.average(eta, weights=sample_weight) + return log_odds - avg_eta + else: + return link.link(y.dot(sample_weight)) def get_one_over_variance( distribution: ExponentialDispersionModel, link: Link, - mu: np.ndarray, - eta: np.ndarray, - dispersion, - sample_weight: np.ndarray, + mu, + eta, + dispersion: float, + sample_weight, ): - """ - Get one over the variance. + """Get one over the variance. For Tweedie: ``sigma_inv = sample_weight / (mu ** p)`` during optimization, because ``phi = 1``. diff --git a/src/glum/_functions.pyx b/src/glum/_functions.pyx index 9d805fc9..6c19a2b0 100644 --- a/src/glum/_functions.pyx +++ b/src/glum/_functions.pyx @@ -41,9 +41,6 @@ def normal_identity_eta_mu_deviance( for i in prange(n, nogil=True): eta_out[i] = cur_eta[i] + factor * X_dot_d[i] mu_out[i] = eta_out[i] - # Note: deviance is equal to -2 times the true log likelihood to match - # the default calculation using unit_deviance in _distribution.py - # True log likelihood: -1/2 * (y - mu)**2 deviance += weights[i] * (y[i] - mu_out[i]) ** 2 return deviance @@ -114,7 +111,6 @@ def poisson_log_eta_mu_deviance( for i in prange(n, nogil=True): eta_out[i] = cur_eta[i] + factor * X_dot_d[i] mu_out[i] = exp(eta_out[i]) - # True log likelihood: y * eta - mu - lgamma(1 + y) deviance += weights[i] * (y[i] * eta_out[i] - mu_out[i]) return -2 * deviance @@ -183,7 +179,6 @@ def gamma_log_eta_mu_deviance( for i in prange(n, nogil=True): eta_out[i] = cur_eta[i] + factor * X_dot_d[i] mu_out[i] = exp(eta_out[i]) - # True log likelihood: -(y / mu + eta) deviance += weights[i] * (y[i] / mu_out[i] + eta_out[i]) return 2 * deviance @@ -238,6 +233,92 @@ def gamma_deviance( return 2 * D +def inv_gaussian_log_eta_mu_deviance( + const_floating1d cur_eta, + const_floating1d X_dot_d, + const_floating1d y, + const_floating1d weights, + floating[:] eta_out, + floating[:] mu_out, + floating factor +): + cdef int n = cur_eta.shape[0] + cdef int i # loop counter + cdef floating sq_err # helper + cdef floating deviance = 0.0 # output + + for i in prange(n, nogil=True): + + eta_out[i] = cur_eta[i] + factor * X_dot_d[i] + mu_out[i] = exp(eta_out[i]) + + sq_err = (y[i] / mu_out[i] - 1) ** 2 + + deviance += weights[i] * sq_err / y[i] + + return deviance + +def inv_gaussian_log_rowwise_gradient_hessian( + const_floating1d y, + const_floating1d weights, + 
const_floating1d eta, + const_floating1d mu, + floating[:] gradient_rows_out, + floating[:] hessian_rows_out +): + cdef int n = eta.shape[0] + cdef int i # loop counter + + cdef floating inv_mu, inv_mu2 + + for i in prange(n, nogil=True): + + inv_mu = 1 / mu[i] + inv_mu2 = inv_mu ** 2 + + gradient_rows_out[i] = 2 * weights[i] * (inv_mu - y[i] * inv_mu2) + hessian_rows_out[i] = 2 * weights[i] * (inv_mu - 2 * y[i] * inv_mu2) + +def inv_gaussian_log_likelihood( + const_floating1d y, + const_floating1d weights, + const_floating1d mu, + floating dispersion, +): + cdef int n = y.shape[0] # loop length + cdef int i # loop counter + cdef floating sum_weights # helper + cdef floating ll = 0.0 # output + + cdef floating sq_err # helper + cdef floating inv_dispersion = 1 / (2 * dispersion) # helper + + for i in prange(n, nogil=True): + + sq_err = (y[i] / mu[i] - 1) ** 2 + + ll -= weights[i] * (inv_dispersion * sq_err / y[i] + log(y[i]) * 3 / 2) + sum_weights -= weights[i] + + return ll + sum_weights * log(inv_dispersion / M_PI) + +def inv_gaussian_deviance( + const_floating1d y, + const_floating1d weights, + const_floating1d mu, + floating dispersion, +): + cdef int i # loop counter + cdef int n = y.shape[0] # loop length + cdef floating sq_err # helper + cdef floating D = 0.0 # output + + for i in prange(n, nogil=True): + sq_err = (y[i] / mu[i] - 1) ** 2 + D += weights[i] * sq_err / y[i] + + return D + def tweedie_log_eta_mu_deviance( const_floating1d cur_eta, const_floating1d X_dot_d, @@ -451,7 +532,6 @@ def negative_binomial_log_eta_mu_deviance( for i in prange(n, nogil=True): eta_out[i] = cur_eta[i] + factor * X_dot_d[i] mu_out[i] = exp(eta_out[i]) - # True log likelihood: y * log(y / mu) - (y + r) * log((y + r) / (mu + r)) deviance += weights[i] * (-y[i] * eta_out[i] + (y[i] + r) * log(mu_out[i] + r)) return 2 * deviance diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 67eb8332..33f7ba51 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -494,23 +494,19 @@ def get_family( def get_link(link: Union[str, Link], family: ExponentialDispersionModel) -> Link: """ - For the Tweedie distribution, this code follows actuarial best practices regarding - link functions. Note that these links are sometimes not canonical: - - identity for normal (``p=0``); + For the Tweedie distribution, this code follows actuarial best practices + regarding link functions. Note that these links are sometimes not canonical: + - identity for normal (``p = 0``); - no convention for ``p < 0``, so let's leave it as identity; - log otherwise. """ if isinstance(link, Link): return link - if link == "auto": - if isinstance(family, TweedieDistribution): - if family.power <= 0: + + if (link is None) or (link == "auto"): + if tweedie_representation := family.to_tweedie(safe=False): + if tweedie_representation.power <= 0: return IdentityLink() - if family.power < 1: - raise ValueError( - "For 0 < p < 1, no Tweedie distribution exists. " - "Please choose a different distribution." - ) return LogLink() if isinstance(family, GeneralizedHyperbolicSecant): return IdentityLink() @@ -523,16 +519,20 @@ def get_link(link: Union[str, Link], family: ExponentialDispersionModel) -> Link "Please set link manually, i.e. not to 'auto'. " f"Got (link='auto', family={family.__class__.__name__})." 
) - if link == "identity": - return IdentityLink() - if link == "log": - return LogLink() - if link == "logit": - return LogitLink() - if link == "cloglog": - return CloglogLink() - if link[:7] == "tweedie": - return TweedieLink(float(link[7:])) + + mapping = { + "cloglog": CloglogLink(), + "identity": IdentityLink(), + "log": LogLink(), + "logit": LogitLink(), + "tweedie": TweedieLink(1.5), + } + + if link in mapping: + return mapping[link] + if custom_tweedie := re.search(r"tweedie\s?\((.+)\)", link): + return TweedieLink(float(custom_tweedie.group(1))) + raise ValueError( "The link must be an instance of class Link or an element of " "['auto', 'identity', 'log', 'logit', 'cloglog', 'tweedie']; " @@ -3669,25 +3669,12 @@ def _compute_information_criteria( [3] Park, M.Y., 2006. Generalized linear models with regularization; Stanford Universty. """ - - # we require that the log_likelihood be defined - model_err_str = ( - "The computation of the information criteria has only " - + "been defined for models with a Binomial likelihood, Negative " - + "Binomial likelihood or a Tweedie likelihood with power <= 2." - ) - if not isinstance( - self.family_instance, - (BinomialDistribution, TweedieDistribution, NegativeBinomialDistribution), - ): - raise NotImplementedError(model_err_str) - - # the log_likelihood has not been implemented for the InverseGaussianDistribution - if ( - isinstance(self.family_instance, TweedieDistribution) - and self.family_instance.power > 2 - ): - raise NotImplementedError(model_err_str) + if not hasattr(self.family_instance, "log_likelihood"): + raise NotImplementedError( + "The family instance does not define a `log_likelihood` method, so " + "information criteria cannot be computed. Compatible families include " + "the binomial, negative binomial and Tweedie (power<=2 or power=3)." + ) ddof = np.sum(np.abs(self.coef_) > np.finfo(self.coef_.dtype).eps) k_params = ddof + self.fit_intercept @@ -3695,9 +3682,8 @@ def _compute_information_criteria( if nobs != self._num_obs: raise ValueError( - "The same dataset that was used for training should " - + "also be used for the computation of information " - + "criteria" + "The same dataset that was used for training should also be used for " + "the computation of information criteria." ) mu = self.predict(X, context=context) @@ -3705,6 +3691,7 @@ def _compute_information_criteria( aic = -2 * ll + 2 * k_params bic = -2 * ll + np.log(nobs) * k_params + if nobs > k_params + 1: aicc = aic + 2 * k_params * (k_params + 1) / (nobs - k_params - 1) else: diff --git a/src/glum/_link.py b/src/glum/_link.py index 8a19b731..179afac9 100644 --- a/src/glum/_link.py +++ b/src/glum/_link.py @@ -1,21 +1,20 @@ import warnings from abc import ABCMeta, abstractmethod +from typing import Callable import numpy as np from scipy import special -from ._util import _asanyarray - class Link(metaclass=ABCMeta): - """Abstract base class for Link functions.""" + """Abstract base class for link functions.""" @abstractmethod def link(self, mu): - """Compute the link function ``g(mu)``. + """Compute the link function. - The link function links the mean, ``mu ≡ E(Y)``, to the linear predictor - ``X * w``, i.e. ``g(mu)`` is equal to the linear predictor. + The link function ``g`` links the mean, ``mu ≡ E(Y)``, to the linear + predictor, ``X * w``, so that ``g(mu)`` is equal to the linear predictor. Parameters ---------- @@ -26,7 +25,7 @@ def link(self, mu): @abstractmethod def derivative(self, mu): - """Compute the derivative of the link ``g'(mu)``. 
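Given the mapping and the regex above, link strings now resolve as follows. This sketch assumes ``get_link`` remains importable from ``glum._glm``; it is an internal helper shown in the hunk, not public API::

    from glum import TweedieDistribution, TweedieLink
    from glum._glm import get_link  # internal helper, shown in the diff above

    family = TweedieDistribution(1.5)

    # bare name falls back to the default power of 1.5 ...
    assert get_link("tweedie", family) == TweedieLink(1.5)

    # ... while a parenthesized power is parsed by the regex.
    assert get_link("tweedie (1.25)", family) == TweedieLink(1.25)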
+ """Compute the derivative of the link function. Parameters ---------- @@ -37,11 +36,11 @@ def derivative(self, mu): @abstractmethod def inverse(self, lin_pred): - """Compute the inverse link function ``h(lin_pred)``. + """Compute the inverse link function. - Gives the inverse relationship between linear predictor, - ``lin_pred ≡ X * w``, and the mean, ``mu ≡ E(Y)``, i.e. - ``h(lin_pred) = mu``. + The inverse link function ``h`` gives the inverse relationship between + the linear predictor, ``X * w``, and the mean, ``mu ≡ E(Y)``, so that + ``h(X * w) = mu``. Parameters ---------- @@ -52,7 +51,7 @@ def inverse(self, lin_pred): @abstractmethod def inverse_derivative(self, lin_pred): - """Compute the derivative of the inverse link function ``h'(lin_pred)``. + """Compute the derivative of the inverse link function. Parameters ---------- @@ -63,7 +62,7 @@ def inverse_derivative(self, lin_pred): @abstractmethod def inverse_derivative2(self, lin_pred): - """Compute second derivative of the inverse link function ``h''(lin_pred)``. + """Compute second derivative of the inverse link function. Parameters ---------- @@ -72,137 +71,154 @@ def inverse_derivative2(self, lin_pred): """ pass + def to_tweedie(self, safe=True): + """Return the Tweedie representation of a link function if it exists.""" + if hasattr(self, "__tweedie_repr__"): + return self.__tweedie_repr__() + if safe: + raise ValueError("This link function has no Tweedie representation.") + return None -class IdentityLink(Link): - """The identity link function ``g(x) = x``.""" - def __eq__(self, other): # noqa D - return isinstance(other, self.__class__) +def catch_p(fun) -> Callable: + """Ensure that linear predictors are compatible with the Tweedie power parameter.""" - def link(self, mu): - """Return mu (identity link). + def _to_return(*args, **kwargs): + with np.errstate(invalid="raise"): + try: + result = fun(*args, **kwargs) + except FloatingPointError as e: + raise ValueError( + "Your linear predictors are not supported for power " + f"{args[0].power}. For negative linear predictors, consider using " + "a log link instead." + ) from e + return result - See superclass documentation. + return _to_return - Parameters - ---------- - mu: array-like - """ - return _asanyarray(mu) - def derivative(self, mu): - """Get the derivative of the identity link, a vector of ones. +class TweedieLink(Link): + """The Tweedie link function ``x^(1-p)`` if ``p≠1`` and ``log(x)`` if ``p=1``. - See superclass documentation. + See the documentation of the superclass, :class:`~glum.Link`, for details. + """ - Parameters - ---------- - mu: array-like - """ - return 1.0 if np.isscalar(mu) else np.ones_like(mu) + def __init__(self, power): + self.power = power - def inverse(self, lin_pred): - """Compute the inverse link function ``h(lin_pred)``. + def __eq__(self, other): # noqa D + return isinstance(other, self.__class__) and (self.power == other.power) + + def __tweedie__repr__(self): # noqa D + return self.__class__(self.power) + + @property + def power(self) -> float: # noqa D + return self._power + + @power.setter + def power(self, power): + if not isinstance(power, (int, float, np.number)): + raise TypeError(f"The power parameter must be numeric; got {power}.") + if (power > 0) and (power < 1): + raise ValueError("For `0 1 - eps50) or np.any(inv_logit < eps50): - warnings.warn( - "Computing sigmoid function gave results too close to 0 or 1. Clipping." - ) + warnings.warn("Sigmoid function too close to 0 or 1. 
Clipping.") return np.clip(inv_logit, eps50, 1 - eps50) - return inv_logit - def inverse_derivative(self, lin_pred): - """Compute the derivative of the inverse link function ``h'(lin_pred)``. + return inv_logit - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - ep = special.expit(_asanyarray(lin_pred)) + def inverse_derivative(self, lin_pred): # noqa D + ep = special.expit(lin_pred) return ep * (1.0 - ep) - def inverse_derivative2(self, lin_pred): - """Compute second derivative of the inverse link function ``h''(lin_pred)``. - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - ep = special.expit(_asanyarray(lin_pred)) + def inverse_derivative2(self, lin_pred): # noqa D + ep = special.expit(lin_pred) return ep * (1.0 - ep) * (1.0 - 2 * ep) -def catch_p(fun): - """ - Decorate ``fun``, ensuring that the given linear predictor is compatible with the \ - relevant Tweedie power parameter. - - Parameters - ---------- - fun: TweedieLink method - - Returns - ------- - Callable - """ - - def _to_return(*args, **kwargs): - with np.errstate(invalid="raise"): - try: - result = fun(*args, **kwargs) - except FloatingPointError as e: - raise ValueError( - f"Your linear predictors are not supported for p={args[0].p}. For " - + "negative linear predictors, consider using a log link instead." - ) from e - return result - - return _to_return - - -class TweedieLink(Link): - """The Tweedie link function ``x^(1-p)`` if ``p≠1`` and ``log(x)`` if ``p=1``.""" - - def __init__(self, p): - self.p = p - - def __eq__(self, other): # noqa D - return isinstance(other, self.__class__) and (self.p == other.p) - - def link(self, mu): - """Get the Tweedie canonical link. - - See superclass documentation. - - Parameters - ---------- - mu: array-like - """ - if self.p == 0: - return _asanyarray(mu) - if self.p == 1: - return np.log(_asanyarray(mu)) - return _asanyarray(mu) ** (1 - self.p) - - def derivative(self, mu): - """Get the derivative of the Tweedie link. - - See superclass documentation. - - Parameters - ---------- - mu: array-like - """ - if self.p == 0: - return 1.0 if np.isscalar(mu) else np.ones_like(mu) - if self.p == 1: - return 1 / _asanyarray(mu) - return (1 - self.p) * _asanyarray(mu) ** (-self.p) - - @catch_p - def inverse(self, lin_pred): - """Get the inverse of the Tweedie link. - - See superclass documentation. - - Parameters - ---------- - mu: array-like - """ - if self.p == 0: - return _asanyarray(lin_pred) - if self.p == 1: - return np.exp(_asanyarray(lin_pred)) - return _asanyarray(lin_pred) ** (1 / (1 - self.p)) - - @catch_p - def inverse_derivative(self, lin_pred): - """Compute the derivative of the inverse Tweedie link function ``h'(lin_pred)``. - - Parameters - ---------- - lin_pred : array-like, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - if self.p == 0: - return 1.0 if np.isscalar(lin_pred) else np.ones_like(lin_pred) - if self.p == 1: - return np.exp(_asanyarray(lin_pred)) - return (1 / (1 - self.p)) * _asanyarray(lin_pred) ** (self.p / (1 - self.p)) - - @catch_p - def inverse_derivative2(self, lin_pred): - """Compute second derivative of the inverse Tweedie link function \ - ``h''(lin_pred)``. - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. 
- """ - if self.p == 0: - return 0.0 if np.isscalar(lin_pred) else np.zeros_like(lin_pred) - if self.p == 1: - return np.exp(_asanyarray(lin_pred)) - - result = _asanyarray(lin_pred) ** ((2 * self.p - 1) / (1 - self.p)) - result *= self.p / (1 - self.p) ** 2 - - return result - - class CloglogLink(Link): """The complementary log-log link function ``log(-log(-p))``.""" def __eq__(self, other): # noqa D return isinstance(other, self.__class__) - def link(self, mu): - """Get the logit function of ``mu``. - - See superclass documentation. - - Parameters - ---------- - mu: array-like - - Returns - ------- - numpy.ndarray - """ - mu = _asanyarray(mu) + def link(self, mu): # noqa D return np.log(-np.log1p(-mu)) - def derivative(self, mu): - """Get the derivative of the cloglog link. - - See superclass documentation. - - Parameters - ---------- - mu: array-like - - Returns - ------- - array-like - """ - mu = _asanyarray(mu) + def derivative(self, mu): # noqa D return 1.0 / ((mu - 1) * (np.log1p(-mu))) - def inverse(self, lin_pred): - """Get the inverse of the cloglog link. - - See superclass documentation. - - Note: since passing a very large value might result in an output of one, - this function bounds the output to be between ``[50*eps, 1 - 50*eps]``, - where ``eps`` is floating point epsilon. + def inverse(self, lin_pred): # noqa D - Parameters - ---------- - lin_pred: array-like - - Returns - ------- - array-like - """ - lin_pred = _asanyarray(lin_pred) + lin_pred = lin_pred inv_cloglog = -np.expm1(-np.exp(lin_pred)) eps50 = 50 * np.finfo(inv_cloglog.dtype).eps + if np.any(inv_cloglog > 1 - eps50) or np.any(inv_cloglog < eps50): - warnings.warn( - "Computing sigmoid function gave results too close to 0 or 1. Clipping." - ) + warnings.warn("Sigmoid function too close to 0 or 1. Clipping.") return np.clip(inv_cloglog, eps50, 1 - eps50) - return inv_cloglog - def inverse_derivative(self, lin_pred): - """Compute the derivative of the inverse link function ``h'(lin_pred)``. + return inv_cloglog - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - lin_pred = _asanyarray(lin_pred) + def inverse_derivative(self, lin_pred): # noqa D return np.exp(lin_pred - np.exp(lin_pred)) - def inverse_derivative2(self, lin_pred): - """Compute second derivative of the inverse link function ``h''(lin_pred)``. - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. 
- """ - lin_pred = _asanyarray(lin_pred) + def inverse_derivative2(self, lin_pred): # noqa D # TODO: check if numerical stability can be improved return np.exp(np.exp(lin_pred) - lin_pred) * np.expm1(lin_pred) diff --git a/tests/glm/test_distribution.py b/tests/glm/test_distribution.py index be2a694a..e58fd4c0 100644 --- a/tests/glm/test_distribution.py +++ b/tests/glm/test_distribution.py @@ -56,9 +56,9 @@ def test_family_bounds(family, expected): def test_tweedie_distribution_power(): with pytest.raises(ValueError, match="no distribution exists"): TweedieDistribution(power=0.5) - with pytest.raises(TypeError, match="must be an int or float"): + with pytest.raises(TypeError, match="must be numeric"): TweedieDistribution(power=1j) - with pytest.raises(TypeError, match="must be an int or float"): + with pytest.raises(TypeError, match="must be numeric"): dist = TweedieDistribution() dist.power = 1j @@ -89,11 +89,11 @@ def test_tweedie_distribution_parsing(): def test_negative_binomial_distribution_alpha(): - with pytest.raises(ValueError, match="must be strictly positive number"): + with pytest.raises(ValueError, match="must be strictly positive"): NegativeBinomialDistribution(theta=-0.5) - with pytest.raises(TypeError, match="must be an int or float"): + with pytest.raises(TypeError, match="must be numeric"): NegativeBinomialDistribution(theta=1j) - with pytest.raises(TypeError, match="must be an int or float"): + with pytest.raises(TypeError, match="must be numeric"): dist = NegativeBinomialDistribution() dist.theta = 1j @@ -119,16 +119,24 @@ def test_negative_binomial_distribution_parsing(): def test_equality(): - assert TweedieDistribution(1) == TweedieDistribution(1) - assert TweedieDistribution(1) == PoissonDistribution() - assert TweedieDistribution(2) == GammaDistribution() - assert PoissonDistribution() == PoissonDistribution() - assert TweedieDistribution(1) != TweedieDistribution(1.5) - assert TweedieDistribution(1) != BinomialDistribution() assert BinomialDistribution() == BinomialDistribution() - assert NegativeBinomialDistribution(1) == NegativeBinomialDistribution(1) - assert NegativeBinomialDistribution(1) != NegativeBinomialDistribution(1.5) + assert GammaDistribution() == GammaDistribution() assert NegativeBinomialDistribution(1) != BinomialDistribution() + assert NegativeBinomialDistribution(1) != NegativeBinomialDistribution(1.5) + assert NegativeBinomialDistribution(1) == NegativeBinomialDistribution(1) + assert NormalDistribution() == NormalDistribution() + assert PoissonDistribution() == PoissonDistribution() + assert TweedieDistribution(0) != NormalDistribution() + assert TweedieDistribution(0) == NormalDistribution().to_tweedie() + assert TweedieDistribution(1) != BinomialDistribution() + assert TweedieDistribution(1) != PoissonDistribution() + assert TweedieDistribution(1) == PoissonDistribution().to_tweedie() + assert TweedieDistribution(1) != TweedieDistribution(1.5) + assert TweedieDistribution(1) == TweedieDistribution(1) + assert TweedieDistribution(2) != GammaDistribution() + assert TweedieDistribution(2) == GammaDistribution().to_tweedie() + assert TweedieDistribution(3) != InverseGaussianDistribution() + assert TweedieDistribution(3) == InverseGaussianDistribution().to_tweedie() @pytest.mark.parametrize( diff --git a/tests/glm/test_link.py b/tests/glm/test_link.py index 02052afb..7c4095d5 100644 --- a/tests/glm/test_link.py +++ b/tests/glm/test_link.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from glum._link import CloglogLink, Link, LogitLink, 
LogLink, TweedieLink +from glum._link import CloglogLink, IdentityLink, Link, LogitLink, LogLink, TweedieLink @pytest.mark.parametrize("link", Link.__subclasses__()) @@ -34,9 +34,13 @@ def test_link_properties(link): def test_equality(): - assert TweedieLink(1.5) == TweedieLink(1.5) - assert TweedieLink(1) != LogLink() + assert IdentityLink() == IdentityLink() + assert LogitLink() == LogitLink() assert LogLink() == LogLink() - assert TweedieLink(1.5) != TweedieLink(2.5) + assert TweedieLink(0) != IdentityLink() + assert TweedieLink(0) == IdentityLink().to_tweedie() assert TweedieLink(1.5) != LogitLink() - assert LogitLink() == LogitLink() + assert TweedieLink(1.5) != TweedieLink(2.5) + assert TweedieLink(1.5) == TweedieLink(1.5) + assert TweedieLink(1) != LogLink() + assert TweedieLink(1) == LogLink().to_tweedie() From 1dc4f3039ff06d51c8681f483f26e7706bed8420 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> Date: Fri, 8 Mar 2024 09:05:14 +0100 Subject: [PATCH 43/63] Explain `scale_predictors` more (#778) * Expand on effect of scale_predictors and remove note * Update src/glum/_glm.py Co-authored-by: Jan Tilly * remove sentence --------- Co-authored-by: Jan Tilly --- src/glum/_glm.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 33f7ba51..71addd03 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -3127,10 +3127,11 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): set ``verbose`` to any positive number for verbosity. scale_predictors: bool, optional (default=False) - If ``True``, estimate a scaled model where all predictors have a - standard deviation of 1. This can result in better estimates if - predictors are on very different scales (for example, centimeters and - kilometers). + If ``True``, scale all predictors to have standard deviation one. + Should be set to ``True`` if ``alpha > 0`` and if you want coefficients + to be penalized equally. + + Reported coefficient estimates are always at the original scale. Advanced developer note: Internally, predictors are always rescaled for computational reasons, but this only affects results if @@ -3223,10 +3224,6 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. - For ``alpha > 0``, the feature matrix ``X`` should be standardized in order - to penalize features equally strong. Call - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - If the target ``y`` is a ratio, appropriate sample weights ``s`` should be provided. As an example, consider Poisson distributed counts ``z`` (integers) and weights ``s = exposure`` (time, money, persons years, ...). 
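The ``scale_predictors`` docstring clarified above can be made concrete with a
minimal sketch (not part of any patch in this series; the data, penalty
strength and seed are purely illustrative). With ``alpha > 0``, how strongly
each column is effectively penalized depends on its scale unless
``scale_predictors=True``; in either case, glum reports the fitted
coefficients on the original scale:

    import numpy as np
    from glum import GeneralizedLinearRegressor

    rng = np.random.default_rng(42)
    X = rng.normal(size=(1000, 2))
    X[:, 1] *= 100  # second predictor lives on a 100x larger scale
    # Both predictors carry the same signal per standard deviation.
    y = X[:, 0] + X[:, 1] / 100 + rng.normal(size=1000)

    for scale in (False, True):
        glm = GeneralizedLinearRegressor(alpha=1.0, scale_predictors=scale)
        glm.fit(X, y)
        # Coefficients come back on the original scale in both cases;
        # only the effective penalty weighting across columns differs.
        print(scale, glm.coef_)
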
From fb3a790ce81b7199e5780d76bb0fb60c6bc97e85 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Mon, 25 Mar 2024 08:47:26 +0000 Subject: [PATCH 44/63] Move helpers into `_utils` (#782) --- src/glum/_glm.py | 86 +++++++++++------------------------------------ src/glum/_util.py | 48 ++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 67 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 71addd03..6a78c9d8 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -20,7 +20,6 @@ import sys import warnings from collections.abc import Iterable, Mapping, Sequence -from itertools import chain from typing import Any, NamedTuple, Optional, Union, cast import numpy as np @@ -64,7 +63,13 @@ _least_squares_solver, _trust_constr_solver, ) -from ._util import _add_missing_categories, _align_df_categories, _safe_toarray +from ._util import ( + _add_missing_categories, + _align_df_categories, + _expand_categorical_penalties, + _is_contiguous, + _safe_toarray, +) _float_itemsize_to_dtype = {8: np.float64, 4: np.float32, 2: np.float16} @@ -2642,15 +2647,6 @@ def _should_copy_X(self): # If self.copy_X is False, check for data of wrong dtype and error if it exists. return self.copy_X or False - def _is_contiguous(self, X): - if isinstance(X, np.ndarray): - return X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] - elif isinstance(X, pd.DataFrame): - return self._is_contiguous(X.values) - else: - # If not a numpy array or pandas data frame, we assume it is contiguous. - return True - def _set_up_and_check_fit_args( self, X: ArrayLike, @@ -2679,6 +2675,7 @@ def _set_up_and_check_fit_args( P2 = self.P2 copy_X = self._should_copy_X() + drop_first = getattr(self, "drop_first", False) if isinstance(X, pd.DataFrame): if hasattr(self, "formula") and self.formula is not None: @@ -2724,18 +2721,19 @@ def _set_up_and_check_fit_args( ) self.X_model_spec_ = X.model_spec - self.feature_names_ = list(X.model_spec.column_names) - self.term_names_ = list( - chain.from_iterable( - [term] * len(cols) for term, _, cols in X.model_spec.structure - ) - ) + + self.term_names_ = [ + term + for term, _, cols in X.model_spec.structure + for _ in range(len(cols)) + ] else: # Maybe TODO: expand categorical penalties with formulas self.feature_dtypes_ = X.dtypes.to_dict() + self.has_missing_category_ = { col: (getattr(self, "cat_missing_method", "fail") == "convert") and X[col].isna().any() @@ -2744,62 +2742,16 @@ def _set_up_and_check_fit_args( } if any(X.dtypes == "category"): - - def _expand_categorical_penalties( - penalty, X, drop_first, has_missing_category - ): - """ - If P1 or P2 has the same shape as X before expanding the - categoricals, we assume that the penalty at the location of - the categorical is the same for all levels. - """ - if isinstance(penalty, str): - return penalty - if not sparse.issparse(penalty): - penalty = np.asanyarray(penalty) - - if penalty.shape[0] == X.shape[1]: - if penalty.ndim == 2: - raise ValueError( - "When the penalty is two dimensional, it has " - "to have the same length as the number of " - "columns of X, after the categoricals " - "have been expanded." 
- ) - return np.array( - list( - chain.from_iterable( - ( - [ - elmt - for _ in range( - len(dtype.categories) - + has_missing_category[col] - - drop_first - ) - ] - if pd.api.types.is_categorical_dtype(dtype) - else [elmt] - ) - for elmt, (col, dtype) in zip( - penalty, X.dtypes.items() - ) - ) - ) - ) - else: - return penalty - P1 = _expand_categorical_penalties( - self.P1, X, self.drop_first, self.has_missing_category_ + self.P1, X, drop_first, self.has_missing_category_ ) P2 = _expand_categorical_penalties( - self.P2, X, self.drop_first, self.has_missing_category_ + self.P2, X, drop_first, self.has_missing_category_ ) X = tm.from_pandas( X, - drop_first=self.drop_first, + drop_first=drop_first, categorical_format=getattr( # convention prior to v3 self, "categorical_format", "{name}__{category}" ), @@ -2810,7 +2762,7 @@ def _expand_categorical_penalties( if y is None: raise ValueError("y cannot be None when not using a two-sided formula.") - if not self._is_contiguous(X): + if not _is_contiguous(X): if self.copy_X is not None and not self.copy_X: raise ValueError( "The X matrix is noncontiguous and copy_X = False." diff --git a/src/glum/_util.py b/src/glum/_util.py index b27d4fd8..0042d0ba 100644 --- a/src/glum/_util.py +++ b/src/glum/_util.py @@ -114,6 +114,54 @@ def _add_missing_categories( return df +def _expand_categorical_penalties( + penalty, X, drop_first, has_missing_category +) -> np.ndarray: +"""Determine penalty matrices `P1` or `P2` after expanding categorical columns. + +If `P1` or `P2` has the same shape as `X` before expanding categorical columns, we assume that the penalty at the location of categorical columns is the same for all levels. +""" + + if isinstance(penalty, str): + return penalty + if not sparse.issparse(penalty): + penalty = np.asanyarray(penalty) + + if penalty.shape[0] == X.shape[1]: + + if penalty.ndim == 2: + raise ValueError( + "When the penalty is two-dimensional, it must have the " + "same length as the number of columns in the design " + "matrix `X` after expanding categorical columns." + ) + + expanded_penalty = [] # type: ignore + + for element, (column, dt) in zip(penalty, X.dtypes.items()): + if isinstance(dt, pd.CategoricalDtype): + length = len(dt.categories) + has_missing_category[column] - drop_first + expanded_penalty.extend(element for _ in range(length)) + else: + expanded_penalty.append(element) + + return np.array(expanded_penalty) + + else: + + return penalty + + +def _is_contiguous(X) -> bool: + if isinstance(X, np.ndarray): + return X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] + elif isinstance(X, pd.DataFrame): + return _is_contiguous(X.values) + else: + # If not a numpy array or pandas data frame, we assume it is contiguous. + return True + + def _safe_lin_pred( X: Union[MatrixBase, StandardizedMatrix], coef: np.ndarray, From 6c83386bfef0c8415b076fca94bb481df9dfc250 Mon Sep 17 00:00:00 2001 From: lbittarello Date: Mon, 25 Mar 2024 09:56:04 +0100 Subject: [PATCH 45/63] Patch docstring --- src/glum/_util.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/glum/_util.py b/src/glum/_util.py index 0042d0ba..e0ef40ee 100644 --- a/src/glum/_util.py +++ b/src/glum/_util.py @@ -117,11 +117,12 @@ def _add_missing_categories( def _expand_categorical_penalties( penalty, X, drop_first, has_missing_category ) -> np.ndarray: -"""Determine penalty matrices `P1` or `P2` after expanding categorical columns. 
- -If `P1` or `P2` has the same shape as `X` before expanding categorical columns, we assume that the penalty at the location of categorical columns is the same for all levels. -""" + """Determine penalty matrices ``P1`` or ``P2`` after expanding categorical columns. + If ``P1`` or ``P2`` has the same shape as ``X`` before expanding categorical + columns, we assume that the penalty at the location of categorical columns + is the same for all levels. + """ if isinstance(penalty, str): return penalty if not sparse.issparse(penalty): From b8f6f8fdff0e26064927eace15101edc381c38b8 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> Date: Fri, 12 Apr 2024 10:53:04 +0200 Subject: [PATCH 46/63] Update CHANGELOG.rst Co-authored-by: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> --- CHANGELOG.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 916cd007..b0b790f8 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,9 +12,9 @@ Changelog **Breaking changes:** -- All arguments to :class:`~glum.GeneralizedLinearRegressorBase`, :class:`~glum.GeneralizedLinearRegressor`, and :class:`GeneralizedLinearRegressorCV` are now keyword-only. -- All arguments to public methods of :class:`~glum.GeneralizedLinearRegressorBase`, :class:`~glum.GeneralizedLinearRegressor` or :class:`GeneralizedLinearRegressorCV` except `X`, `y`, `sample_weight`, and `offset` are now keyword-only. -- :class:`~glum.GeneralizedLinearRegressor`'s default value for `alpha` is now `0`, i.e. no regularization. +- All arguments to :class:`~glum.GeneralizedLinearRegressorBase`, :class:`~glum.GeneralizedLinearRegressor` and :class:`GeneralizedLinearRegressorCV` are now keyword-only. +- All arguments to public methods of :class:`~glum.GeneralizedLinearRegressorBase`, :class:`~glum.GeneralizedLinearRegressor` or :class:`GeneralizedLinearRegressorCV` except ``X``, ``y``, ``sample_weight`` and ``offset`` are now keyword-only. +- :class:`~glum.GeneralizedLinearRegressor`'s default value for ``alpha`` is now ``0``, i.e. no regularization. - :class:`~glum.GammaDistribution`, :class:`~glum.InverseGaussianDistribution`, :class:`~glum.NormalDistribution` and :class:`~glum.PoissonDistribution` no longer inherit from :class:`~glum.TweedieDistribution`. - The power parameter of :class:`~glum.TweedieLink` has been renamed from ``p`` to ``power``, in line with :class:`~glum.TweedieDistribution`. - :class:`~glum.TweedieLink` no longer instantiates :class:`~glum.IdentityLink` or :class:`~glum.LogLink` for ``power=0`` and ``power=1``, respectively. On the other hand, :class:`~glum.TweedieLink` is now compatible with ``power=0`` and ``power=1``. @@ -24,7 +24,7 @@ Changelog - Added a formula interface for specifying models. - Improved feature name handling. Feature names are now created for non-pandas input matrices too. Furthermore, the format of categorical features can be specified by the user. - Term names are now stored in the model's attributes. This is useful for categorical features, where they refer to the whole variable, not just single levels. -- Added more options for treating missing values in categorical columns. They can either raise a `ValueError` (`"fail"`), be treated as all-zero indicators (`"zero"`) or represented as a new category (`"convert"`). +- Added more options for treating missing values in categorical columns. 
They can either raise a ``ValueError`` (``"fail"``), be treated as all-zero indicators (``"zero"``) or represented as a new category (``"convert"``).
 - :meth:`~glum.GeneralizedLinearRegressor.wald_test` can now perform tests based on a formula string and term names.
 - :class:`~glum.InverseGaussianDistribution` gains a :meth:`~glum.InverseGaussianDistribution.log_likelihood` method.

From 326b99c0ca02c90fe9138603eb1e05afe9b06151 Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com>
Date: Fri, 12 Apr 2024 14:31:12 +0200
Subject: [PATCH 47/63] Apply suggestions from code review

Co-authored-by: Luca Bittarello <15511539+lbittarello@users.noreply.github.com>
---
 src/glum/_util.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/glum/_util.py b/src/glum/_util.py
index e0ef40ee..7dbd769f 100644
--- a/src/glum/_util.py
+++ b/src/glum/_util.py
@@ -57,13 +57,11 @@ def _align_df_categories(
             continue
 
         if cat_missing_method == "convert" and not has_missing_category[column]:
-            unseen_categories = set(df[column].unique()) - set(
-                dtypes[column].categories
-            )
+            unseen_categories = set(df[column].unique())
+            unseen_categories = unseen_categories - set(dtypes[column].categories)
         else:
-            unseen_categories = set(df[column].dropna().unique()) - set(
-                dtypes[column].categories
-            )
+            unseen_categories = set(df[column].dropna().unique())
+            unseen_categories = unseen_categories - set(dtypes[column].categories)
 
         if unseen_categories:
             raise ValueError(
@@ -91,7 +89,7 @@ def _add_missing_categories(
     categorical_dtypes = [
         column
         for column, dtype in dtypes.items()
-        if pd.api.types.is_categorical_dtype(dtype) and (column in df)
+        if isinstance(dtype, pd.CategoricalDtype) and (column in df)
     ]
 
     for column in categorical_dtypes:
From b512a5c2ac206262ad7c7565eb60421fbd868163 Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher
Date: Fri, 12 Apr 2024 16:07:59 +0200
Subject: [PATCH 48/63] shorten docstrings of private functions; typos in defaults; other suggestions

---
 src/glum/_glm.py    | 192 +++----------------------------------------
 src/glum/_glm_cv.py |   4 +-
 2 files changed, 15 insertions(+), 181 deletions(-)

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index 6a78c9d8..0c10ea1d 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -246,14 +246,14 @@ def _parse_formula(
 
     Parameters
     ----------
-    formula : FormulaSpec
+    formula : formulaic.FormulaSpec
         The formula to parse.
     include_intercept: bool, default True
         Whether to include an intercept column.
 
     Returns
     -------
-    tuple[Formula, Formula]
+    tuple[formulaic.Formula, formulaic.Formula]
         The left-hand side and right-hand sides of the formula.
     """
     if isinstance(formula, str):
@@ -1735,52 +1735,10 @@ def _wald_test_matrix(
         expected_information=None,
         context: Optional[Mapping[str, Any]] = None,
     ) -> WaldTestResult:
-        """Compute the Wald test statistic and p-value for a linear hypothesis.
-
-        The hypothesis tested is ``R @ coef_ = r``. Under the null hypothesis,
-        the test statistic follows a chi-squared distribution with ``R.shape[0]``
-        degrees of freedom.
-
-        Parameters
-        ----------
-        R : np.ndarray
-            The restriction matrix representing the linear combination of coefficients
-            to test.
-        r : np.ndarray, optional, default=None
-            The vector representing the values of the linear combination.
-            If None, the test is for whether the linear combinations of the coefficients
-            are zero.
-        X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
-            Training data. 
Can be omitted if a covariance matrix has already
-            been computed.
-        y : array-like, shape (n_samples,), optional
-            Target values. Can be omitted if a covariance matrix has already
-            been computed.
-        sample_weight : array-like, shape (n_samples,), optional, default=None
-            Individual weights for each sample.
-        offset : array-like, optional, default=None
-            Array with additive offsets.
-        mu : array-like, optional, default=None
-            Array with predictions. Estimated if absent.
-        dispersion : float, optional, default=None
-            The dispersion parameter. Estimated if absent.
-        robust : boolean, optional, default=None
-            Whether to compute robust standard errors instead of normal ones.
-            If not specified, the model's ``robust`` attribute is used.
-        clusters : array-like, optional, default=None
-            Array with cluster membership. Clustered standard errors are
-            computed if clusters is not None.
-        expected_information : boolean, optional, default=None
-            Whether to use the expected or observed information matrix.
-            Only relevant when computing robust standard errors.
-            If not specified, the model's ``expected_information`` attribute is used.
-        context : Optional[Mapping[str, Any]], default=None
-            The context to use for evaluating the formula.
-
-        Returns
-        -------
-        WaldTestResult
-            NamedTuple with test statistic, p-value, and degrees of freedom.
+        """
+        Perform a Wald test for a hypothesis specified by constraints
+        given as ``R @ coef_ = r``. Under the null hypothesis, the test statistic
+        follows a chi-squared distribution with ``R.shape[0]`` degrees of freedom.
         """
 
         covariance_matrix = self.covariance_matrix(
@@ -1843,49 +1801,9 @@ def _wald_test_feature_names(
         expected_information=None,
         context: Optional[Mapping[str, Any]] = None,
     ) -> WaldTestResult:
-        """Compute the Wald test statistic and p-value for a linear hypothesis.
-
+        """
         Perform a Wald test for the hypothesis that the coefficients of the
         features in ``features`` are equal to the values in ``values``.
-
-        Parameters
-        ----------
-        features: Union[str, list[str]]
-            The name of a feature or a list of features to test.
-        values: Sequence, optional, default=None
-            The values to which coefficients are compared. If None, the test is
-            for whether the coefficients are zero.
-        X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
-            Training data. Can be omitted if a covariance matrix has already
-            been computed.
-        y : array-like, shape (n_samples,), optional
-            Target values. Can be omitted if a covariance matrix has already
-            been computed.
-        sample_weight : array-like, shape (n_samples,), optional, default=None
-            Individual weights for each sample.
-        offset : array-like, optional, default=None
-            Array with additive offsets.
-        mu : array-like, optional, default=None
-            Array with predictions. Estimated if absent.
-        dispersion : float, optional, default=None
-            The dispersion parameter. Estimated if absent.
-        robust : boolean, optional, default=None
-            Whether to compute robust standard errors instead of normal ones.
-            If not specified, the model's ``robust`` attribute is used.
-        clusters : array-like, optional, default=None
-            Array with cluster membership. Clustered standard errors are
-            computed if clusters is not None.
-        expected_information : boolean, optional, default=None
-            Whether to use the expected or observed information matrix.
-            Only relevant when computing robust standard errors.
-            If not specified, the model's ``expected_information`` attribute is used.
- context : Optional[Mapping[str, Any]], default=None - The context to use for evaluating the formula. - - Returns - ------- - WaldTestResult - NamedTuple with test statistic, p-value, and degrees of freedom. """ if isinstance(features, str): @@ -1942,46 +1860,8 @@ def _wald_test_formula( expected_information=None, context: Optional[Mapping[str, Any]] = None, ) -> WaldTestResult: - """Compute the Wald test statistic and p-value for a linear hypothesis. - + """ Perform a Wald test for the hypothesis described in ``formula``. - - Parameters - ---------- - formula: str - A formula string describing the linear restrictions. For more information, - see `meth:ModelSpec.get_linear_constraints` in ``formulaic``. - X : {array-like, sparse matrix}, shape (n_samples, n_features), optional - Training data. Can be omitted if a covariance matrix has already - been computed. - y : array-like, shape (n_samples,), optional - Target values. Can be omitted if a covariance matrix has already - been computed. - sample_weight : array-like, shape (n_samples,), optional, default=None - Individual weights for each sample. - offset : array-like, optional, default=None - Array with additive offsets. - mu : array-like, optional, default=None - Array with predictions. Estimated if absent. - dispersion : float, optional, default=None - The dispersion parameter. Estimated if absent. - robust : boolean, optional, default=None - Whether to compute robust standard errors instead of normal ones. - If not specified, the model's ``robust`` attribute is used. - clusters : array-like, optional, default=None - Array with cluster membership. Clustered standard errors are - computed if clusters is not None. - expected_information : boolean, optional, default=None - Whether to use the expected or observed information matrix. - Only relevant when computing robust standard errors. - If not specified, the model's ``expected_information`` attribute is used. - context : Optional[Mapping[str, Any]], default=None - The context to use for evaluating the formula. - - Returns - ------- - WaldTestResult - NamedTuple with test statistic, p-value, and degrees of freedom. """ if self.fit_intercept: @@ -2023,54 +1903,9 @@ def _wald_test_term_names( expected_information=None, context: Optional[Mapping[str, Any]] = None, ) -> WaldTestResult: - """Compute the Wald test statistic and p-value for a linear hypotheses. - + """ Perform a Wald test for the hypothesis that the coefficients of the features in ``terms`` are equal to the values in ``terms``. - - Parameters - ---------- - terms : Union[str, list[str]] - The name of a term or a list of terms to test. It can cover one or more - coefficients. In the case of a model based on a formula, a term is one - of the expressions separated by ``+`` signs. Otherwise, a term is one column - in the input data. As categorical variables need not be one-hot encoded in - glum, in their case, the hypothesis to be tested is that the coefficients - of all categories are equal to ``r``. - values: Sequence, optional, default=None - The values to which coefficients are compared. If None, the test is - for whether the coefficients are zero. - X : {array-like, sparse matrix}, shape (n_samples, n_features), optional - Training data. Can be omitted if a covariance matrix has already - been computed. - y : array-like, shape (n_samples,), optional - Target values. Can be omitted if a covariance matrix has already - been computed. 
-        sample_weight : array-like, shape (n_samples,), optional (default=None)
-            Individual weights for each sample.
-        offset : array-like, optional, default=None
-            Array with additive offsets.
-        mu : array-like, optional, default=None
-            Array with predictions. Estimated if absent.
-        dispersion : float, optional, default=None
-            The dispersion parameter. Estimated if absent.
-        robust : boolean, optional, default=None
-            Whether to compute robust standard errors instead of normal ones.
-            If not specified, the model's ``robust`` attribute is used.
-        clusters : array-like, optional, default=None
-            Array with clusters membership. Clustered standard errors are
-            computed if clusters is not None.
-        expected_information : boolean, optional, default=None
-            Whether to use the expected or observed information matrix.
-            Only relevant when computing robust std-errors.
-            If not specified, the model's ``expected_information`` attribute is used.
-        context : Optional[Mapping[str, Any]], default=None
-            The context to use for evaluating the formula.
-
-        Returns
-        -------
-        WaldTestResult
-            NamedTuple with test statistic, p-value and degrees of freedom.
+        """
+        Perform a Wald test for the hypothesis that the coefficients of the
+        features in ``terms`` are equal to the values in ``values``.
         """
 
         if isinstance(terms, str):
@@ -2334,9 +2169,8 @@ def covariance_matrix(
                 "matrix will be incorrect."
             )
 
-        cannot_estimate_cov = X is None or (
-            y is None and not hasattr(self, "y_model_spec_")
-        )
+        cannot_estimate_cov = (y is None) and not hasattr(self, "y_model_spec_")
+        cannot_estimate_cov |= X is None
 
         if not skip_checks:
            if cannot_estimate_cov and self.covariance_matrix_ is None:
@@ -3125,12 +2959,12 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
         If true, then the expected information matrix is computed by default.
         Only relevant when computing robust standard errors.
 
-    formula : FormulaSpec
+    formula : formulaic.FormulaSpec
         A formula accepted by formulaic. It can either be a one-sided formula, in
         which case ``y`` must be specified in ``fit``, or a two-sided formula, in
         which case ``y`` must be ``None``.
 
-    interaction_separator: str, default ":"
+    interaction_separator: str, default=":"
         The separator between the names of interacted variables.
 
     categorical_format : str, optional, default='{name}[{category}]'
diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py
index c84fb6f3..3f75edae 100644
--- a/src/glum/_glm_cv.py
+++ b/src/glum/_glm_cv.py
@@ -256,10 +256,10 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase):
         which case ``y`` must be specified in ``fit``, or a two-sided formula, in
         which case ``y`` must be ``None``.
 
-    interaction_separator: str, default ":"
+    interaction_separator: str, default=":"
         The separator between the names of interacted variables.
 
-    categorical_format: str, default "{name}[T.{category}]"
+    categorical_format: str, default="{name}[T.{category}]"
         The format string used to generate the names of categorical variables.
         Has to include the placeholders ``{name}`` and ``{category}``.
         Only used if ``formula`` is not ``None``.

From 05fd221622d0b43769ce99c3ee172ee6daa50a5c Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher
Date: Fri, 12 Apr 2024 16:58:09 +0200
Subject: [PATCH 49/63] context docstring

---
 src/glum/_glm.py | 88 ++++++++++++++++++++++++++++++------------------
 1 file changed, 55 insertions(+), 33 deletions(-)

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index 0c10ea1d..ec5b6ef8 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -1327,9 +1327,11 @@ def linear_predictor(
             Incompatible with ``alpha_index`` (see above). 
context : Optional[Union[int, Mapping[str, Any]]], default=0 - The context to use for evaluating the formula. If an integer, the - context is taken from the stack frame of the caller at the given - depth. If a dict, it is directly used as the context. + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. Only relevant + if ``self.formula`` is set. Returns ------- @@ -1419,9 +1421,11 @@ def predict( Incompatible with ``alpha_index`` (see above). context : Optional[Union[int, Mapping[str, Any]]], default=0 - The context to use for evaluating the formula. If an integer, the - context is taken from the stack frame of the caller at the given - depth. If a dict, it is directly used as the context. + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. Only relevant + if ``self.formula`` is set. Returns ------- @@ -1494,9 +1498,11 @@ def coef_table( Only relevant when computing robust standard errors. If not specified, the model's ``expected_information`` attribute is used. context : Optional[Union[int, Mapping[str, Any]]], default=0 - The context to use for evaluating the formula. If an integer, the - context is taken from the stack frame of the caller at the given - depth. If a dict, it is directly used as the context. + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. Only relevant + if ``self.formula`` is set. Returns ------- @@ -1625,9 +1631,11 @@ def wald_test( Only relevant when computing robust standard errors. If not specified, the model's ``expected_information`` attribute is used. context : Optional[Union[int, Mapping[str, Any]]], default=0 - The context to use for evaluating the formula. If an integer, the - context is taken from the stack frame of the caller at the given - depth. If a dict, it is directly used as the context. + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. Only relevant + if ``self.formula`` is set. Returns ------- @@ -2008,9 +2016,11 @@ def std_errors( Whether to store the covariance matrix in the model instance. If a covariance matrix has already been stored, it will be overwritten. context : Optional[Union[int, Mapping[str, Any]]], default=0 - The context to use for evaluating the formula. If an integer, the - context is taken from the stack frame of the caller at the given - depth. If a dict, it is directly used as the context. + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. Only relevant + if ``self.formula`` is set. 
""" captured_context = capture_context( context + 1 if isinstance(context, int) else context @@ -2093,9 +2103,11 @@ def covariance_matrix( Whether to skip input validation. For internal use only. context : Optional[Union[int, Mapping[str, Any]]], default=0 - The context to use for evaluating the formula. If an integer, the - context is taken from the stack frame of the caller at the given - depth. If a dict, it is directly used as the context. + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. Only relevant + if ``self.formula`` is set. Notes ----- @@ -2357,9 +2369,11 @@ def score( offset : array-like, shape (n_samples,), optional (default=None) context : Optional[Union[int, Mapping[str, Any]]], default=0 - The context to use for evaluating the formula. If an integer, the - context is taken from the stack frame of the caller at the given - depth. If a dict, it is directly used as the context. + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. Only relevant + if ``self.formula`` is set. Returns ------- @@ -3205,9 +3219,11 @@ def fit( computed if clusters is not None. context : Optional[Union[int, Mapping[str, Any]]], default=0 - The context to use for evaluating the formula. If an integer, the - context is taken from the stack frame of the caller at the given - depth. If a dict, it is directly used as the context. + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. Only relevant + if ``self.formula`` is set. weights_sum: float, optional (default=None) @@ -3512,9 +3528,11 @@ def aic( Same data as used in 'fit' context : Optional[Union[int, Mapping[str, Any]]], default=0 - The context to use for evaluating the formula. If an integer, the - context is taken from the stack frame of the caller at the given - depth. If a dict, it is directly used as the context. + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. Only relevant + if ``self.formula`` is set. """ captured_context = capture_context( context + 1 if isinstance(context, int) else context @@ -3553,9 +3571,11 @@ def aicc( Same data as used in 'fit' context : Optional[Union[int, Mapping[str, Any]]], default=0 - The context to use for evaluating the formula. If an integer, the - context is taken from the stack frame of the caller at the given - depth. If a dict, it is directly used as the context. + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. Only relevant + if ``self.formula`` is set. 
""" captured_context = capture_context( context + 1 if isinstance(context, int) else context @@ -3598,9 +3618,11 @@ def bic( Same data as used in 'fit' context : Optional[Union[int, Mapping[str, Any]]], default=0 - The context to use for evaluating the formula. If an integer, the - context is taken from the stack frame of the caller at the given - depth. If a dict, it is directly used as the context. + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. Only relevant + if ``self.formula`` is set. """ captured_context = capture_context( context + 1 if isinstance(context, int) else context From 2acdcbfc32395937a88fa344074a4fc6e5af189a Mon Sep 17 00:00:00 2001 From: lbittarello Date: Fri, 12 Apr 2024 16:39:00 +0100 Subject: [PATCH 50/63] kwargs --- src/glum/_glm.py | 188 +++++++---------------------------------------- 1 file changed, 25 insertions(+), 163 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index ec5b6ef8..54c42f6a 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -1661,106 +1661,41 @@ def wald_test( context + 1 if isinstance(context, int) else context ) - if R is not None: - return self._wald_test_matrix( - R=R, - r=r, - X=X, - y=y, - sample_weight=sample_weight, - offset=offset, - mu=mu, - dispersion=dispersion, - robust=robust, - clusters=clusters, - expected_information=expected_information, - context=captured_context, - ) + kwargs = { + "X": X, + "y": y, + "sample_weight": sample_weight, + "offset": offset, + "mu": mu, + "dispersion": dispersion, + "robust": robust, + "clusters": clusters, + "expected_information": expected_information, + "context": captured_context, + } + if R is not None: + return self._wald_test_matrix(R=R, r=r, **kwargs) if features is not None: - return self._wald_test_feature_names( - features=features, - values=r, - X=X, - y=y, - sample_weight=sample_weight, - offset=offset, - mu=mu, - dispersion=dispersion, - robust=robust, - clusters=clusters, - expected_information=expected_information, - context=captured_context, - ) - + return self._wald_test_feature_names(features=features, values=r, **kwargs) if terms is not None: - return self._wald_test_term_names( - terms=terms, - values=r, - X=X, - y=y, - sample_weight=sample_weight, - offset=offset, - mu=mu, - dispersion=dispersion, - robust=robust, - clusters=clusters, - expected_information=expected_information, - context=captured_context, - ) - + return self._wald_test_term_names(terms=terms, values=r, **kwargs) if formula is not None: if r is not None: raise ValueError("Cannot specify both formula and r") - return self._wald_test_formula( - formula=formula, - X=X, - y=y, - sample_weight=sample_weight, - offset=offset, - mu=mu, - dispersion=dispersion, - robust=robust, - clusters=clusters, - expected_information=expected_information, - context=captured_context, - ) + return self._wald_test_formula(formula=formula, **kwargs) raise RuntimeError("This should never happen") def _wald_test_matrix( - self, - R: np.ndarray, - r: Optional[np.ndarray] = None, - X=None, - y=None, - sample_weight=None, - offset=None, - mu=None, - dispersion=None, - robust=None, - clusters: np.ndarray = None, - expected_information=None, - context: Optional[Mapping[str, Any]] = None, + self, R: np.ndarray, r: Optional[np.ndarray] = None, **kwargs ) -> WaldTestResult: """ Perform a Wald test statistic for a hypothesis 
specified by constraints given as ``R @ coef_ = r``. Under the null hypothesis, the test statistic follows a chi-squared distribution with ``R.shape[0]`` degrees of freedom. """ - - covariance_matrix = self.covariance_matrix( - X=X, - y=y, - sample_weight=sample_weight, - offset=offset, - mu=mu, - dispersion=dispersion, - robust=robust, - clusters=clusters, - expected_information=expected_information, - context=context, - ) + covariance_matrix = self.covariance_matrix(**kwargs) if self.fit_intercept: beta = np.concatenate([[self.intercept_], self.coef_]) @@ -1798,16 +1733,7 @@ def _wald_test_feature_names( self, features: Union[str, list[str]], values: Optional[Sequence] = None, - X=None, - y=None, - sample_weight=None, - offset=None, - mu=None, - dispersion=None, - robust=None, - clusters: np.ndarray = None, - expected_information=None, - context: Optional[Mapping[str, Any]] = None, + **kwargs, ) -> WaldTestResult: """ Perform a Wald test for the hypothesis that the coefficients of the @@ -1839,35 +1765,9 @@ def _wald_test_feature_names( raise ValueError(f"feature {feature} is not in the model") from None R[i, j] = 1 - return self._wald_test_matrix( - R=R, - r=r, - X=X, - y=y, - sample_weight=sample_weight, - offset=offset, - mu=mu, - dispersion=dispersion, - robust=robust, - clusters=clusters, - expected_information=expected_information, - context=context, - ) + return self._wald_test_matrix(R=R, r=r, **kwargs) - def _wald_test_formula( - self, - formula: str, - X=None, - y=None, - sample_weight=None, - offset=None, - mu=None, - dispersion=None, - robust=None, - clusters: np.ndarray = None, - expected_information=None, - context: Optional[Mapping[str, Any]] = None, - ) -> WaldTestResult: + def _wald_test_formula(self, formula: str, **kwargs) -> WaldTestResult: """ Perform a Wald test for the hypothesis described in ``formula``. 
""" @@ -1881,35 +1781,10 @@ def _wald_test_formula( R, r = parser.get_matrix(formula) - return self._wald_test_matrix( - R=R, - r=r, - X=X, - y=y, - sample_weight=sample_weight, - offset=offset, - mu=mu, - dispersion=dispersion, - robust=robust, - clusters=clusters, - expected_information=expected_information, - context=context, - ) + return self._wald_test_matrix(R=R, r=r, **kwargs) def _wald_test_term_names( - self, - terms: Union[str, list[str]], - values: Optional[Sequence] = None, - X=None, - y=None, - sample_weight=None, - offset=None, - mu=None, - dispersion=None, - robust=None, - clusters: np.ndarray = None, - expected_information=None, - context: Optional[Mapping[str, Any]] = None, + self, terms: Union[str, list[str]], values: Optional[Sequence] = None, **kwargs ) -> WaldTestResult: """ Perform a Wald test for the hypothesis that the coefficients of the @@ -1951,20 +1826,7 @@ def _wald_test_term_names( R = np.vstack(R_list) r = np.concatenate(r_list) if rhs else None - return self._wald_test_matrix( - R=R, - r=r, - X=X, - y=y, - sample_weight=sample_weight, - offset=offset, - mu=mu, - dispersion=dispersion, - robust=robust, - clusters=clusters, - expected_information=expected_information, - context=context, - ) + return self._wald_test_matrix(R=R, r=r, **kwargs) def std_errors( self, From 82cb60c93496a2dde87e6485dd0e25da9d6f4424 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Mon, 15 Apr 2024 13:23:17 +0200 Subject: [PATCH 51/63] no context as default; small cleanups --- .../formula_interface/formula_interface.ipynb | 146 ++++++++++++++---- src/glum/_glm.py | 90 +++++------ src/glum/_glm_cv.py | 7 + tests/glm/test_glm.py | 18 ++- 4 files changed, 180 insertions(+), 81 deletions(-) diff --git a/docs/tutorials/formula_interface/formula_interface.ipynb b/docs/tutorials/formula_interface/formula_interface.ipynb index acdf50ea..b131bd98 100644 --- a/docs/tutorials/formula_interface/formula_interface.ipynb +++ b/docs/tutorials/formula_interface/formula_interface.ipynb @@ -972,15 +972,13 @@ "\n", "The previous example is only scratching the surface of what formulas are capable of. For example, they are capable of evaluating arbitrary Python expressions, which act as if they saw the columns of the input data frame as local variables (`pandas.Series`). The way to tell `glum` that a part of the formula should be evaluated as a Python expression before applying the formula grammar to it is to enclose it in curly braces. As an example, we can easily do the following within the formula itself:\n", "\n", - " - Create the outcome variable on the fly instead of doing it beforehand.\n", - " - Include the logarithm of a certain variable in the model.1\n", - " - Include a basis spline interpolation of a variable to capture non-linearities in its effect.2\n", + " 1. Create the outcome variable on the fly instead of doing it beforehand.\n", + " 2. Include the logarithm of a certain variable in the model.\n", + " 3. Include a basis spline interpolation of a variable to capture non-linearities in its effect.\n", "\n", - "Let's try it out!\n", + "1\\. works because because formulas can contain [Python operations](https://matthewwardrop.github.io/formulaic/guides/grammar/). 2. and 3. work because formulas are evaluated within a context that is aware of a number of [transforms](https://matthewwardrop.github.io/formulaic/guides/transforms/). To be precise, 2. is a regular transform and 3. 
is a stateful transform.\n", "\n", - "1: This works because formulas can include variables from the local scope, such as the imported `numpy` namespace. (Even more precisely, certain often-used `numpy` functions are special-cased, so the curly braces are not even strictly necessary here.)\n", - "\n", - "2: `bs` is one of the several built-in `formulaic` functions that aim to simplify preprocessing steps. You can learn more about them [in `formulaic`'s docs](https://matthewwardrop.github.io/formulaic/guides/transforms/)." + "Let's try it out!" ] }, { @@ -1092,21 +1090,101 @@ "source": [ "formula_fun = (\n", " \"{ClaimAmountCut / Exposure} ~ VehBrand + VehGas + Region + Area\"\n", - " \" + DrivAge + VehAge + VehPower + bs(BonusMalus, 3) + {np.log(Density)}\"\n", + " \" + DrivAge + VehAge + VehPower + bs(BonusMalus, 3) + np.log(Density)\"\n", ")\n", "\n", - "t_glm2 = GeneralizedLinearRegressor(\n", + "t_glm5 = GeneralizedLinearRegressor(\n", " family=TweedieDist,\n", " alpha_search=True,\n", " l1_ratio=1,\n", " fit_intercept=True,\n", " formula=formula_fun,\n", ")\n", - "t_glm2.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n", + "t_glm5.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm5.intercept_], t_glm5.coef_))},\n", + " index=[\"intercept\"] + t_glm5.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To allow for even more flexibility, you can add custom transformations that are defined in the context from which the call is made. E.g., we can define a transformation that takes the logarithm of ``VehAge + 1`` after casting it to numeric. To make the formula recognize this transform, you need to explicitly set ``context=0`` when calling the fit method (note that this differs from ``formulaic``'s default, which is already ``context=0``)." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
intercept_log_plus_one(VehAge)
coefficient5.046712-0.151043
\n", + "
" + ], + "text/plain": [ + " intercept _log_plus_one(VehAge)\n", + "coefficient 5.046712 -0.151043" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def _log_plus_one(x):\n", + " return np.log(pd.to_numeric(x) + 1)\n", + "\n", + "formula_custom_fun = (\n", + " \"{ClaimAmountCut / Exposure} ~ _log_plus_one(VehAge)\"\n", + ")\n", + "\n", + "t_glm6 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula_custom_fun,\n", + ")\n", + "t_glm6.fit(df_train, sample_weight=df[\"Exposure\"].values[train], context=0)\n", "\n", "pd.DataFrame(\n", - " {\"coefficient\": np.concatenate(([t_glm2.intercept_], t_glm2.coef_))},\n", - " index=[\"intercept\"] + t_glm2.feature_names_,\n", + " {\"coefficient\": np.concatenate(([t_glm6.intercept_], t_glm6.coef_))},\n", + " index=[\"intercept\"] + t_glm6.feature_names_,\n", ").T" ] }, @@ -1128,7 +1206,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -1230,7 +1308,7 @@ "[1 rows x 56 columns]" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1238,7 +1316,7 @@ "source": [ "formula_name = \"PurePremium ~ DrivAge * VehPower\"\n", "\n", - "t_glm5 = GeneralizedLinearRegressor(\n", + "t_glm7 = GeneralizedLinearRegressor(\n", " family=TweedieDist,\n", " alpha_search=True,\n", " l1_ratio=1,\n", @@ -1247,11 +1325,11 @@ " interaction_separator=\"__x__\",\n", " categorical_format=\"{name}__{category}\",\n", ")\n", - "t_glm5.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n", + "t_glm7.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n", "\n", "pd.DataFrame(\n", - " {\"coefficient\": np.concatenate(([t_glm5.intercept_], t_glm5.coef_))},\n", - " index=[\"intercept\"] + t_glm5.feature_names_,\n", + " {\"coefficient\": np.concatenate(([t_glm7.intercept_], t_glm7.coef_))},\n", + " index=[\"intercept\"] + t_glm7.feature_names_,\n", ").T" ] }, @@ -1266,14 +1344,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "formula_noint = \"PurePremium ~ DrivAge * VehPower - 1\"\n", "\n", "with pytest.raises(ValueError, match=\"The formula sets the intercept to False\"):\n", - " t_glm6 = GeneralizedLinearRegressor(\n", + " t_glm8 = GeneralizedLinearRegressor(\n", " family=TweedieDist,\n", " alpha_search=True,\n", " l1_ratio=1,\n", @@ -1281,7 +1359,7 @@ " formula=formula_noint,\n", " interaction_separator=\"__x__\",\n", " categorical_format=\"{name}__{category}\",\n", - " )" + " ).fit(df_train, sample_weight=df[\"Exposure\"].values[train])" ] }, { @@ -1295,7 +1373,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1397,7 +1475,7 @@ "[1 rows x 56 columns]" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1405,7 +1483,7 @@ "source": [ "formula_onesie = \"DrivAge * VehPower\"\n", "\n", - "t_glm7 = GeneralizedLinearRegressor(\n", + "t_glm8 = GeneralizedLinearRegressor(\n", " family=TweedieDist,\n", " alpha_search=True,\n", " l1_ratio=1,\n", @@ -1414,13 +1492,13 @@ " interaction_separator=\"__x__\",\n", " categorical_format=\"{name}__{category}\",\n", ")\n", - "t_glm7.fit(\n", + "t_glm8.fit(\n", " X=df_train, y=df_train[\"PurePremium\"], sample_weight=df[\"Exposure\"].values[train]\n", ")\n", "\n", 
"pd.DataFrame(\n", - " {\"coefficient\": np.concatenate(([t_glm7.intercept_], t_glm7.coef_))},\n", - " index=[\"intercept\"] + t_glm7.feature_names_,\n", + " {\"coefficient\": np.concatenate(([t_glm8.intercept_], t_glm8.coef_))},\n", + " index=[\"intercept\"] + t_glm8.feature_names_,\n", ").T" ] }, @@ -1437,7 +1515,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1540,7 +1618,7 @@ "coefficient 4.970188 " ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1548,7 +1626,7 @@ "source": [ "formula_missing = \"C(DrivAge, missing_method='zero') + C(VehPower, missing_method='convert')\"\n", "\n", - "t_glm8 = GeneralizedLinearRegressor(\n", + "t_glm9 = GeneralizedLinearRegressor(\n", " family=TweedieDist,\n", " alpha_search=True,\n", " l1_ratio=1,\n", @@ -1556,13 +1634,13 @@ " formula=formula_missing,\n", "\n", ")\n", - "t_glm8.fit(\n", + "t_glm9.fit(\n", " X=df_train, y=df_train[\"PurePremium\"], sample_weight=df[\"Exposure\"].values[train]\n", ")\n", "\n", "pd.DataFrame(\n", - " {\"coefficient\": np.concatenate(([t_glm8.intercept_], t_glm8.coef_))},\n", - " index=[\"intercept\"] + t_glm8.feature_names_,\n", + " {\"coefficient\": np.concatenate(([t_glm9.intercept_], t_glm9.coef_))},\n", + " index=[\"intercept\"] + t_glm9.feature_names_,\n", ").T" ] } @@ -1583,7 +1661,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.2" }, "orig_nbformat": 4 }, diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 54c42f6a..924358df 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -1301,7 +1301,7 @@ def linear_predictor( *, alpha_index: Optional[Union[int, Sequence[int]]] = None, alpha: Optional[Union[float, Sequence[float]]] = None, - context: Optional[Union[int, Mapping[str, Any]]] = 0, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """Compute the linear predictor, ``X * coef_ + intercept_``. @@ -1326,12 +1326,12 @@ def linear_predictor( Sets the alpha(s) to use in case ``alpha_search`` is ``True``. Incompatible with ``alpha_index`` (see above). - context : Optional[Union[int, Mapping[str, Any]]], default=0 + context : Optional[Union[int, Mapping[str, Any]]], default=None The context to add to the evaluation context of the formula with, e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a - mapping from variable names to values is expected. Only relevant - if ``self.formula`` is set. + mapping from variable names to values is expected. By default, + no context is added. Returns ------- @@ -1392,7 +1392,7 @@ def predict( *, alpha_index: Optional[Union[int, Sequence[int]]] = None, alpha: Optional[Union[float, Sequence[float]]] = None, - context: Optional[Union[int, Mapping[str, Any]]] = 0, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """Predict using GLM with feature matrix ``X``. @@ -1420,12 +1420,12 @@ def predict( Sets the alpha(s) to use in case ``alpha_search`` is ``True``. Incompatible with ``alpha_index`` (see above). - context : Optional[Union[int, Mapping[str, Any]]], default=0 + context : Optional[Union[int, Mapping[str, Any]]], default=None The context to add to the evaluation context of the formula with, e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a - mapping from variable names to values is expected. 
Only relevant - if ``self.formula`` is set. + mapping from variable names to values is expected. By default, + no context is added. Returns ------- @@ -1462,7 +1462,7 @@ def coef_table( robust=None, clusters: np.ndarray = None, expected_information=None, - context: Optional[Union[int, Mapping[str, Any]]] = 0, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """Get a table of of the regression coefficients. @@ -1497,12 +1497,12 @@ def coef_table( Whether to use the expected or observed information matrix. Only relevant when computing robust standard errors. If not specified, the model's ``expected_information`` attribute is used. - context : Optional[Union[int, Mapping[str, Any]]], default=0 + context : Optional[Union[int, Mapping[str, Any]]], default=None The context to add to the evaluation context of the formula with, e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a - mapping from variable names to values is expected. Only relevant - if ``self.formula`` is set. + mapping from variable names to values is expected. By default, + no context is added. Returns ------- @@ -1572,7 +1572,7 @@ def wald_test( robust=None, clusters: np.ndarray = None, expected_information=None, - context: Optional[Union[int, Mapping[str, Any]]] = 0, + context: Optional[Union[int, Mapping[str, Any]]] = None, ) -> WaldTestResult: """Compute the Wald test statistic and p-value for a linear hypothesis. @@ -1630,12 +1630,12 @@ def wald_test( Whether to use the expected or observed information matrix. Only relevant when computing robust standard errors. If not specified, the model's ``expected_information`` attribute is used. - context : Optional[Union[int, Mapping[str, Any]]], default=0 + context : Optional[Union[int, Mapping[str, Any]]], default=None The context to add to the evaluation context of the formula with, e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a - mapping from variable names to values is expected. Only relevant - if ``self.formula`` is set. + mapping from variable names to values is expected. By default, + no context is added. Returns ------- @@ -1841,7 +1841,7 @@ def std_errors( clusters: np.ndarray = None, expected_information=None, store_covariance_matrix=False, - context: Optional[Union[int, Mapping[str, Any]]] = 0, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """Calculate standard errors for generalized linear models. @@ -1877,12 +1877,12 @@ def std_errors( store_covariance_matrix : boolean, optional, default=False Whether to store the covariance matrix in the model instance. If a covariance matrix has already been stored, it will be overwritten. - context : Optional[Union[int, Mapping[str, Any]]], default=0 + context : Optional[Union[int, Mapping[str, Any]]], default=None The context to add to the evaluation context of the formula with, e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a - mapping from variable names to values is expected. Only relevant - if ``self.formula`` is set. + mapping from variable names to values is expected. By default, + no context is added. 
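
The new default is easiest to see end to end. Below is a minimal sketch of the documented behavior, with hypothetical data and column names; it mirrors the regression test added later in this series:

```python
import numpy as np
import pandas as pd
import formulaic
import pytest
from glum import GeneralizedLinearRegressor

df = pd.DataFrame({"y": [0.1, 1.2, 0.7, 2.3], "x1": [1.0, 2.0, 3.0, 4.0]})
x_context = np.arange(len(df), dtype=float)  # lives only in the calling scope

model = GeneralizedLinearRegressor(formula="y ~ x1 + x_context")

# With context=None (the new default), the formula only sees columns of df,
# so evaluating `x_context` fails.
with pytest.raises(formulaic.errors.FactorEvaluationError):
    model.fit(df)

# context=0 captures the caller's scope, so `x_context` now resolves.
model.fit(df, context=0)
```
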
""" captured_context = capture_context( context + 1 if isinstance(context, int) else context @@ -1918,7 +1918,7 @@ def covariance_matrix( expected_information=None, store_covariance_matrix=False, skip_checks=False, - context: Optional[Union[int, Mapping[str, Any]]] = 0, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """Calculate the covariance matrix for generalized linear models. @@ -1964,12 +1964,12 @@ def covariance_matrix( skip_checks : boolean, optional, default=False Whether to skip input validation. For internal use only. - context : Optional[Union[int, Mapping[str, Any]]], default=0 + context : Optional[Union[int, Mapping[str, Any]]], default=None The context to add to the evaluation context of the formula with, e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a - mapping from variable names to values is expected. Only relevant - if ``self.formula`` is set. + mapping from variable names to values is expected. By default, + no context is added. Notes ----- @@ -2203,7 +2203,7 @@ def score( sample_weight: Optional[ArrayLike] = None, offset: Optional[ArrayLike] = None, *, - context: Optional[Union[int, Mapping[str, Any]]] = 0, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """Compute :math:`D^2`, the percentage of deviance explained. @@ -2230,12 +2230,12 @@ def score( offset : array-like, shape (n_samples,), optional (default=None) - context : Optional[Union[int, Mapping[str, Any]]], default=0 + context : Optional[Union[int, Mapping[str, Any]]], default=None The context to add to the evaluation context of the formula with, e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a - mapping from variable names to values is expected. Only relevant - if ``self.formula`` is set. + mapping from variable names to values is expected. By default, + no context is added. Returns ------- @@ -3037,7 +3037,7 @@ def fit( clusters: Optional[np.ndarray] = None, # TODO: take out weights_sum (or use it properly) weights_sum: Optional[float] = None, - context: Optional[Union[int, Mapping[str, Any]]] = 0, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """Fit a Generalized Linear Model. @@ -3080,12 +3080,12 @@ def fit( Array with cluster membership. Clustered standard errors are computed if clusters is not None. - context : Optional[Union[int, Mapping[str, Any]]], default=0 + context : Optional[Union[int, Mapping[str, Any]]], default=None The context to add to the evaluation context of the formula with, e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a - mapping from variable names to values is expected. Only relevant - if ``self.formula`` is set. + mapping from variable names to values is expected. By default, + no context is added. weights_sum: float, optional (default=None) @@ -3368,7 +3368,7 @@ def aic( y: ArrayLike, sample_weight: Optional[ArrayLike] = None, *, - context: Optional[Union[int, Mapping[str, Any]]] = 0, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """ Akaike's information criteria. 
Computed as: @@ -3387,14 +3387,14 @@ def aic( Same data as used in 'fit' sample_weight : array-like, shape (n_samples,), optional (default=None) - Same data as used in 'fit' + Same data as used in 'fit' - context : Optional[Union[int, Mapping[str, Any]]], default=0 + context : Optional[Union[int, Mapping[str, Any]]], default=None The context to add to the evaluation context of the formula with, e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a - mapping from variable names to values is expected. Only relevant - if ``self.formula`` is set. + mapping from variable names to values is expected. By default, + no context is added. """ captured_context = capture_context( context + 1 if isinstance(context, int) else context @@ -3409,7 +3409,7 @@ def aicc( y: ArrayLike, sample_weight: Optional[ArrayLike] = None, *, - context: Optional[Union[int, Mapping[str, Any]]] = 0, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """ Second-order Akaike's information criteria (or small sample AIC). @@ -3432,12 +3432,12 @@ def aicc( sample_weight : array-like, shape (n_samples,), optional (default=None) Same data as used in 'fit' - context : Optional[Union[int, Mapping[str, Any]]], default=0 + context : Optional[Union[int, Mapping[str, Any]]], default=None The context to add to the evaluation context of the formula with, e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a - mapping from variable names to values is expected. Only relevant - if ``self.formula`` is set. + mapping from variable names to values is expected. By default, + no context is added. """ captured_context = capture_context( context + 1 if isinstance(context, int) else context @@ -3457,7 +3457,7 @@ def bic( y: ArrayLike, sample_weight: Optional[ArrayLike] = None, *, - context: Optional[Union[int, Mapping[str, Any]]] = 0, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """ Bayesian information criterion. Computed as: @@ -3479,12 +3479,12 @@ def bic( sample_weight : array-like, shape (n_samples,), optional (default=None) Same data as used in 'fit' - context : Optional[Union[int, Mapping[str, Any]]], default=0 + context : Optional[Union[int, Mapping[str, Any]]], default=None The context to add to the evaluation context of the formula with, e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a - mapping from variable names to values is expected. Only relevant - if ``self.formula`` is set. + mapping from variable names to values is expected. By default, + no context is added. """ captured_context = capture_context( context + 1 if isinstance(context, int) else context diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index 3f75edae..07fabcce 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -471,6 +471,13 @@ def fit( Array with cluster membership. Clustered standard errors are computed if clusters is not None. + context : Optional[Union[int, Mapping[str, Any]]], default=None + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. By default, + no context is added. 
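
The mapping form sidesteps frame capture entirely, which is usually the safer choice in library code. A short sketch under the same assumptions (hypothetical data; ``extra_var`` exists nowhere except in the mapping):

```python
import numpy as np
import pandas as pd
from glum import GeneralizedLinearRegressorCV

rng = np.random.default_rng(0)
df = pd.DataFrame({"x1": rng.normal(size=50)})
df["y"] = 1.0 + 0.5 * df["x1"] + rng.normal(scale=0.1, size=50)

# Pass extra formula variables explicitly instead of capturing a stack frame.
model = GeneralizedLinearRegressorCV(formula="y ~ x1 + extra_var")
model.fit(df, context={"extra_var": (df["x1"] ** 2).to_numpy()})
```
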
+ """ self._validate_hyperparameters() diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 4645f2c2..c77a4bbd 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -3173,12 +3173,24 @@ def test_formula_context(get_mixed_data): data = get_mixed_data x_context = np.arange(len(data), dtype=float) # noqa: F841 formula = "y ~ x1 + x2 + x_context" + model_formula = GeneralizedLinearRegressor( family="normal", drop_first=True, formula=formula, fit_intercept=True, - ).fit(data) + ) + # default is to add nothing to context + with pytest.raises(formulaic.errors.FactorEvaluationError): + model_formula.fit(data) + + # set context to 0 to capture calling scope + model_formula = GeneralizedLinearRegressor( + family="normal", + drop_first=True, + formula=formula, + fit_intercept=True, + ).fit(data, context=0) model_smf = smf.glm(formula, data, family=sm.families.Gaussian()).fit() @@ -3186,7 +3198,9 @@ def test_formula_context(get_mixed_data): np.concatenate([[model_formula.intercept_], model_formula.coef_]), model_smf.params, ) - np.testing.assert_almost_equal(model_formula.predict(data), model_smf.predict(data)) + np.testing.assert_almost_equal( + model_formula.predict(data, context=0), model_smf.predict(data) + ) @pytest.mark.parametrize( From 517522b9a96d82ff6a4d48d32cfba4d0c6708660 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Mon, 15 Apr 2024 16:06:16 +0200 Subject: [PATCH 52/63] add explanation to get calling scope --- src/glum/_glm.py | 33 ++++++++++++++++++++++----------- src/glum/_glm_cv.py | 3 ++- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 924358df..d65b6269 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -1331,7 +1331,8 @@ def linear_predictor( e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a mapping from variable names to values is expected. By default, - no context is added. + no context is added. Set ``context=0`` to make the calling scope + available. Returns ------- @@ -1425,7 +1426,8 @@ def predict( e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a mapping from variable names to values is expected. By default, - no context is added. + no context is added. Set ``context=0`` to make the calling scope + available. Returns ------- @@ -1502,7 +1504,8 @@ def coef_table( e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a mapping from variable names to values is expected. By default, - no context is added. + no context is added. Set ``context=0`` to make the calling scope + available. Returns ------- @@ -1635,7 +1638,8 @@ def wald_test( e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a mapping from variable names to values is expected. By default, - no context is added. + no context is added. Set ``context=0`` to make the calling scope + available. Returns ------- @@ -1882,7 +1886,8 @@ def std_errors( e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a mapping from variable names to values is expected. By default, - no context is added. + no context is added. Set ``context=0`` to make the calling scope + available. 
""" captured_context = capture_context( context + 1 if isinstance(context, int) else context @@ -1969,7 +1974,8 @@ def covariance_matrix( e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a mapping from variable names to values is expected. By default, - no context is added. + no context is added. Set ``context=0`` to make the calling scope + available. Notes ----- @@ -2235,7 +2241,8 @@ def score( e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a mapping from variable names to values is expected. By default, - no context is added. + no context is added. Set ``context=0`` to make the calling scope + available. Returns ------- @@ -3085,7 +3092,8 @@ def fit( e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a mapping from variable names to values is expected. By default, - no context is added. + no context is added. Set ``context=0`` to make the calling scope + available. weights_sum: float, optional (default=None) @@ -3394,7 +3402,8 @@ def aic( e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a mapping from variable names to values is expected. By default, - no context is added. + no context is added. Set ``context=0`` to make the calling scope + available. """ captured_context = capture_context( context + 1 if isinstance(context, int) else context @@ -3437,7 +3446,8 @@ def aicc( e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a mapping from variable names to values is expected. By default, - no context is added. + no context is added. Set ``context=0`` to make the calling scope + available. """ captured_context = capture_context( context + 1 if isinstance(context, int) else context @@ -3484,7 +3494,8 @@ def bic( e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a mapping from variable names to values is expected. By default, - no context is added. + no context is added. Set ``context=0`` to make the calling scope + available. """ captured_context = capture_context( context + 1 if isinstance(context, int) else context diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index 07fabcce..8a239c83 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -476,7 +476,8 @@ def fit( e.g., custom transforms. If an integer, the context is taken from the stack frame of the caller at the given depth. Otherwise, a mapping from variable names to values is expected. By default, - no context is added. + no context is added. Set ``context=0`` to make the calling scope + available. 
""" self._validate_hyperparameters() From a121dbe52e01039b9540385e77c7a59cdc189d1d Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Tue, 23 Apr 2024 13:07:47 +0200 Subject: [PATCH 53/63] adjust to tabmat release --- .github/workflows/conda-build.yml | 14 +++++++------- .github/workflows/daily.yml | 2 +- conda.recipe/meta.yaml | 2 +- environment-benchmark.yml | 1 - environment.yml | 3 +-- setup.py | 2 +- 6 files changed, 11 insertions(+), 13 deletions(-) diff --git a/.github/workflows/conda-build.yml b/.github/workflows/conda-build.yml index 57f2040b..8762a3ec 100644 --- a/.github/workflows/conda-build.yml +++ b/.github/workflows/conda-build.yml @@ -20,13 +20,13 @@ jobs: fail-fast: false matrix: include: - - { conda_build_yml: linux_64_python3.9.____cpython, os: ubuntu-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge' } - - { conda_build_yml: linux_64_python3.12.____cpython, os: ubuntu-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge' } - - { conda_build_yml: osx_64_python3.9.____cpython, os: macos-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge' } - - { conda_build_yml: osx_64_python3.12.____cpython, os: macos-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge' } - - { conda_build_yml: osx_arm64_python3.10.____cpython, os: macos-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge --no-test' } - - { conda_build_yml: win_64_python3.9.____cpython, os: windows-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge' } - - { conda_build_yml: win_64_python3.12.____cpython, os: windows-latest, conda-build-args: ' -c conda-forge/label/tabmat_dev -c conda-forge' } + - { conda_build_yml: linux_64_python3.9.____cpython, os: ubuntu-latest, conda-build-args: '' } + - { conda_build_yml: linux_64_python3.12.____cpython, os: ubuntu-latest, conda-build-args: '' } + - { conda_build_yml: osx_64_python3.9.____cpython, os: macos-latest, conda-build-args: '' } + - { conda_build_yml: osx_64_python3.12.____cpython, os: macos-latest, conda-build-args: '' } + - { conda_build_yml: osx_arm64_python3.10.____cpython, os: macos-latest, conda-build-args: '--no-test' } + - { conda_build_yml: win_64_python3.9.____cpython, os: windows-latest, conda-build-args: '' } + - { conda_build_yml: win_64_python3.12.____cpython, os: windows-latest, conda-build-args: '' } steps: - name: Checkout branch uses: actions/checkout@v4 diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index 85f43c4f..4d1b1ba1 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -48,7 +48,7 @@ jobs: pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ --prefer-binary --pre --no-deps pyarrow echo Install tabmat nightly micromamba remove -y --force tabmat - pip install --no-use-pep517 --no-deps git+https://github.com/Quantco/tabmat@tabmat-v4 + pip install --no-use-pep517 --no-deps git+https://github.com/Quantco/tabmat - name: Install repository shell: bash -el {0} run: pip install --no-use-pep517 --no-deps --disable-pip-version-check -e . 
diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 35218f7c..01109295 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -36,7 +36,7 @@ requirements: - scikit-learn >=0.23 - scipy - formulaic >=0.6 - - tabmat >=4.0.0a3 + - tabmat >=4.0.0 test: requires: diff --git a/environment-benchmark.yml b/environment-benchmark.yml index 3dd449de..6c189af5 100644 --- a/environment-benchmark.yml +++ b/environment-benchmark.yml @@ -1,6 +1,5 @@ name: glum channels: - - conda-forge/label/tabmat_dev - conda-forge - nodefaults dependencies: diff --git a/environment.yml b/environment.yml index d0d7d172..5de79568 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,5 @@ name: glum channels: - - conda-forge/label/tabmat_dev - conda-forge - nodefaults dependencies: @@ -9,7 +8,7 @@ dependencies: - libblas>=0=*mkl # comment this line out for macOS arm64 - numexpr - pandas>=0.21 - - tabmat>=4.0.0a3 + - tabmat>=4.0.0 - scikit-learn>=0.23 - scipy - tqdm diff --git a/setup.py b/setup.py index a647611e..0b061cc8 100644 --- a/setup.py +++ b/setup.py @@ -87,7 +87,7 @@ "scikit-learn>=0.23", "scipy", "formulaic>=0.6", - "tabmat>=4.0.0a3", + "tabmat>=4.0.0", ], entry_points=( None From 44125508a7a29b7c3bfd265d31fafb155046e817 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Tue, 23 Apr 2024 13:08:58 +0200 Subject: [PATCH 54/63] keep whitespace --- .github/workflows/conda-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/conda-build.yml b/.github/workflows/conda-build.yml index 8762a3ec..ef937200 100644 --- a/.github/workflows/conda-build.yml +++ b/.github/workflows/conda-build.yml @@ -24,7 +24,7 @@ jobs: - { conda_build_yml: linux_64_python3.12.____cpython, os: ubuntu-latest, conda-build-args: '' } - { conda_build_yml: osx_64_python3.9.____cpython, os: macos-latest, conda-build-args: '' } - { conda_build_yml: osx_64_python3.12.____cpython, os: macos-latest, conda-build-args: '' } - - { conda_build_yml: osx_arm64_python3.10.____cpython, os: macos-latest, conda-build-args: '--no-test' } + - { conda_build_yml: osx_arm64_python3.10.____cpython, os: macos-latest, conda-build-args: ' --no-test' } - { conda_build_yml: win_64_python3.9.____cpython, os: windows-latest, conda-build-args: '' } - { conda_build_yml: win_64_python3.12.____cpython, os: windows-latest, conda-build-args: '' } steps: From 18d5b0e490cfaeb68b6d3d1823e3bbb6324f43c2 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 26 Apr 2024 10:19:53 +0200 Subject: [PATCH 55/63] temporarily add tabmat_dev channel again to investigate env solving failure on CI --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 5de79568..f97b4a0a 100644 --- a/environment.yml +++ b/environment.yml @@ -1,5 +1,6 @@ name: glum channels: + - conda-forge/label/tabmat_dev - conda-forge - nodefaults dependencies: From aeeb19e5ef0ffccbd014a8722b285c00c527d428 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 26 Apr 2024 10:23:47 +0200 Subject: [PATCH 56/63] remove tabmat_dev channel again --- environment.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environment.yml b/environment.yml index f97b4a0a..5de79568 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,5 @@ name: glum channels: - - conda-forge/label/tabmat_dev - conda-forge - nodefaults dependencies: From 74550c33299a64dacaca03fd016a312dd5d50e15 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 26 Apr 2024 16:37:42 +0200 
Subject: [PATCH 57/63] for now, disable conda build test on osx and Python 3.12 --- .github/workflows/conda-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/conda-build.yml b/.github/workflows/conda-build.yml index ef937200..8b8756f3 100644 --- a/.github/workflows/conda-build.yml +++ b/.github/workflows/conda-build.yml @@ -23,7 +23,7 @@ jobs: - { conda_build_yml: linux_64_python3.9.____cpython, os: ubuntu-latest, conda-build-args: '' } - { conda_build_yml: linux_64_python3.12.____cpython, os: ubuntu-latest, conda-build-args: '' } - { conda_build_yml: osx_64_python3.9.____cpython, os: macos-latest, conda-build-args: '' } - - { conda_build_yml: osx_64_python3.12.____cpython, os: macos-latest, conda-build-args: '' } + - { conda_build_yml: osx_64_python3.12.____cpython, os: macos-latest, conda-build-args: ' --no-test' } - { conda_build_yml: osx_arm64_python3.10.____cpython, os: macos-latest, conda-build-args: ' --no-test' } - { conda_build_yml: win_64_python3.9.____cpython, os: windows-latest, conda-build-args: '' } - { conda_build_yml: win_64_python3.12.____cpython, os: windows-latest, conda-build-args: '' } From 4b8b84ce7fafa2447093af47563bcf1f6db28fef Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> Date: Fri, 26 Apr 2024 19:04:27 +0200 Subject: [PATCH 58/63] Add a different environment for macos (#786) * try solving on ci with different env for macos * add missing if * typo * try and remove --no-test flag --- .github/workflows/ci.yml | 12 +++++++- .github/workflows/conda-build.yml | 4 +-- environment-macos.yml | 51 +++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 environment-macos.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 11cdfdf4..fc4cc3c2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,7 +40,8 @@ jobs: steps: - name: Checkout branch uses: actions/checkout@v4 - - name: Set up conda env + - name: Set up conda env (windows and ubuntu) + if: matrix.os != 'macos-latest' uses: mamba-org/setup-micromamba@8767fb704bd78032e9392f0386bf46950bdd1194 with: environment-file: environment.yml @@ -48,6 +49,15 @@ jobs: cache-environment: true create-args: >- python=${{ matrix.python-version }} + - name: Set up conda env (macos) + if: matrix.os == 'macos-latest' + uses: mamba-org/setup-micromamba@8767fb704bd78032e9392f0386bf46950bdd1194 + with: + environment-file: environment-macos.yml + init-shell: bash + cache-environment: true + create-args: >- + python=${{ matrix.python-version }} - name: Install repository (unix) if: matrix.os != 'windows-latest' shell: bash -el {0} diff --git a/.github/workflows/conda-build.yml b/.github/workflows/conda-build.yml index 8b8756f3..89c9dba3 100644 --- a/.github/workflows/conda-build.yml +++ b/.github/workflows/conda-build.yml @@ -23,8 +23,8 @@ jobs: - { conda_build_yml: linux_64_python3.9.____cpython, os: ubuntu-latest, conda-build-args: '' } - { conda_build_yml: linux_64_python3.12.____cpython, os: ubuntu-latest, conda-build-args: '' } - { conda_build_yml: osx_64_python3.9.____cpython, os: macos-latest, conda-build-args: '' } - - { conda_build_yml: osx_64_python3.12.____cpython, os: macos-latest, conda-build-args: ' --no-test' } - - { conda_build_yml: osx_arm64_python3.10.____cpython, os: macos-latest, conda-build-args: ' --no-test' } + - { conda_build_yml: osx_64_python3.12.____cpython, os: macos-latest, conda-build-args: '' } + - { 
conda_build_yml: osx_arm64_python3.10.____cpython, os: macos-latest, conda-build-args: '' } - { conda_build_yml: win_64_python3.9.____cpython, os: windows-latest, conda-build-args: '' } - { conda_build_yml: win_64_python3.12.____cpython, os: windows-latest, conda-build-args: '' } steps: diff --git a/environment-macos.yml b/environment-macos.yml new file mode 100644 index 00000000..ad813f7f --- /dev/null +++ b/environment-macos.yml @@ -0,0 +1,51 @@ +name: glum +channels: + - conda-forge + - nodefaults +dependencies: + # required for users (note: this is not where you specify new dependencies + # for the conda packages. please put those `conda.recipe/meta.yaml`!! + - numexpr + - pandas>=0.21 + - tabmat>=4.0.0 + - scikit-learn>=0.23 + - scipy + - tqdm + - formulaic>=0.6 + + # development tools + - black + - flake8 + - git_root + - ipdb + - ipython + - line_profiler + - memory_profiler + - pip + - pre-commit + - pyarrow + - pytest + - pytest-xdist + - setuptools_scm + + # build tools + - c-compiler + - cxx-compiler + - cython + + # required for tests + - statsmodels + + # documentation dev + - jinja2 + - jupyterlab + - jupytext + - make + - matplotlib-base + - nbclassic>=0.2.8 + - nbsphinx>=0.8.3 + - sphinx>=3.5.3 + - sphinx_rtd_theme + - sphinxcontrib-apidoc + - sphinxext-altair + From 745a6a0158d3d1ef2512cd39f28303a4e4f07df7 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 26 Apr 2024 21:00:29 +0200 Subject: [PATCH 59/63] replace deprecated scipy.sparse.*_matrix.A --- src/glum_benchmarks/data/simulated_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glum_benchmarks/data/simulated_glm.py b/src/glum_benchmarks/data/simulated_glm.py index 19a3175d..9d5c087c 100644 --- a/src/glum_benchmarks/data/simulated_glm.py +++ b/src/glum_benchmarks/data/simulated_glm.py @@ -104,7 +104,7 @@ def simulate_glm_data( # Creating sparse component sparse_feature_names = [f"sparse{i}" for i in range(sparse_features)] - X_sparse = sps.random(n_rows, sparse_features, density=sparse_density).A + X_sparse = sps.random(n_rows, sparse_features, density=sparse_density).toarray() X_sparse = pd.DataFrame(data=X_sparse, columns=sparse_feature_names) coefs_sparse = rand.choice([0, 1, -1], size=sparse_features) coefs_sparse = pd.Series(data=coefs_sparse, index=sparse_feature_names) From 9f570805b9c311251464f69a57a85838252c7136 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 26 Apr 2024 21:20:00 +0200 Subject: [PATCH 60/63] replace other instance of .A --- tests/glm/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index c77a4bbd..f29177f3 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -753,7 +753,7 @@ def test_glm_identity_regression_categorical_data(solver, offset, convert_x_fn): gradient_tol=1e-7, ) X = convert_x_fn(x_mat) - np.testing.assert_almost_equal(X.A if hasattr(X, "A") else X, x_mat) + np.testing.assert_almost_equal(X.toarray() if hasattr(X, "toarray") else X, x_mat) res = glm.fit(X, y, offset=offset) assert_allclose(res.coef_, coef, rtol=1e-6) From 891fed247571733ed57e96d9034b53481acd63f2 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 26 Apr 2024 21:43:26 +0200 Subject: [PATCH 61/63] two more --- tests/glm/test_cv_glm.py | 4 ++-- tests/glm/test_glm.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/glm/test_cv_glm.py b/tests/glm/test_cv_glm.py index 605039ff..c8dd2923 100644 --- a/tests/glm/test_cv_glm.py +++ 
b/tests/glm/test_cv_glm.py @@ -48,8 +48,8 @@ def test_normal_elastic_net_comparison(l1_ratio, fit_intercept, convert_x_fn): y = y[0:n_samples] X, T = X[0:n_samples], X[n_samples:] - x_arr = X if isinstance(X, np.ndarray) else X.A - t_arr = T if isinstance(T, np.ndarray) else T.A + x_arr = X if isinstance(X, np.ndarray) else X.toarray() + t_arr = T if isinstance(T, np.ndarray) else T.toarray() elastic_net = ElasticNetCV( l1_ratio=l1_ratio, n_alphas=n_alphas, diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index f29177f3..3276ccce 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -721,7 +721,7 @@ def test_x_not_modified_inplace(solver, fit_intercept, offset, convert_x_fn): if isinstance(X, np.ndarray): np.testing.assert_almost_equal(X, X_before) else: - np.testing.assert_almost_equal(X.A, X_before.A) + np.testing.assert_almost_equal(X.A, X_before.toarray()) @pytest.mark.parametrize("solver", GLM_SOLVERS) From d003e41471f9b7d2d7799b101ef9c6ee063e32c5 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Sat, 27 Apr 2024 10:51:52 +0200 Subject: [PATCH 62/63] simply replace all instances of .A by .toarray() (tabmat knows both) --- src/glum/_glm.py | 4 ++-- src/glum/_util.py | 2 +- tests/glm/test_glm.py | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index ed285df7..b6144903 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -2093,7 +2093,7 @@ def covariance_matrix( return self.covariance_matrix_ if hasattr(self, "y_model_spec_"): - y = self.y_model_spec_.get_model_matrix(X).A.ravel() + y = self.y_model_spec_.get_model_matrix(X).toarray().ravel() # This has to go first because X is modified in the next line if isinstance(X, pd.DataFrame): @@ -2417,7 +2417,7 @@ def _set_up_and_check_fit_args( ) self.y_model_spec_ = y.model_spec - y = y.A.ravel() + y = y.toarray().ravel() X = tm.from_formula( formula=rhs, diff --git a/src/glum/_util.py b/src/glum/_util.py index 7dbd769f..83d30fe0 100644 --- a/src/glum/_util.py +++ b/src/glum/_util.py @@ -194,7 +194,7 @@ def _safe_sandwich_dot( """ result = X.sandwich(d, rows, cols) if isinstance(result, sparse.dia_matrix): - result = result.A + result = result.toarray() if intercept: dim = result.shape[0] + 1 diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 3276ccce..c4dcdd22 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -721,7 +721,7 @@ def test_x_not_modified_inplace(solver, fit_intercept, offset, convert_x_fn): if isinstance(X, np.ndarray): np.testing.assert_almost_equal(X, X_before) else: - np.testing.assert_almost_equal(X.A, X_before.toarray()) + np.testing.assert_almost_equal(X.toarray(), X_before.toarray()) @pytest.mark.parametrize("solver", GLM_SOLVERS) @@ -1455,18 +1455,18 @@ def _arrays_share_data(arr1: np.ndarray, arr2: np.ndarray) -> bool: # After standardization, all the columns will have the same values. # To check that, just convert to dense first. 
if use_sparse: - Xdense = X.A + Xdense = X.toarray() else: Xdense = X for i in range(1, NC): if scale_predictors: if isinstance(Xdense, tm.StandardizedMatrix): - one, two = Xdense.A[:, 0], Xdense.A[:, i] + one, two = Xdense.toarray()[:, 0], Xdense.toarray()[:, i] else: one, two = Xdense[:, 0], Xdense[:, i] else: if isinstance(Xdense, tm.StandardizedMatrix): - one, two = (i + 1) * Xdense.A[:, 0], Xdense.A[:, i] + one, two = (i + 1) * Xdense.toarray()[:, 0], Xdense.toarray()[:, i] else: one, two = (i + 1) * Xdense[:, 0], Xdense[:, i] np.testing.assert_almost_equal(one, two) From 5fd62a0b9440b3a91c12d0d64153524557e19cb6 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Sat, 27 Apr 2024 19:28:41 +0200 Subject: [PATCH 63/63] update CHANGELOG for release --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b0b790f8..57154802 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,7 +7,7 @@ Changelog ========= -3.0.0 - UNRELEASED +3.0.0 - 2024-04-27 ------------------ **Breaking changes:**
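
Patches 59 through 62 replace the deprecated scipy ``.A`` property with ``.toarray()`` throughout; per the commit message, tabmat's matrix types implement the same method, so call sites no longer need to branch on type. A minimal sketch of the equivalence (assumes tabmat 4.x and a recent scipy):

```python
import numpy as np
import tabmat as tm
from scipy import sparse

X_sp = sparse.random(6, 3, density=0.5, format="csc", random_state=0)

# `.toarray()` is the stable spelling; the `.A` alias is deprecated in scipy.
dense_from_scipy = X_sp.toarray()

# tabmat matrices answer the same call, so no isinstance checks are needed.
dense_from_tabmat = tm.SparseMatrix(X_sp).toarray()

np.testing.assert_array_equal(dense_from_scipy, dense_from_tabmat)
```
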