diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1ccd6128f..a1d214528 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,7 +40,8 @@ jobs: steps: - name: Checkout branch uses: actions/checkout@v4 - - name: Set up conda env + - name: Set up conda env (windows and ubuntu) + if: matrix.os != 'macos-latest' uses: mamba-org/setup-micromamba@422500192359a097648154e8db4e39bdb6c6eed7 with: environment-file: environment.yml @@ -48,6 +49,15 @@ jobs: cache-environment: true create-args: >- python=${{ matrix.python-version }} + - name: Set up conda env (macos) + if: matrix.os == 'macos-latest' + uses: mamba-org/setup-micromamba@422500192359a097648154e8db4e39bdb6c6eed7 + with: + environment-file: environment-macos.yml + init-shell: bash + cache-environment: true + create-args: >- + python=${{ matrix.python-version }} - name: Install repository (unix) if: matrix.os != 'windows-latest' shell: bash -el {0} diff --git a/.github/workflows/conda-build.yml b/.github/workflows/conda-build.yml index bc1447fab..8a6000572 100644 --- a/.github/workflows/conda-build.yml +++ b/.github/workflows/conda-build.yml @@ -24,7 +24,7 @@ jobs: - { conda_build_yml: linux_64_python3.12.____cpython, os: ubuntu-latest, conda-build-args: '' } - { conda_build_yml: osx_64_python3.9.____cpython, os: macos-latest, conda-build-args: '' } - { conda_build_yml: osx_64_python3.12.____cpython, os: macos-latest, conda-build-args: '' } - - { conda_build_yml: osx_arm64_python3.10.____cpython, os: macos-latest, conda-build-args: ' --no-test' } + - { conda_build_yml: osx_arm64_python3.10.____cpython, os: macos-latest, conda-build-args: '' } - { conda_build_yml: win_64_python3.9.____cpython, os: windows-latest, conda-build-args: '' } - { conda_build_yml: win_64_python3.12.____cpython, os: windows-latest, conda-build-args: '' } steps: diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1d77b6e19..571548022 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,28 @@ Changelog ========= +3.0.0 - 2024-04-27 +------------------ + +**Breaking changes:** + +- All arguments to :class:`~glum.GeneralizedLinearRegressorBase`, :class:`~glum.GeneralizedLinearRegressor` and :class:`GeneralizedLinearRegressorCV` are now keyword-only. +- All arguments to public methods of :class:`~glum.GeneralizedLinearRegressorBase`, :class:`~glum.GeneralizedLinearRegressor` or :class:`GeneralizedLinearRegressorCV` except ``X``, ``y``, ``sample_weight`` and ``offset`` are now keyword-only. +- :class:`~glum.GeneralizedLinearRegressor`'s default value for ``alpha`` is now ``0``, i.e. no regularization. +- :class:`~glum.GammaDistribution`, :class:`~glum.InverseGaussianDistribution`, :class:`~glum.NormalDistribution` and :class:`~glum.PoissonDistribution` no longer inherit from :class:`~glum.TweedieDistribution`. +- The power parameter of :class:`~glum.TweedieLink` has been renamed from ``p`` to ``power``, in line with :class:`~glum.TweedieDistribution`. +- :class:`~glum.TweedieLink` no longer instantiates :class:`~glum.IdentityLink` or :class:`~glum.LogLink` for ``power=0`` and ``power=1``, respectively. On the other hand, :class:`~glum.TweedieLink` is now compatible with ``power=0`` and ``power=1``. + +**New features:** + +- Added a formula interface for specifying models. +- Improved feature name handling. Feature names are now created for non-pandas input matrices too. Furthermore, the format of categorical features can be specified by the user. +- Term names are now stored in the model's attributes. 
This is useful for categorical features, where they refer to the whole variable, not just single levels. +- Added more options for treating missing values in categorical columns. They can either raise a ``ValueError`` (``"fail"``), be treated as all-zero indicators (``"zero"``) or represented as a new category (``"convert"``). +- `meth:GeneralizedLinearRegressor.wald_test` can now perform tests based on a formula string and term names. +- :class:`~glum.InverseGaussianDistribution` gains a :meth:`~glum.InverseGaussianDistribution.log_likelihood` method. + + 2.7.0 - 2024-02-19 ------------------ @@ -16,7 +38,7 @@ Changelog **Other changes:** -- Require Python>=3.9 in line with `NEP 29 `_ +- Require Python>=3.9 in line with `NEP 29 `. - Build and test with Python 3.12 in CI. - Added line search stopping criterion for tiny loss improvements based on gradient information. - Added warnings about breaking changes in future versions. @@ -73,6 +95,7 @@ Changelog :class:`~glum.GeneralizedLinearRegressor` and :class:`~glum.GeneralizedLinearRegressorCV` to ``'negative.binomial'``. + 2.4.1 - 2023-03-14 ------------------ diff --git a/README.md b/README.md index 84c9448c4..d7cd00f89 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ Why did we choose the name `glum`? We wanted a name that had the letters GLM and >>> >>> _ = model.fit(X=X, y=y) >>> ->>> # .report_diagnostics shows details about the steps taken by the iterative solver +>>> # .report_diagnostics shows details about the steps taken by the iterative solver. >>> diags = model.get_formatted_diagnostics(full_report=True) >>> diags[['objective_fct']] objective_fct @@ -79,6 +79,15 @@ n_iter 3 0.443681 4 0.443498 5 0.443497 +>>> +>>> # Models can also be built with formulas from formulaic. +>>> model_formula = GeneralizedLinearRegressor( +... family='binomial', +... l1_ratio=1.0, +... alpha=0.001, +... formula="bedrooms + np.log(bathrooms + 1) + bs(sqft_living, 3) + C(waterfront)" +... ) +>>> _ = model_formula.fit(X=house_data.data, y=y) ``` diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 88349bea8..01109295c 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -35,7 +35,8 @@ requirements: - pandas - scikit-learn >=0.23 - scipy - - tabmat >=3.1.0, <4.0.0 + - formulaic >=0.6 + - tabmat >=4.0.0 test: requires: diff --git a/docs/tutorials/formula_interface/formula_interface.ipynb b/docs/tutorials/formula_interface/formula_interface.ipynb new file mode 100644 index 000000000..b131bd981 --- /dev/null +++ b/docs/tutorials/formula_interface/formula_interface.ipynb @@ -0,0 +1,1670 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Formula Interface Tutorial: Revisiting French Motor Third-Party Liability Claims\n", + "\n", + "\n", + "**Intro**\n", + "\n", + "This tutorial showcases the formula interface of `glum`. It allows for the specification of the design matrix and the response variable using so-called [Wilkinson-formulas](https://www.jstor.org/stable/2346786) instead of constructing it by hand. This kind of model specification should be familiar to R users or those who have used the `statsmodels` or `linearmodels` Python packages before. 
This tutorial aims to introduce the basics of working with formulas to other users, as well as highlighting some important differences between `glum`s and other packages' formula implementations.\n", + "\n", + "For a more in-depth look at how formulas work, please take a look at the [documentation of `formulaic`](https://matthewwardrop.github.io/formulaic/), the package on which `glum`'s formula interface is based.\n", + "\n", + "\n", + "**Background**\n", + "\n", + "This tutorial reimplements and extends the combined frequency-severity model from Chapter 4 of the [GLM tutorial](tutorials/glm_french_motor_tutorial/glm_french_motor.html). If you would like to know more about the setting, the data, or GLM modeling in general, please check that out first.\n", + "\n", + "**Sneak Peak**\n", + "\n", + "Formulas can provide a concise and convenient way to specify many of the usual pre-processing steps, such as converting to categorical types, creating interactions, applying transformations, or even spline interpolation. As an example, consider the following formula:\n", + "\n", + "```\n", + "{ClaimAmountCut / Exposure} ~ C(DrivAge, missing_method='convert') * C(VehPower, missing_method=\"zero\") + bs(BonusMalus, 3)\n", + "```\n", + "\n", + "Despite its brevity, it describes all of the following:\n", + " - The outcome variable is the ratio of `ClaimAmountCut` and `Exposure`.\n", + " - The predictors should include the interactions of the categorical variables `DrivAge` and `VehPower`, as well as those two variables themselves. (Even though they behave as such, neither the individual variables nor their interaction will be dummy-encoded by glum. For categoricals with many levels, this can lead to a substantial performance improvement over dummy encoding, especially for the interaction.)\n", + " - If there are missing values in `DrivAge`, they should be treated as a separate category.\n", + " - On the other hand, missing values in `VehPower` should be treated as all-zero indicators.\n", + " - The predictors should also include a third degree B-spline interpolation of `BonusMalus`.\n", + "\n", + "The following chapters demonstrate each of these features in some detail, as well as some additional advantages of using the formula interface." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "* [1. Load and Prepare Datasets from Openml](#1.-Load-and-Prepare-Datasets-from-Openml)\n", + "* [2. Reproducing the model from the GLM Tutorial](#2.-Reproducing-the-model-from-the-GLM-Tutorial)\n", + "* [3. Categorical Variables](#3.-Categorical-Variables)\n", + "* [4. Interactions and Structural Full-rankness](#4.-Interactions-and-Structural-Full-rankness)\n", + "* [5. Fun with Functions](#5.-Fun-with-Functions)\n", + "* [6. Miscellaneous Features](#6.-Miscellaneous-Features)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import pytest\n", + "import scipy.optimize as optimize\n", + "import scipy.stats\n", + "from dask_ml.preprocessing import Categorizer\n", + "from sklearn.metrics import mean_absolute_error\n", + "from sklearn.model_selection import ShuffleSplit\n", + "from glum import GeneralizedLinearRegressor\n", + "from glum import TweedieDistribution\n", + "\n", + "from load_transform_formula import load_transform" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Load and Prepare Datasets from Openml\n", + "[back to table of contents](#Table-of-Contents)\n", + "\n", + "First, we load our [dataset from OpenML](https://www.openml.org/d/41214) and apply several transformations. In the interest of simplicity, we do not include the data loading and preparation code in this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus \\\n", + "IDpol \n", + "1 0 0.10000 D 5 0 5 50 \n", + "3 0 0.77000 D 5 0 5 50 \n", + "5 0 0.75000 B 6 1 5 50 \n", + "10 0 0.09000 B 7 0 4 50 \n", + "11 0 0.84000 B 7 0 4 50 \n", + "... ... ... ... ... ... ... ... \n", + "6114326 0 0.00274 E 4 0 5 50 \n", + "6114327 0 0.00274 E 4 0 4 95 \n", + "6114328 0 0.00274 D 6 1 4 50 \n", + "6114329 0 0.00274 B 4 0 5 50 \n", + "6114330 0 0.00274 B 7 1 2 54 \n", + "\n", + " VehBrand VehGas Density Region ClaimAmount ClaimAmountCut \n", + "IDpol \n", + "1 B12 Regular 1217 R82 0.0 0.0 \n", + "3 B12 Regular 1217 R82 0.0 0.0 \n", + "5 B12 Diesel 54 R22 0.0 0.0 \n", + "10 B12 Diesel 76 R72 0.0 0.0 \n", + "11 B12 Diesel 76 R72 0.0 0.0 \n", + "... ... ... ... ... ... ... \n", + "6114326 B12 Regular 3317 R93 0.0 0.0 \n", + "6114327 B12 Regular 9850 R11 0.0 0.0 \n", + "6114328 B12 Diesel 1323 R82 0.0 0.0 \n", + "6114329 B12 Regular 95 R26 0.0 0.0 \n", + "6114330 B12 Diesel 65 R72 0.0 0.0 \n", + "\n", + "[678013 rows x 13 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df = load_transform()\n", + "with pd.option_context('display.max_rows', 10):\n", + " display(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Reproducing the Model From the GLM Turorial\n", + "\n", + "Now, let us start by fitting a very simple model. As usual, let's divide our samples into a training and a test set so that we get valid out-of-sample goodness-of-fit measures. Perhaps less usually, we do not create separate `y` and `X` data frames for our label and features – the formula will take care of that for us.\n", + "\n", + "We still have some preprocessing to do:\n", + " - Many of the ordinal or nominal variables are encoded as integers, instead of as categoricals. We will need to convert these so that `glum` will know to estimate a separate coefficient for each of their levels.\n", + " - The outcome variable is a transformation of other columns. We need to create it first.\n", + "\n", + "As we will see later on, these steps can be incorporated into the formula itself, but let's not overcomplicate things at first." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=42)\n", + "train, test = next(ss.split(df))\n", + "\n", + "df = df.assign(PurePremium=lambda x: x[\"ClaimAmountCut\"] / x[\"Exposure\"])\n", + "\n", + "glm_categorizer = Categorizer(\n", + " columns=[\"VehBrand\", \"VehGas\", \"Region\", \"Area\", \"DrivAge\", \"VehAge\", \"VehPower\"]\n", + ")\n", + "df_train = glm_categorizer.fit_transform(df.iloc[train])\n", + "df_test = glm_categorizer.transform(df.iloc[test])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example demonstrates the basic idea behind formulas: the outcome variable and the predictors are separated by a tilde (`~`), and different prefictors are separated by plus signs (`+`). Thus, formulas provide a concise way of specifying a model without the need to create dataframes by hand." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " intercept VehBrand[B1] VehBrand[B10] VehBrand[B11] \\\n", + "coefficient 2.88667 -0.064157 0.0 0.231868 \n", + "\n", + " VehBrand[B12] VehBrand[B13] VehBrand[B14] VehBrand[B2] \\\n", + "coefficient -0.211061 0.054979 -0.270346 -0.071453 \n", + "\n", + " VehBrand[B3] VehBrand[B4] ... VehAge[1] VehAge[2] \\\n", + "coefficient 0.00291 0.059324 ... 0.008117 -0.229906 \n", + "\n", + " VehPower[4] VehPower[5] VehPower[6] VehPower[7] VehPower[8] \\\n", + "coefficient -0.111796 -0.123388 0.060757 0.005179 -0.021832 \n", + "\n", + " VehPower[9] BonusMalus Density \n", + "coefficient 0.208158 0.032508 0.000002 \n", + "\n", + "[1 rows x 60 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula = \"PurePremium ~ VehBrand + VehGas + Region + Area + DrivAge + VehAge + VehPower + BonusMalus + Density\"\n", + "\n", + "TweedieDist = TweedieDistribution(1.5)\n", + "t_glm1 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula,\n", + ")\n", + "t_glm1.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm1.intercept_], t_glm1.coef_))},\n", + " index=[\"intercept\"] + t_glm1.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Categorical Variables\n", + "\n", + "`glum` also provides extensive support for categorical variables. The main function one needs to be aware of in the context of categoricals is simply called `C()`. A variable placed within it is always converted to a categorical, regardless of its type.\n", + "\n", + "A huge part of tabmat's/glum's performance advantage is that categoricals need not be one-hot encoded, but are treated as if they were. For this reason, we do not support using other coding schemes within the formula interface. If one needs to use other categorical encodings than one-hot, they can always do so manually (or even using `formulaic` directly) before the estimation.\n", + "\n", + "Let's try it out on our dataset!" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ClaimNb int64\n", + "Exposure float64\n", + "Area object\n", + "VehPower int64\n", + "VehAge int64\n", + "DrivAge int64\n", + "BonusMalus int64\n", + "VehBrand object\n", + "VehGas object\n", + "Density int64\n", + "Region object\n", + "ClaimAmount float64\n", + "ClaimAmountCut float64\n", + "PurePremium float64\n", + "dtype: object" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train_noncat = df.iloc[train]\n", + "df_test_noncat = df.iloc[test]\n", + "\n", + "df_train_noncat.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Even though some of the variables are integers in this dataset, they are handled as categoricals thanks to the `C()` function. Strings, such as `VehBrand` or `VehGas` would have been handled as categorical by default anyway, but using the `C()` function never hurts: if applied to something that is already a caetgorical variable, it does not have any effect outside of the feature name." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " intercept C(VehBrand)[B1] C(VehBrand)[B10] C(VehBrand)[B11] \\\n", + "coefficient 2.88667 -0.064157 0.0 0.231868 \n", + "\n", + " C(VehBrand)[B12] C(VehBrand)[B13] C(VehBrand)[B14] \\\n", + "coefficient -0.211061 0.054979 -0.270346 \n", + "\n", + " C(VehBrand)[B2] C(VehBrand)[B3] C(VehBrand)[B4] ... \\\n", + "coefficient -0.071453 0.00291 0.059324 ... \n", + "\n", + " C(VehAge)[1] C(VehAge)[2] C(VehPower)[4] C(VehPower)[5] \\\n", + "coefficient 0.008117 -0.229906 -0.111796 -0.123388 \n", + "\n", + " C(VehPower)[6] C(VehPower)[7] C(VehPower)[8] C(VehPower)[9] \\\n", + "coefficient 0.060757 0.005179 -0.021832 0.208158 \n", + "\n", + " BonusMalus Density \n", + "coefficient 0.032508 0.000002 \n", + "\n", + "[1 rows x 60 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula_cat = (\n", + " \"PurePremium ~ C(VehBrand) + C(VehGas) + C(Region) + C(Area) \"\n", + " \"+ C(DrivAge) + C(VehAge) + C(VehPower) + BonusMalus + Density\"\n", + ")\n", + "\n", + "t_glm3 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula_cat,\n", + ")\n", + "t_glm3.fit(df_train_noncat, sample_weight=df[\"Exposure\"].values[train])\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm3.intercept_], t_glm3.coef_))},\n", + " index=[\"intercept\"] + t_glm3.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, prediction works as expected with categorical variables. `glum` keeps track of the levels present in the training dataset, and makes sure that categorical variables in unseen datasets are also properly aligned, even if they have missing or unknown levels.3 Therefore, one can simply use predict, and `glum` does The Right Thing™ by default.\n", + "\n", + "3: This is made possible due to `glum` saving a [`ModelSpec` object](https://matthewwardrop.github.io/formulaic/guides/model_specs/), which contains any information necessary for reapplying the transitions that were done during the formula materialization process. It is especially relevant in the case of [stateful transforms](https://matthewwardrop.github.io/formulaic/guides/transforms/), such as creating categorical variables." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([303.77443311, 548.47789523, 244.34438579, ..., 109.81572865,\n", + " 67.98332028, 297.21717383])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t_glm3.predict(df_test_noncat)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Interactions and Structural Full-Rankness\n", + "\n", + "One of the biggest strengths of Wilkinson-formuals lie in their ability of concisely specifying interactions between terms. `glum` implements this as well, and in a very efficient way: the interactions of categorical features are encoded as a new categorical feature, making it possible to interact high-cardinality categoricals with each other. If this is not possible, because, for example, a categorical is interacted with a numeric variable, sparse representations are used when appropriate. 
In general, as with the rest of `glum`'s categorical handling, you don't have to worry too much about the actual implementation: you can expect `glum` to do the most efficient thing behind the scenes.\n", + "\n", + "Let's see what this looks like on the insurance example! Suppose that we expect `VehPower` to have a different effect depending on `DrivAge` (e.g. performance cars might not be great for new drivers, but may be less problematic for more experienced ones). We can include the interaction of these variables as follows." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " intercept C(VehBrand)[B1] C(VehBrand)[B10] C(VehBrand)[B11] \\\n", + "coefficient 2.88023 -0.069076 0.0 0.221037 \n", + "\n", + " C(VehBrand)[B12] C(VehBrand)[B13] C(VehBrand)[B14] \\\n", + "coefficient -0.211854 0.052355 -0.272058 \n", + "\n", + " C(VehBrand)[B2] C(VehBrand)[B3] C(VehBrand)[B4] ... \\\n", + "coefficient -0.074836 0.0 0.052523 ... \n", + "\n", + " C(DrivAge)[4]:C(VehPower)[8] C(DrivAge)[5]:C(VehPower)[8] \\\n", + "coefficient -0.147844 -0.03567 \n", + "\n", + " C(DrivAge)[6]:C(VehPower)[8] C(DrivAge)[0]:C(VehPower)[9] \\\n", + "coefficient 0.504407 0.682528 \n", + "\n", + " C(DrivAge)[1]:C(VehPower)[9] C(DrivAge)[2]:C(VehPower)[9] \\\n", + "coefficient -0.106569 -0.308257 \n", + "\n", + " C(DrivAge)[3]:C(VehPower)[9] C(DrivAge)[4]:C(VehPower)[9] \\\n", + "coefficient 0.173206 0.010684 \n", + "\n", + " C(DrivAge)[5]:C(VehPower)[9] C(DrivAge)[6]:C(VehPower)[9] \n", + "coefficient -0.220273 0.070334 \n", + "\n", + "[1 rows x 102 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula_int = (\n", + " \"PurePremium ~ C(VehBrand) + C(VehGas) + C(Region) + C(Area)\"\n", + " \" + C(DrivAge) * C(VehPower) + C(VehAge) + BonusMalus + Density\"\n", + ")\n", + "\n", + "t_glm4 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula_int,\n", + ")\n", + "t_glm4.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm4.intercept_], t_glm4.coef_))},\n", + " index=[\"intercept\"] + t_glm4.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that, in addition to the interactions, the non-interacted variants of `DrivAge` and `VehPower` are also included in the model. This is a result of using the `*` operator to interact the variables. Using `:` instead would only include the interactions, and not the marginals. 
(In short, `a * b` is equivalent to `a + b + a:b`.)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['C(VehPower)[4]',\n", + " 'C(VehPower)[5]',\n", + " 'C(VehPower)[6]',\n", + " 'C(VehPower)[7]',\n", + " 'C(VehPower)[8]',\n", + " 'C(VehPower)[9]',\n", + " 'C(DrivAge)[0]:C(VehPower)[4]',\n", + " 'C(DrivAge)[1]:C(VehPower)[4]',\n", + " 'C(DrivAge)[2]:C(VehPower)[4]',\n", + " 'C(DrivAge)[3]:C(VehPower)[4]',\n", + " 'C(DrivAge)[4]:C(VehPower)[4]',\n", + " 'C(DrivAge)[5]:C(VehPower)[4]',\n", + " 'C(DrivAge)[6]:C(VehPower)[4]',\n", + " 'C(DrivAge)[0]:C(VehPower)[5]',\n", + " 'C(DrivAge)[1]:C(VehPower)[5]',\n", + " 'C(DrivAge)[2]:C(VehPower)[5]',\n", + " 'C(DrivAge)[3]:C(VehPower)[5]',\n", + " 'C(DrivAge)[4]:C(VehPower)[5]',\n", + " 'C(DrivAge)[5]:C(VehPower)[5]',\n", + " 'C(DrivAge)[6]:C(VehPower)[5]',\n", + " 'C(DrivAge)[0]:C(VehPower)[6]',\n", + " 'C(DrivAge)[1]:C(VehPower)[6]',\n", + " 'C(DrivAge)[2]:C(VehPower)[6]',\n", + " 'C(DrivAge)[3]:C(VehPower)[6]',\n", + " 'C(DrivAge)[4]:C(VehPower)[6]',\n", + " 'C(DrivAge)[5]:C(VehPower)[6]',\n", + " 'C(DrivAge)[6]:C(VehPower)[6]',\n", + " 'C(DrivAge)[0]:C(VehPower)[7]',\n", + " 'C(DrivAge)[1]:C(VehPower)[7]',\n", + " 'C(DrivAge)[2]:C(VehPower)[7]',\n", + " 'C(DrivAge)[3]:C(VehPower)[7]',\n", + " 'C(DrivAge)[4]:C(VehPower)[7]',\n", + " 'C(DrivAge)[5]:C(VehPower)[7]',\n", + " 'C(DrivAge)[6]:C(VehPower)[7]',\n", + " 'C(DrivAge)[0]:C(VehPower)[8]',\n", + " 'C(DrivAge)[1]:C(VehPower)[8]',\n", + " 'C(DrivAge)[2]:C(VehPower)[8]',\n", + " 'C(DrivAge)[3]:C(VehPower)[8]',\n", + " 'C(DrivAge)[4]:C(VehPower)[8]',\n", + " 'C(DrivAge)[5]:C(VehPower)[8]',\n", + " 'C(DrivAge)[6]:C(VehPower)[8]',\n", + " 'C(DrivAge)[0]:C(VehPower)[9]',\n", + " 'C(DrivAge)[1]:C(VehPower)[9]',\n", + " 'C(DrivAge)[2]:C(VehPower)[9]',\n", + " 'C(DrivAge)[3]:C(VehPower)[9]',\n", + " 'C(DrivAge)[4]:C(VehPower)[9]',\n", + " 'C(DrivAge)[5]:C(VehPower)[9]',\n", + " 'C(DrivAge)[6]:C(VehPower)[9]']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[name for name in t_glm4.feature_names_ if \"VehPower\" in name]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The attentive reader might have also noticed that the first level of each categorical variable is omitted from the model. This is a manifestation of the more general concept of [ensuring structural full-rankedness](https://matthewwardrop.github.io/formulaic/guides/contrasts/#guaranteeing-structural-full-rankness)4. By default, `glum` and `formulaic` will try to make sure that one does not fall into the [Dummy Variable Trap](https://en.wikipedia.org/wiki/Dummy_variable_(statistics)). Moreover, it even does it in the case of (possibly multi-way) interactions involving categorical variables. It will always drop the necessary number of levels, and no more. If you want to opt out of this behavior (for example because you would like to penalize all levels equally), simply set the `drop_first` parameter during model initialization to `False`. If one only aims to include all levels of a certain variable, and not others, it is possible to do so by using the `spans_intercept` parameter (e.g. `C(VehPower, spans_intercept=False)` would include all levels of `VehPower` even if `drop_first` is set to `True`).\n", + "\n", + "4: Note, that it does not guarantee that the design matrix is actually full rank. 
For example, two identical numerical variables will still lead to a rank-deficient design matrix." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Fun with Functions\n", + "\n", + "The previous example only scratches the surface of what formulas are capable of. For example, they can contain arbitrary Python expressions, which are evaluated as if the columns of the input data frame were local variables (`pandas.Series`). The way to tell `glum` that a part of the formula should be evaluated as a Python expression before applying the formula grammar to it is to enclose it in curly braces. As an example, we can easily do the following within the formula itself:\n", + "\n", + " 1. Create the outcome variable on the fly instead of doing it beforehand.\n", + " 2. Include the logarithm of a certain variable in the model.\n", + " 3. Include a basis spline interpolation of a variable to capture non-linearities in its effect.\n", + "\n", + "1\\. works because formulas can contain [Python operations](https://matthewwardrop.github.io/formulaic/guides/grammar/). 2. and 3. work because formulas are evaluated within a context that is aware of a number of [transforms](https://matthewwardrop.github.io/formulaic/guides/transforms/). To be precise, 2. is a regular transform and 3. is a stateful transform.\n", + "\n", + "Let's try it out!" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " intercept VehBrand[B1] VehBrand[B10] VehBrand[B11] \\\n", + "coefficient 3.808829 -0.060201 0.0 0.242194 \n", + "\n", + " VehBrand[B12] VehBrand[B13] VehBrand[B14] VehBrand[B2] \\\n", + "coefficient -0.202517 0.063471 -0.345415 -0.072546 \n", + "\n", + " VehBrand[B3] VehBrand[B4] ... VehPower[4] VehPower[5] \\\n", + "coefficient 0.00777 0.079391 ... -0.113038 -0.127255 \n", + "\n", + " VehPower[6] VehPower[7] VehPower[8] VehPower[9] \\\n", + "coefficient 0.060209 0.005577 -0.032114 0.207355 \n", + "\n", + " bs(BonusMalus, 3)[1] bs(BonusMalus, 3)[2] bs(BonusMalus, 3)[3] \\\n", + "coefficient 3.178178 0.361951 8.231846 \n", + "\n", + " np.log(Density) \n", + "coefficient 0.121944 \n", + "\n", + "[1 rows x 62 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula_fun = (\n", + " \"{ClaimAmountCut / Exposure} ~ VehBrand + VehGas + Region + Area\"\n", + " \" + DrivAge + VehAge + VehPower + bs(BonusMalus, 3) + np.log(Density)\"\n", + ")\n", + "\n", + "t_glm5 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula_fun,\n", + ")\n", + "t_glm5.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm5.intercept_], t_glm5.coef_))},\n", + " index=[\"intercept\"] + t_glm5.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To allow for even more flexibility, you can add custom transformations that are defined in the context from which the call is made. E.g., we can define a transformation that takes the logarithm of ``VehAge + 1`` after casting it to numeric. To make the formula recognize this transform, you need to explicitly set ``context=0`` when calling the fit method (note that this differs from ``formulaic``'s default, which is already ``context=0``)." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " intercept _log_plus_one(VehAge)\n", + "coefficient 5.046712 -0.151043" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def _log_plus_one(x):\n", + " return np.log(pd.to_numeric(x) + 1)\n", + "\n", + "formula_custom_fun = (\n", + " \"{ClaimAmountCut / Exposure} ~ _log_plus_one(VehAge)\"\n", + ")\n", + "\n", + "t_glm6 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula_custom_fun,\n", + ")\n", + "t_glm6.fit(df_train, sample_weight=df[\"Exposure\"].values[train], context=0)\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm6.intercept_], t_glm6.coef_))},\n", + " index=[\"intercept\"] + t_glm6.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Miscellaneous Features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Variable Names\n", + "\n", + "`glum`'s formula interface provides a lot of control over how the resulting features are named. By default, it follows `formulaic`'s standards, but it can be customized by setting the `interaction_separator` and `categorical_format` paremeters." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " intercept DrivAge__0 DrivAge__1 DrivAge__2 DrivAge__3 \\\n", + "coefficient 5.007277 1.497079 0.53565 0.0 -0.152974 \n", + "\n", + " DrivAge__4 DrivAge__5 DrivAge__6 VehPower__4 VehPower__5 \\\n", + "coefficient -0.210998 -0.205689 0.017896 -0.096153 -0.05484 \n", + "\n", + " ... DrivAge__4__x__VehPower__8 DrivAge__5__x__VehPower__8 \\\n", + "coefficient ... -0.143822 -0.002094 \n", + "\n", + " DrivAge__6__x__VehPower__8 DrivAge__0__x__VehPower__9 \\\n", + "coefficient 0.512258 0.730534 \n", + "\n", + " DrivAge__1__x__VehPower__9 DrivAge__2__x__VehPower__9 \\\n", + "coefficient -0.280869 -0.367669 \n", + "\n", + " DrivAge__3__x__VehPower__9 DrivAge__4__x__VehPower__9 \\\n", + "coefficient 0.171063 0.022052 \n", + "\n", + " DrivAge__5__x__VehPower__9 DrivAge__6__x__VehPower__9 \n", + "coefficient -0.270456 0.119634 \n", + "\n", + "[1 rows x 56 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula_name = \"PurePremium ~ DrivAge * VehPower\"\n", + "\n", + "t_glm7 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula_name,\n", + " interaction_separator=\"__x__\",\n", + " categorical_format=\"{name}__{category}\",\n", + ")\n", + "t_glm7.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm7.intercept_], t_glm7.coef_))},\n", + " index=[\"intercept\"] + t_glm7.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Intercept Term\n", + "\n", + "Just like in the case of the non-formula interface, the presence of an intercept is determined by the `fit_intercept` argument. In case that the formula specifies a different behavior (e.g., adding `+0` or `-1` while `fit_intercept=True`), an error will be raised." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "formula_noint = \"PurePremium ~ DrivAge * VehPower - 1\"\n", + "\n", + "with pytest.raises(ValueError, match=\"The formula sets the intercept to False\"):\n", + " t_glm8 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=True,\n", + " formula=formula_noint,\n", + " interaction_separator=\"__x__\",\n", + " categorical_format=\"{name}__{category}\",\n", + " ).fit(df_train, sample_weight=df[\"Exposure\"].values[train])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### One-Sided Formulas\n", + "\n", + "Even when using formulas, the outcome variable can be specified as a vector, as in the interface without formulas. In that case the supplied formula should be one-sided (not contain a `~`), and only describe the right-hand side of the regression." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " intercept DrivAge__0 DrivAge__1 DrivAge__2 DrivAge__3 \\\n", + "coefficient 0.0 1.713298 0.783505 0.205914 0.016085 \n", + "\n", + " DrivAge__4 DrivAge__5 DrivAge__6 VehPower__4 VehPower__5 \\\n", + "coefficient 0.0 0.000094 0.223685 4.66123 4.736272 \n", + "\n", + " ... DrivAge__4__x__VehPower__8 DrivAge__5__x__VehPower__8 \\\n", + "coefficient ... -0.144927 0.001657 \n", + "\n", + " DrivAge__6__x__VehPower__8 DrivAge__0__x__VehPower__9 \\\n", + "coefficient 0.515373 0.714834 \n", + "\n", + " DrivAge__1__x__VehPower__9 DrivAge__2__x__VehPower__9 \\\n", + "coefficient -0.325666 -0.370935 \n", + "\n", + " DrivAge__3__x__VehPower__9 DrivAge__4__x__VehPower__9 \\\n", + "coefficient 0.20417 0.013222 \n", + "\n", + " DrivAge__5__x__VehPower__9 DrivAge__6__x__VehPower__9 \n", + "coefficient -0.273913 0.115693 \n", + "\n", + "[1 rows x 56 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula_onesie = \"DrivAge * VehPower\"\n", + "\n", + "t_glm8 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=False,\n", + " formula=formula_onesie,\n", + " interaction_separator=\"__x__\",\n", + " categorical_format=\"{name}__{category}\",\n", + ")\n", + "t_glm8.fit(\n", + " X=df_train, y=df_train[\"PurePremium\"], sample_weight=df[\"Exposure\"].values[train]\n", + ")\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm8.intercept_], t_glm8.coef_))},\n", + " index=[\"intercept\"] + t_glm8.feature_names_,\n", + ").T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Missing Values in Categorical Columns\n", + "\n", + "By default, `glum` raises a `ValueError` when it encounters a missing value in a categorical variable (`\"raise\"` option). However, there are two other options for handling these cases. They can also be treated as if they represented all-zeros indicators (`\"zero\"` option, which is also the way `pandas.get_dummies` works) or missing values can be treated as their own separate category (`\"convert\"` option).\n", + "\n", + "Similarly to the non-formula-based interface, `glum`'s behavior can be set globally using the `cat_missing_method` parameter during model initialization. However, formulas provide some additional flexibility: the `C` function has a `missing_method` parameter, with which users can select an option on a column-by-column basis. Here is an example of doing that (although our dataset does not have any missing values, so these options have no actual effect in this case):" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " intercept C(DrivAge, missing_method='zero')[0] \\\n", + "coefficient 0.0 1.786703 \n", + "\n", + " C(DrivAge, missing_method='zero')[1] \\\n", + "coefficient 0.742765 \n", + "\n", + " C(DrivAge, missing_method='zero')[2] \\\n", + "coefficient 0.239528 \n", + "\n", + " C(DrivAge, missing_method='zero')[3] \\\n", + "coefficient 0.096531 \n", + "\n", + " C(DrivAge, missing_method='zero')[4] \\\n", + "coefficient 0.071118 \n", + "\n", + " C(DrivAge, missing_method='zero')[5] \\\n", + "coefficient 0.0 \n", + "\n", + " C(DrivAge, missing_method='zero')[6] \\\n", + "coefficient 0.201078 \n", + "\n", + " C(VehPower, missing_method='convert')[4] \\\n", + "coefficient 4.637267 \n", + "\n", + " C(VehPower, missing_method='convert')[5] \\\n", + "coefficient 4.679391 \n", + "\n", + " C(VehPower, missing_method='convert')[6] \\\n", + "coefficient 4.863387 \n", + "\n", + " C(VehPower, missing_method='convert')[7] \\\n", + "coefficient 4.77263 \n", + "\n", + " C(VehPower, missing_method='convert')[8] \\\n", + "coefficient 4.749673 \n", + "\n", + " C(VehPower, missing_method='convert')[9] \n", + "coefficient 4.970188 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formula_missing = \"C(DrivAge, missing_method='zero') + C(VehPower, missing_method='convert')\"\n", + "\n", + "t_glm9 = GeneralizedLinearRegressor(\n", + " family=TweedieDist,\n", + " alpha_search=True,\n", + " l1_ratio=1,\n", + " fit_intercept=False,\n", + " formula=formula_missing,\n", + "\n", + ")\n", + "t_glm9.fit(\n", + " X=df_train, y=df_train[\"PurePremium\"], sample_weight=df[\"Exposure\"].values[train]\n", + ")\n", + "\n", + "pd.DataFrame(\n", + " {\"coefficient\": np.concatenate(([t_glm9.intercept_], t_glm9.coef_))},\n", + " index=[\"intercept\"] + t_glm9.feature_names_,\n", + ").T" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "glum-dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/formula_interface/load_transform_formula.py b/docs/tutorials/formula_interface/load_transform_formula.py new file mode 100644 index 000000000..d0098f8b7 --- /dev/null +++ b/docs/tutorials/formula_interface/load_transform_formula.py @@ -0,0 +1,63 @@ +import numpy as np +import pandas as pd + + +def load_transform(): + """Load and transform data from OpenML. + + Summary of transformations: + + 1. We cut the number of claims to a maximum of 4, as is done in the case study paper + (Case-study authors suspect a data error. See section 1 of their paper for details). + 2. We cut the exposure to a maximum of 1, as is done in the case study paper + (Case-study authors suspect a data error. See section 1 of their paper for details). + 3. We define ``'ClaimAmountCut'`` as the the claim amount cut at 100'000 per single claim + (before aggregation per policy). Reason: For large claims, extreme value theory + might apply. 100'000 is the 0.9984 quantile, any claims larger account for 25% of + the overall claim amount. This is a well known phenomenon for third-party liability. + 4. We aggregate the total claim amounts per policy ID and join them to ``freMTPL2freq``. + 5. 
We redefine ``'ClaimNb'`` as the number of claims with claim amount greater than zero. + 6. ``'VehPower'``, ``'VehAge'``, and ``'DrivAge'`` are clipped and/or digitized into bins so + they can be used as categoricals later on. + """ + # load the datasets + # first row (=column names) uses "", all other rows use '' + # use '' as quotechar as it is easier to change column names + df = pd.read_csv( + "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.arff", quotechar="'" + ) + + # rename column names '"name"' => 'name' + df = df.rename(lambda x: x.replace('"', ""), axis="columns") + df["IDpol"] = df["IDpol"].astype(np.int64) + df.set_index("IDpol", inplace=True) + + df_sev = pd.read_csv( + "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", index_col=0 + ) + + # join ClaimAmount from df_sev to df: + # 1. cut ClaimAmount at 100_000 + # 2. aggregate ClaimAmount per IDpol + # 3. join by IDpol + df_sev["ClaimAmountCut"] = df_sev["ClaimAmount"].clip(upper=100_000) + df = df.join(df_sev.groupby(level=0).sum(), how="left") + df.fillna(value={"ClaimAmount": 0, "ClaimAmountCut": 0}, inplace=True) + + # Note: Zero claims must be ignored in severity models, + # because the support is (0, inf) not [0, inf). + df.loc[(df.ClaimAmount <= 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 + + # correct for unreasonable observations (that might be data error) + # see case study paper + df["ClaimNb"] = df["ClaimNb"].clip(upper=4) + df["Exposure"] = df["Exposure"].clip(upper=1) + + # Clip and/or digitize predictors into bins + df["VehPower"] = np.minimum(df["VehPower"], 9) + df["VehAge"] = np.digitize( + np.where(df["VehAge"] == 10, 9, df["VehAge"]), bins=[1, 10] + ) + df["DrivAge"] = np.digitize(df["DrivAge"], bins=[21, 26, 31, 41, 51, 71]) + + return df diff --git a/docs/tutorials/tutorials.rst b/docs/tutorials/tutorials.rst index 66ce7d452..86166d174 100644 --- a/docs/tutorials/tutorials.rst +++ b/docs/tutorials/tutorials.rst @@ -7,3 +7,4 @@ Tutorials Poisson, Gamma, and Tweedie with French Motor Third-Party Liability Claims High Dimensional Fixed Effects with Rossman Sales Regularization with King County Housing Sales + Formula interface diff --git a/environment-macos.yml b/environment-macos.yml new file mode 100644 index 000000000..ad813f7f8 --- /dev/null +++ b/environment-macos.yml @@ -0,0 +1,51 @@ +name: glum +channels: + - conda-forge + - nodefaults +dependencies: + # required for users (note: this is not where you specify new dependencies + # for the conda packages. please put those in `conda.recipe/meta.yaml`!!)
+ - numexpr + - pandas>=0.21 + - tabmat>=4.0.0 + - scikit-learn>=0.23 + - scipy + - tqdm + - formulaic>=0.6 + + # development tools + - black + - flake8 + - git_root + - ipdb + - ipython + - line_profiler + - memory_profiler + - pip + - pre-commit + - pyarrow + - pytest + - pytest-xdist + - setuptools_scm + + # build tools + - c-compiler + - cxx-compiler + - cython + + # required for tests + - statsmodels + + # documentation dev + - jinja2 + - jupyterlab + - jupytext + - make + - matplotlib-base + - nbclassic>=0.2.8 + - nbsphinx>=0.8.3 + - sphinx>=3.5.3 + - sphinx_rtd_theme + - sphinxcontrib-apidoc + - sphinxext-altair + diff --git a/environment.yml b/environment.yml index 198d8446a..5de795687 100644 --- a/environment.yml +++ b/environment.yml @@ -8,10 +8,11 @@ dependencies: - libblas>=0=*mkl # comment this line out for macOS arm64 - numexpr - pandas>=0.21 - - tabmat>=3.1.0, <4.0.0 + - tabmat>=4.0.0 - scikit-learn>=0.23 - scipy - tqdm + - formulaic>=0.6 # development tools - black diff --git a/pyproject.toml b/pyproject.toml index a69a904a4..fc6dc0a34 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ default_section = 'THIRDPARTY' skip = [ "*-win32", "*-manylinux_i686", - "pp*", + "pp*", "*-musllinux_*", "cp36*", "cp37*", diff --git a/setup.py b/setup.py index 67dc24d03..0b061cc80 100644 --- a/setup.py +++ b/setup.py @@ -86,7 +86,8 @@ "pandas", "scikit-learn>=0.23", "scipy", - "tabmat>=3.1.0,<4.0.0", + "formulaic>=0.6", + "tabmat>=4.0.0", ], entry_points=( None diff --git a/src/glum/_distribution.py b/src/glum/_distribution.py index 6b89a0c02..f7facbd57 100644 --- a/src/glum/_distribution.py +++ b/src/glum/_distribution.py @@ -4,14 +4,8 @@ import numexpr import numpy as np -from scipy import sparse, special -from tabmat import ( - CategoricalMatrix, - DenseMatrix, - MatrixBase, - SplitMatrix, - StandardizedMatrix, -) +from scipy import special +from tabmat import MatrixBase, StandardizedMatrix, hstack from ._functions import ( binomial_logit_eta_mu_deviance, @@ -20,6 +14,10 @@ gamma_log_eta_mu_deviance, gamma_log_likelihood, gamma_log_rowwise_gradient_hessian, + inv_gaussian_deviance, + inv_gaussian_log_eta_mu_deviance, + inv_gaussian_log_likelihood, + inv_gaussian_log_rowwise_gradient_hessian, negative_binomial_deviance, negative_binomial_log_eta_mu_deviance, negative_binomial_log_likelihood, @@ -44,45 +42,23 @@ class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). - The PDF of :math:`Y \sim \mathrm{EDM}(\mu, \phi)` is given by + The PDF of :math:`Y \sim \mathrm{EDM}(\theta, \phi)` is given by .. math:: p(y \mid \theta, \phi) - &= c(y, \phi) \exp((\theta y - A(\theta)_ / \phi) \\ - &= \tilde{c}(y, \phi) \exp(-d(y, \mu) / (2\phi)) + &= \exp \left(\frac{y \theta - b(\theta)}{\phi / w} + c(y; w / \phi) \right), - with mean :math:`\mathrm{E}(Y) = A'(\theta) = \mu`, variance - :math:`\mathrm{var}(Y) = \phi \cdot v(\mu)`, unit variance - :math:`v(\mu)` and unit deviance :math:`d(y, \mu)`. + where :math:`\theta` is the scale parameter, :math:`\phi` is the dispersion + parameter, :math:`w` is a given weight, :math:`b` is the cumulant function + and :math:`c` is a normalization term. 
- Properties - ---------- - lower_bound - upper_bound - include_lower_bound - include_upper_bound - - Methods - ------- - in_y_range - unit_variance - unit_variance_derivative - variance - variance_derivative - unit_deviance - unit_deviance_derivative - deviance - deviance_derivative - starting_mu - - _mu_deviance_derivative - eta_mu_deviance - gradient_hessian + It can be shown that :math:`\mathrm{E}(Y) = b'(\theta)` and + :math:`\mathrm{var}(Y) = b''(\theta) \times \phi / w`. References ---------- - https://en.wikipedia.org/wiki/Exponential_dispersion_model. + < https://en.wikipedia.org/wiki/Exponential_dispersion_model >. """ @property @@ -110,51 +86,41 @@ def include_upper_bound(self) -> bool: pass def in_y_range(self, x) -> np.ndarray: - """Return ``True`` if ``x`` is in the valid range of the EDM. - - Parameters - ---------- - x : array-like, shape (n_samples,) - Target values. - - Returns - ------- - np.ndarray - """ + """Return ``True`` if ``x`` is in the valid range of the EDM.""" if self.include_lower_bound: - if self.include_upper_bound: - return np.logical_and( - np.greater_equal(x, self.lower_bound), - np.less_equal(x, self.upper_bound), - ) - else: - return np.logical_and( - np.greater_equal(x, self.lower_bound), np.less(x, self.upper_bound) - ) + lb_op = np.greater_equal else: - if self.include_upper_bound: - return np.logical_and( - np.greater(x, self.lower_bound), np.less_equal(x, self.upper_bound) - ) - else: - return np.logical_and( - np.greater(x, self.lower_bound), np.less(x, self.upper_bound) - ) + lb_op = np.greater + + if self.include_upper_bound: + ub_op = np.less_equal + else: + ub_op = np.less + + return lb_op(x, self.lower_bound) & ub_op(x, self.upper_bound) + + def to_tweedie(self, safe=True): + """Return the Tweedie representation of a distribution if it exists.""" + if hasattr(self, "__tweedie_repr__"): + return self.__tweedie_repr__() + if safe: + raise ValueError("This distribution has no Tweedie representation.") + return None @abstractmethod def unit_variance(self, mu): - r"""Compute the unit variance function. + r"""Compute the unit variance. - The unit variance :math:`v(\mu)` determines the variance as a function - of the mean :math:`\mu` by - :math:`\mathrm{var}(y_i) = (\phi / s_i) \times v(\mu_i)`. It can - also be derived from the unit deviance :math:`d(y, \mu)` as + The unit variance, :math:`v(\mu) \equiv b''((b')^{-1} (\mu))`, + determines the variance as a function of the mean :math:`\mu` by + :math:`\mathrm{var}(y_i) = v(\mu_i) \times \phi / w_i`. It can also be + derived from the unit deviance :math:`d(y, \mu)` as .. math:: - v(\mu) = \frac{2}{\frac{\partial^2 d(y, \mu)}{\partial\mu^2}}\big|_{y=\mu}. + v(\mu) = 2 \div \frac{\partial^2 d(y, \mu)}{\partial\mu^2} \big| _{y=\mu}. - See also :func:`variance`. + See also :meth:`~ExponentialDispersionModel.variance`. Parameters ---------- @@ -167,8 +133,6 @@ def unit_variance(self, mu): def unit_variance_derivative(self, mu): r"""Compute the derivative of the unit variance with respect to ``mu``. - Return :math:`v'(\mu)`. - Parameters ---------- mu : array-like, shape (n_samples,) @@ -176,12 +140,12 @@ def unit_variance_derivative(self, mu): """ pass - def variance(self, mu: np.ndarray, dispersion=1, sample_weight=1) -> np.ndarray: + def variance(self, mu, dispersion=1, sample_weight=1) -> np.ndarray: r"""Compute the variance function. 
- The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i, \phi / s_i)` is - :math:`\mathrm{var}(Y_i) = (\phi / s_i) * v(\mu_i)`, with unit variance - :math:`v(\mu)` and weights :math:`s_i`. + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i, \phi / w_i)` takes + the form :math:`v(\mu_i) \times \phi / w_i`, where :math:`v(\mu)` is the + unit variance and :math:`w_i` are weights. Parameters ---------- @@ -204,8 +168,8 @@ def variance_derivative(self, mu, dispersion=1, sample_weight=1): r"""Compute the derivative of the variance with respect to ``mu``. The derivative of the variance is equal to - :math:`(\phi / s_i) * v'(\mu_i)`, where :math:`v(\mu)` is the unit - variance and :math:`s_i` are weights. + :math:`v(\mu_i) \times \phi / w_i`, where :math:`v(\mu)` is the unit + variance and :math:`ws_i` are weights. Parameters ---------- @@ -228,8 +192,10 @@ def variance_derivative(self, mu, dispersion=1, sample_weight=1): def unit_deviance(self, y, mu): r"""Compute the unit deviance. - In terms of the log likelihood :math:`L`, the unit deviance is - :math:`-2\phi\times [L(y, \mu, \phi) - L(y, y, \phi)].` + In terms of the unit log likelihood :math:`\ell`, the unit deviance is + :math:`2 [\ell(y_i, y_i, \phi) - \ell(y_i, \mu, \phi)]`, i.e. twice the + difference between the log likelihood of a saturated model (with one + parameter per observation) and the model at hand. Parameters ---------- @@ -245,7 +211,7 @@ def unit_deviance_derivative(self, y, mu): r"""Compute the derivative of the unit deviance with respect to ``mu``. The derivative of the unit deviance is given by - :math:`-2 \times (y - \mu) / v(\mu)`, where :math:`v(\mu)` is the unit + :math:`2 \times (\mu - y) / v(\mu)`, where :math:`v(\mu)` is the unit variance. Parameters @@ -262,13 +228,14 @@ def unit_deviance_derivative(self, y, mu): """ return -2 * (y - mu) / self.unit_variance(mu) - def deviance(self, y, mu, sample_weight=1): + def deviance(self, y, mu, sample_weight=1) -> float: r"""Compute the deviance. - The deviance is a weighted sum of the unit deviances, - :math:`\sum_i s_i \times d(y_i, \mu_i)`, where :math:`d(y, \mu)` is the - unit deviance and :math:`s` are weights. In terms of the log likelihood, - it is :math:`-2\phi \times [L(y, \mu, \phi / s) - L(y, y, \phi / s)]`. + The deviance is a weighted sum of the unit deviances. In terms of the + unit log likelihood :math:`\ell`, it equals + :math:`2 \sum_i [\ell(y_i, y_i, \phi) - \ell(y_i, \mu, \phi)]`, + i.e. twice the difference between the log likelihood of a saturated + model (with one parameter per observation) and the model at hand. Parameters ---------- @@ -279,11 +246,7 @@ def deviance(self, y, mu, sample_weight=1): Predicted mean. sample_weight : array-like, shape (n_samples,), optional (default=1) - Weights or exposure to which variance is inversely proportional. - - Returns - ------- - float + Weights or exposure to which the variance is inversely proportional. 
""" if sample_weight is None: return np.sum(self.unit_deviance(y, mu)) @@ -312,46 +275,41 @@ def deviance_derivative(self, y, mu, sample_weight=1): def _mu_deviance_derivative( self, - coef: np.ndarray, - X, - y: np.ndarray, - sample_weight: np.ndarray, + coef, + X: Union[MatrixBase, StandardizedMatrix], + y, + sample_weight, link: Link, - offset: np.ndarray = None, - ) -> tuple[np.ndarray, np.ndarray]: - """Compute ``mu`` and the derivative of the deviance \ - with respect to coefficients.""" + offset=None, + ): + """Compute ``mu`` and the derivative of the deviance with respect to coefficients.""" lin_pred = _safe_lin_pred(X, coef, offset) mu = link.inverse(lin_pred) d1 = link.inverse_derivative(lin_pred) temp = d1 * self.deviance_derivative(y, mu, sample_weight) + if coef.size == X.shape[1] + 1: devp = np.concatenate(([temp.sum()], temp @ X)) else: devp = temp @ X # same as X.T @ temp + return mu, devp def eta_mu_deviance( self, link: Link, factor: float, - cur_eta: np.ndarray, - X_dot_d: np.ndarray, - y: np.ndarray, - sample_weight: np.ndarray, - ): - """ - Compute ``eta``, ``mu`` and the deviance. - - Compute: - * the linear predictor, ``eta``, as ``cur_eta + factor * X_dot_d``; - * the link-function-transformed prediction, ``mu``; - * the deviance. + cur_eta, + X_dot_d, + y, + sample_weight, + ) -> tuple[np.ndarray, np.ndarray, float]: + """Compute ``eta``, ``mu`` and the deviance. Returns ------- numpy.ndarray, shape (X.shape[0],) - The linear predictor, ``eta``. + The linear predictor, ``eta``, as ``cur_eta + factor * X_dot_d``. numpy.ndarray, shape (X.shape[0],) The link-function-transformed prediction, ``mu``. float @@ -361,24 +319,25 @@ def eta_mu_deviance( # avoiding allocating new arrays for every line search loop eta_out = np.empty_like(cur_eta) mu_out = np.empty_like(cur_eta) + deviance = self._eta_mu_deviance( link, factor, cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out ) + return eta_out, mu_out, deviance def _eta_mu_deviance( self, link: Link, factor: float, - cur_eta: np.ndarray, - X_dot_d: np.ndarray, - y: np.ndarray, - sample_weight: np.ndarray, - eta_out: np.ndarray, - mu_out: np.ndarray, - ): - """ - Update ``eta`` and ``mu`` and compute the deviance. + cur_eta, + X_dot_d, + y, + sample_weight, + eta_out, + mu_out, + ) -> float: + """Update ``eta`` and ``mu`` and compute the deviance. This is a default implementation that should work for all valid distributions and link functions. To implement a custom optimized @@ -388,6 +347,7 @@ def _eta_mu_deviance( Returns ------- float + The deviance. """ eta_out[:] = cur_eta + factor * X_dot_d mu_out[:] = link.inverse(eta_out) @@ -396,17 +356,16 @@ def _eta_mu_deviance( def rowwise_gradient_hessian( self, link: Link, - coef: np.ndarray, + coef, dispersion, X: Union[MatrixBase, StandardizedMatrix], - y: np.ndarray, - sample_weight: np.ndarray, - eta: np.ndarray, - mu: np.ndarray, - offset: np.ndarray = None, + y, + sample_weight, + eta, + mu, + offset=None, ): - """ - Compute the gradient and negative Hessian of the log likelihood row-wise. + """Compute the gradient and negative Hessian of the log likelihood row-wise. 
Returns ------- @@ -417,6 +376,7 @@ def rowwise_gradient_hessian( """ gradient_rows = np.empty_like(mu) hessian_rows = np.empty_like(mu) + self._rowwise_gradient_hessian( link, y, sample_weight, eta, mu, gradient_rows, hessian_rows ) @@ -428,8 +388,7 @@ def rowwise_gradient_hessian( def _rowwise_gradient_hessian( self, link, y, sample_weight, eta, mu, gradient_rows, hessian_rows ): - """ - Update ``gradient_rows`` and ``hessian_rows`` in place. + """Update ``gradient_rows`` and ``hessian_rows`` in place. This is a default implementation that should work for all valid distributions and link functions. To implement a custom optimized @@ -532,19 +491,11 @@ def _score_matrix(self, link, X, y, mu, sample_weight, dispersion, fit_intercept * link.inverse_derivative(linpred) * (y - mu) ).reshape(-1, 1) - + XW = X.multiply(W) if fit_intercept: - if sparse.issparse(X): - return sparse.hstack((W, X.multiply(W))) - elif isinstance(X, (SplitMatrix, CategoricalMatrix)): - return SplitMatrix((DenseMatrix(W), X.multiply(W))) - else: - return np.hstack((W, np.multiply(X, W))) + return hstack((W, XW)) else: - if sparse.issparse(X) or isinstance(X, (SplitMatrix, CategoricalMatrix)): - return X.multiply(W) - else: - return np.multiply(X, W) + return XW def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> float: r"""Estimate the dispersion parameter :math:`\phi`. @@ -565,10 +516,6 @@ def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> flo method = {'pearson', 'deviance'}, optional (default='pearson') Whether to base the estimate on the Pearson residuals or the deviance. - - Returns - ------- - float """ y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) @@ -594,24 +541,27 @@ class TweedieDistribution(ExponentialDispersionModel): A Tweedie distribution with mean :math:`\mu = \mathrm{E}(Y)` is uniquely defined by its mean-variance relationship - :math:`\mathrm{var}(Y) \propto \mu^{\mathrm{power}}`. + :math:`\mathrm{var}(Y) \propto \mu^{\mathrm{p}}`. Special cases are: - ====== ================ - Power Distribution - ====== ================ - 0 Normal - 1 Poisson - (1, 2) Compound Poisson - 2 Gamma - 3 Inverse Gaussian - ====== ================ + ====== ================ ============ + Power Distribution Support + ====== ================ ============ + 0 Normal ``(-∞, +∞)`` + 1 Poisson ``[0, +∞)`` + (1, 2) Compound Poisson ``[0, +∞)`` + 2 Gamma ``(0, +∞)`` + 3 Inverse Gaussian ``(0, +∞)`` + ====== ================ ============ + + See the documentation of the superclass, + :class:`~glum.ExponentialDispersionModel`, for details. Parameters ---------- power : float, optional (default=0) - The variance power of the `unit_variance` + The variance power of the ``unit_variance`` :math:`v(\mu) = \mu^{\mathrm{power}}`. For :math:`0 < \mathrm{power} < 1`, no distribution exists. 
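    A minimal sketch of the mean-variance relationship and the invalid-power
    check described above (synthetic inputs, shown for illustration only; not
    part of this diff)::

        import numpy as np
        from glum import TweedieDistribution

        dist = TweedieDistribution(power=1.5)  # compound Poisson
        mu = np.array([0.5, 1.0, 2.0])
        assert np.allclose(dist.unit_variance(mu), mu ** 1.5)

        try:
            TweedieDistribution(power=0.5)  # no distribution exists for 0 < power < 1
        except ValueError:
            pass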
""" @@ -626,9 +576,11 @@ def __init__(self, power=0): def __eq__(self, other): # noqa D return isinstance(other, TweedieDistribution) and (self.power == other.power) + def __tweedie_repr__(self): # noqa D + return self.__class__(self.power) + @property - def lower_bound(self) -> float: - """Return the lowest value of ``y`` allowed.""" + def lower_bound(self) -> float: # noqa D if self.power <= 0: return -np.inf if self.power >= 1: @@ -636,8 +588,7 @@ def lower_bound(self) -> float: raise ValueError @property - def include_lower_bound(self) -> bool: - """Return whether ``lower_bound`` is allowed as a value of ``y``.""" + def include_lower_bound(self) -> bool: # noqa D if self.power <= 0: return False if (self.power >= 1) and (self.power < 2): @@ -653,97 +604,66 @@ def power(self) -> float: @power.setter def power(self, power): - if not isinstance(power, (int, float)): - raise TypeError(f"power must be an int or float, input was {power}") + + if not isinstance(power, (int, float, np.number)): + raise TypeError(f"The power parameter must be numeric; got {power}.") if (power > 0) and (power < 1): - raise ValueError("For 0 np.ndarray: - """Compute the unit variance of a Tweedie distribution ``v(mu) = mu^power``. - - Parameters - ---------- - mu : array-like, shape (n_samples,) - Predicted mean. - - Returns - ------- - numpy.ndarray, shape (n_samples,) - """ + def unit_variance(self, mu): # noqa D p = self.power # noqa: F841 return numexpr.evaluate("mu ** p") - def unit_variance_derivative(self, mu: np.ndarray) -> np.ndarray: - r"""Compute the derivative of the unit variance of a Tweedie distribution. - - Equation: :math:`v(\mu) = p \times \mu^{(p-1)}`. - - Parameters - ---------- - mu : array-like, shape (n_samples,) - Predicted mean. - - Returns - ------- - numpy.ndarray, shape (n_samples,) - """ + def unit_variance_derivative(self, mu): # noqa D p = self.power # noqa: F841 return numexpr.evaluate("p * mu ** (p - 1)") - def deviance(self, y, mu, sample_weight=None) -> float: - """Compute the deviance. - - Parameters - ---------- - y : array-like, shape (n_samples,) - Target values. - - mu : array-like, shape (n_samples,) - Predicted mean. + def deviance(self, y, mu, sample_weight=None) -> float: # noqa D - sample_weight : array-like, shape (n_samples,), optional (default=1) - Sample weights. 
- """ - p = self.power y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) sample_weight = np.ones_like(y) if sample_weight is None else sample_weight # NOTE: the dispersion parameter is only necessary to convey # type information on account of a bug in Cython - if p == 0: + if self.power == 0: return normal_deviance(y, sample_weight, mu, dispersion=1.0) - if p == 1: + if self.power == 1: return poisson_deviance(y, sample_weight, mu, dispersion=1.0) - elif p == 2: + elif self.power == 2: return gamma_deviance(y, sample_weight, mu, dispersion=1.0) + elif self.power == 3: + return inv_gaussian_deviance(y, sample_weight, mu, dispersion=1.0) else: - return tweedie_deviance(y, sample_weight, mu, p=float(p)) + return tweedie_deviance(y, sample_weight, mu, p=float(self.power)) - def unit_deviance(self, y, mu): - """Get the deviance of each observation.""" - p = self.power - if p == 0: # Normal distribution + def unit_deviance(self, y, mu): # noqa D + + if self.power == 0: # normal distribution return (y - mu) ** 2 - if p == 1: # Poisson distribution + if self.power == 1: # Poisson distribution return 2 * (special.xlogy(y, y / mu) - y + mu) - elif p == 2: # Gamma distribution + elif self.power == 2: # Gamma distribution return 2 * (np.log(mu / y) + y / mu - 1) + elif self.power == 3: # inverse Gaussian distribution + return ((y / mu - 1) ** 2) / y else: - mu1mp = mu ** (1 - p) + mu1mp = mu ** (1 - self.power) return 2 * ( - (np.maximum(y, 0) ** (2 - p)) / ((1 - p) * (2 - p)) - - y * mu1mp / (1 - p) - + mu * mu1mp / (2 - p) + (np.maximum(y, 0) ** (2 - self.power)) + / ((1 - self.power) * (2 - self.power)) + - y * mu1mp / (1 - self.power) + + mu * mu1mp / (2 - self.power) ) def _rowwise_gradient_hessian( self, link, y, sample_weight, eta, mu, gradient_rows, hessian_rows ): f = None + if self.power == 0 and isinstance(link, IdentityLink): f = normal_identity_rowwise_gradient_hessian elif self.power == 1 and isinstance(link, LogLink): @@ -752,6 +672,8 @@ def _rowwise_gradient_hessian( f = gamma_log_rowwise_gradient_hessian elif 1 < self.power < 2 and isinstance(link, LogLink): f = partial(tweedie_log_rowwise_gradient_hessian, p=self.power) + elif self.power == 3: + f = partial(inv_gaussian_log_rowwise_gradient_hessian, p=self.power) if f is not None: return f(y, sample_weight, eta, mu, gradient_rows, hessian_rows) @@ -772,6 +694,7 @@ def _eta_mu_deviance( mu_out: np.ndarray, ): f = None + if self.power == 0 and isinstance(link, IdentityLink): f = normal_identity_eta_mu_deviance elif self.power == 1 and isinstance(link, LogLink): @@ -780,6 +703,8 @@ def _eta_mu_deviance( f = gamma_log_eta_mu_deviance elif 1 < self.power < 2 and isinstance(link, LogLink): f = partial(tweedie_log_eta_mu_deviance, p=self.power) + elif self.power == 3 and isinstance(link, LogLink): + f = partial(inv_gaussian_log_eta_mu_deviance, p=self.power) if f is not None: return f(cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out, factor) @@ -791,7 +716,7 @@ def _eta_mu_deviance( def log_likelihood(self, y, mu, sample_weight=None, dispersion=None) -> float: r"""Compute the log likelihood. - For ``1 < power < 2``, we use the series approximation by Dunn and Smyth + For ``1 < p < 2``, we use the series approximation by Dunn and Smyth (2005) to compute the normalization term. 
Parameters @@ -827,33 +752,14 @@ def log_likelihood(self, y, mu, sample_weight=None, dispersion=None) -> float: return tweedie_log_likelihood( y, sample_weight, mu, float(p), float(dispersion) ) + elif p == 3: + return inv_gaussian_log_likelihood(y, sample_weight, mu, float(dispersion)) else: raise NotImplementedError - def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> float: - r"""Estimate the dispersion parameter :math:`\phi`. - - Parameters - ---------- - y : array-like, shape (n_samples,) - Target values. - - mu : array-like, shape (n_samples,) - Predicted mean. - - sample_weight : array-like, shape (n_samples,), optional (default=None) - Weights or exposure to which variance is inversely proportional. - - ddof : int, optional (default=1) - Degrees of freedom consumed by the model for ``mu``. - - method = {'pearson', 'deviance'}, optional (default='pearson') - Whether to base the estimate on the Pearson residuals or the deviance. - - Returns - ------- - float - """ + def dispersion( # noqa D + self, y, mu, sample_weight=None, ddof=1, method="pearson" + ) -> float: p = self.power # noqa: F841 y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) @@ -870,163 +776,167 @@ def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> flo ) -class NormalDistribution(TweedieDistribution): - """Class for the Normal (a.k.a. Gaussian) distribution.""" +class NormalDistribution(ExponentialDispersionModel): + """Class for the normal (a.k.a. Gaussian) distribution. - def __init__(self): - super().__init__(power=0) + The normal distribution models outcomes ``y`` in ``(-∞, +∞)``. + See the documentation of the superclass, + :class:`~glum.ExponentialDispersionModel`, for details. + """ -class PoissonDistribution(TweedieDistribution): - """Class for the scaled Poisson distribution.""" + lower_bound = -np.inf + upper_bound = np.inf + include_lower_bound = False + include_upper_bound = False - def __init__(self): - super().__init__(power=1) + def __eq__(self, other): # noqa D + return isinstance(other, self.__class__) + def __tweedie_repr__(self): # noqa D + return TweedieDistribution(0) -class GammaDistribution(TweedieDistribution): - """Class for the Gamma distribution.""" + def unit_variance(self, mu) -> np.ndarray: # noqa D + return 1 if np.isscalar(mu) else np.ones_like(mu) - def __init__(self): - super().__init__(power=2) + def unit_variance_derivative(self, mu) -> np.ndarray: # noqa D + return 0 if np.isscalar(mu) else np.zeros_like(mu) + def deviance(self, y, mu, sample_weight=None) -> float: # noqa D -class InverseGaussianDistribution(TweedieDistribution): - """Class for the scaled Inverse Gaussian distribution.""" + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight - def __init__(self): - super().__init__(power=3) + # NOTE: the dispersion parameter is only necessary to convey + # type information on account of a bug in Cython + return normal_deviance(y, sample_weight, mu, dispersion=1.0) -class GeneralizedHyperbolicSecant(ExponentialDispersionModel): - """A class for the Generalized Hyperbolic Secant (GHS) distribution. + def unit_deviance(self, y, mu): # noqa D + return (y - mu) ** 2 - The GHS distribution is for targets ``y`` in ``(-∞, +∞)``. 
- """ + def _rowwise_gradient_hessian( + self, link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ): + if isinstance(link, IdentityLink): + return normal_identity_rowwise_gradient_hessian( + y, sample_weight, eta, mu, gradient_rows, hessian_rows + ) - lower_bound = -np.inf - upper_bound = np.inf - include_lower_bound = False - include_upper_bound = False + return super()._rowwise_gradient_hessian( + link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ) - def __eq__(self, other): # noqa D - return isinstance(other, self.__class__) + def _eta_mu_deviance( + self, + link: Link, + factor: float, + cur_eta, + X_dot_d, + y, + sample_weight, + eta_out, + mu_out, + ): + if isinstance(link, IdentityLink): + return normal_identity_eta_mu_deviance( + cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out, factor + ) - def unit_variance(self, mu: np.ndarray) -> np.ndarray: - """Get the unit-level expected variance. + return super()._eta_mu_deviance( + link, factor, cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out + ) - See superclass documentation. + def log_likelihood(self, y, mu, sample_weight=None, dispersion=None) -> float: + r"""Compute the log likelihood. Parameters ---------- - mu : array-like or float + y : array-like, shape (n_samples,) + Target values. - Returns - ------- - array-like - """ - return 1 + mu**2 + mu : array-like, shape (n_samples,) + Predicted mean. - def unit_variance_derivative(self, mu: np.ndarray) -> np.ndarray: - """Get the derivative of the unit variance. + sample_weight : array-like, shape (n_samples,), optional (default=1) + Sample weights. - See superclass documentation. + dispersion : float, optional (default=None) + Dispersion parameter :math:`\phi`. Estimated if ``None``. + """ + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight - Parameters - ---------- - mu : array-like or float + if dispersion is None: + dispersion = self.dispersion(y, mu, sample_weight) - Returns - ------- - array-like - """ - return 2 * mu + return normal_log_likelihood(y, sample_weight, mu, float(dispersion)) - def unit_deviance(self, y: np.ndarray, mu: np.ndarray) -> np.ndarray: - """Get the unit-level deviance. + def dispersion( # noqa D + self, y, mu, sample_weight=None, ddof=1, method="pearson" + ) -> float: + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) - See superclass documentation. + if method == "pearson": + formula = "(y - mu) ** 2" + if sample_weight is None: + return numexpr.evaluate(formula).sum() / (len(y) - ddof) + else: + formula = f"sample_weight * {formula}" + return numexpr.evaluate(formula).sum() / (sample_weight.sum() - ddof) - Parameters - ---------- - y : array-like - mu : array-like + return super().dispersion( + y, mu, sample_weight=sample_weight, ddof=ddof, method=method + ) - Returns - ------- - array-like - """ - return 2 * y * (np.arctan(y) - np.arctan(mu)) + np.log((1 + mu**2) / (1 + y**2)) +class PoissonDistribution(ExponentialDispersionModel): + """Class for the scaled Poisson distribution. -class BinomialDistribution(ExponentialDispersionModel): - """A class for the Binomial distribution. + The Poisson distribution models discrete outcomes ``y`` in ``[0, +∞)``. - The Binomial distribution is for targets ``y`` in ``[0, 1]``. + See the documentation of the superclass, + :class:`~glum.ExponentialDispersionModel`, for details. 
""" lower_bound = 0 - upper_bound = 1 + upper_bound = np.inf include_lower_bound = True - include_upper_bound = True + include_upper_bound = False def __eq__(self, other): # noqa D return isinstance(other, self.__class__) - def unit_variance(self, mu: np.ndarray) -> np.ndarray: - """Get the unit-level expected variance. - - See superclass documentation. - - Parameters - ---------- - mu : array-like - - Returns - ------- - array-like - """ - return mu * (1 - mu) - - def unit_variance_derivative(self, mu): - """Get the derivative of the unit variance. - - See superclass documentation. + def __tweedie_repr__(self): # noqa D + return TweedieDistribution(1) - Parameters - ---------- - mu : array-like or float + def unit_variance(self, mu) -> np.ndarray: # noqa D + return mu - Returns - ------- - array-like - """ - return 1 - 2 * mu + def unit_variance_derivative(self, mu) -> np.ndarray: # noqa D + return 1.0 if np.isscalar(mu) else np.ones_like(mu) - def unit_deviance(self, y: np.ndarray, mu: np.ndarray) -> np.ndarray: - """Get the unit-level deviance. + def deviance(self, y, mu, sample_weight=None) -> float: # noqa D + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight - See superclass documentation. + # NOTE: the dispersion parameter is only necessary to convey + # type information on account of a bug in Cython - Parameters - ---------- - y : array-like - mu : array-like + return poisson_deviance(y, sample_weight, mu, dispersion=1.0) - Returns - ------- - array-like - """ - # see Wooldridge and Papke (1996) for the fractional case - return -2 * (special.xlogy(y, mu) + special.xlogy(1 - y, 1 - mu)) + def unit_deviance(self, y, mu): + """Compute the unit deviance.""" + return 2 * (special.xlogy(y, y / mu) - y + mu) def _rowwise_gradient_hessian( self, link, y, sample_weight, eta, mu, gradient_rows, hessian_rows ): - if isinstance(link, LogitLink): - return binomial_logit_rowwise_gradient_hessian( + if isinstance(link, LogLink): + return poisson_log_rowwise_gradient_hessian( y, sample_weight, eta, mu, gradient_rows, hessian_rows ) + return super()._rowwise_gradient_hessian( link, y, sample_weight, eta, mu, gradient_rows, hessian_rows ) @@ -1035,23 +945,24 @@ def _eta_mu_deviance( self, link: Link, factor: float, - cur_eta: np.ndarray, - X_dot_d: np.ndarray, - y: np.ndarray, - sample_weight: np.ndarray, - eta_out: np.ndarray, - mu_out: np.ndarray, + cur_eta, + X_dot_d, + y, + sample_weight, + eta_out, + mu_out, ): - if isinstance(link, LogitLink): - return binomial_logit_eta_mu_deviance( + if isinstance(link, LogLink): + return poisson_log_eta_mu_deviance( cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out, factor ) + return super()._eta_mu_deviance( link, factor, cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out ) - def log_likelihood(self, y, mu, sample_weight=None, dispersion=1) -> float: - """Compute the log likelihood. + def log_likelihood(self, y, mu, sample_weight=None, dispersion=None) -> float: + r"""Compute the log likelihood. Parameters ---------- @@ -1064,14 +975,108 @@ def log_likelihood(self, y, mu, sample_weight=None, dispersion=1) -> float: sample_weight : array-like, shape (n_samples,), optional (default=1) Sample weights. - dispersion : float, optional (default=1) - Ignored. + dispersion : float, optional (default=None) + Dispersion parameter :math:`\phi`. Estimated if ``None``. 
""" - ll = special.xlogy(y, mu) + special.xlogy(1 - y, 1 - mu) - return np.sum(ll) if sample_weight is None else np.dot(ll, sample_weight) + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight - def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> float: - r"""Estimate the dispersion parameter :math:`\phi`. + # NOTE: the dispersion parameter is only necessary to convey + # type information on account of a bug in Cython + + return poisson_log_likelihood(y, sample_weight, mu, 1.0) + + def dispersion( # noqa D + self, y, mu, sample_weight=None, ddof=1, method="pearson" + ) -> float: + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + + if method == "pearson": + formula = "((y - mu) ** 2) / mu" + if sample_weight is None: + return numexpr.evaluate(formula).sum() / (len(y) - ddof) + else: + formula = f"sample_weight * {formula}" + return numexpr.evaluate(formula).sum() / (sample_weight.sum() - ddof) + + return super().dispersion( + y, mu, sample_weight=sample_weight, ddof=ddof, method=method + ) + + +class GammaDistribution(ExponentialDispersionModel): + """Class for the gamma distribution. + + The gamma distribution models outcomes ``y`` in ``(0, +∞)``. + + See the documentation of the superclass, + :class:`~glum.ExponentialDispersionModel`, for details. + """ + + lower_bound = 0 + upper_bound = np.inf + include_lower_bound = False + include_upper_bound = False + + def __eq__(self, other): # noqa D + return isinstance(other, self.__class__) + + def __tweedie_repr__(self): # noqa D + return TweedieDistribution(2) + + def unit_variance(self, mu) -> np.ndarray: # noqa D + return mu**2 + + def unit_variance_derivative(self, mu) -> np.ndarray: # noqa D + return 2 * mu + + def deviance(self, y, mu, sample_weight=None) -> float: # noqa D + + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight + + # NOTE: the dispersion parameter is only necessary to convey + # type information on account of a bug in Cython + + return gamma_deviance(y, sample_weight, mu, dispersion=1.0) + + def unit_deviance(self, y, mu): # noqa D + return 2 * (np.log(mu / y) + y / mu - 1) + + def _rowwise_gradient_hessian( + self, link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ): + if isinstance(link, LogLink): + return gamma_log_rowwise_gradient_hessian( + y, sample_weight, eta, mu, gradient_rows, hessian_rows + ) + + return super()._rowwise_gradient_hessian( + link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ) + + def _eta_mu_deviance( + self, + link: Link, + factor: float, + cur_eta, + X_dot_d, + y, + sample_weight, + eta_out, + mu_out, + ): + if isinstance(link, LogLink): + return gamma_log_eta_mu_deviance( + cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out, factor + ) + + return super()._eta_mu_deviance( + link, factor, cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out + ) + + def log_likelihood(self, y, mu, sample_weight=None, dispersion=None) -> float: + r"""Compute the log likelihood. Parameters ---------- @@ -1081,19 +1086,251 @@ def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> flo mu : array-like, shape (n_samples,) Predicted mean. - sample_weight : array-like, shape (n_samples,), optional (default=None) - Weights or exposure to which variance is inversely proportional. 
+ sample_weight : array-like, shape (n_samples,), optional (default=1) + Sample weights. - ddof : int, optional (default=1) - Degrees of freedom consumed by the model for ``mu``. + dispersion : float, optional (default=None) + Dispersion parameter :math:`\phi`. Estimated if ``None``. + """ + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight - method = {'pearson', 'deviance'}, optional (default='pearson') - Whether to base the estimate on the Pearson residuals or the deviance. + if dispersion is None: + dispersion = self.dispersion(y, mu, sample_weight) - Returns - ------- - float + return gamma_log_likelihood(y, sample_weight, mu, float(dispersion)) + + def dispersion( # noqa D + self, y, mu, sample_weight=None, ddof=1, method="pearson" + ) -> float: + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + + if method == "pearson": + formula = "((y - mu) ** 2) / (mu ** 2)" + if sample_weight is None: + return numexpr.evaluate(formula).sum() / (len(y) - ddof) + else: + formula = f"sample_weight * {formula}" + return numexpr.evaluate(formula).sum() / (sample_weight.sum() - ddof) + + return super().dispersion( + y, mu, sample_weight=sample_weight, ddof=ddof, method=method + ) + + +class InverseGaussianDistribution(ExponentialDispersionModel): + """Class for the inverse Gaussian distribution. + + The inverse Gaussian distribution models outcomes ``y`` in ``(0, +∞)``. + + See the documentation of the superclass, + :class:`~glum.ExponentialDispersionModel`, for details. + """ + + lower_bound = 0 + upper_bound = np.inf + include_lower_bound = False + include_upper_bound = False + + def __eq__(self, other): # noqa D + return isinstance(other, self.__class__) + + def __tweedie_repr__(self): # noqa D + return TweedieDistribution(3) + + def unit_variance(self, mu) -> np.ndarray: # noqa D + return mu**3 + + def unit_variance_derivative(self, mu) -> np.ndarray: # noqa D + return 3 * (mu**2) + + def deviance(self, y, mu, sample_weight=None) -> float: # noqa D + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight + + return tweedie_deviance(y, sample_weight, mu, p=3.0) + + def unit_deviance(self, y, mu): # noqa D + return numexpr.evaluate("y / (mu**2) + 1 / y - 2 / mu") + + def _rowwise_gradient_hessian( + self, link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ): + return super()._rowwise_gradient_hessian( + link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ) + + def _eta_mu_deviance( + self, + link: Link, + factor: float, + cur_eta, + X_dot_d, + y, + sample_weight, + eta_out, + mu_out, + ): + if isinstance(link, LogLink): + return tweedie_log_eta_mu_deviance( + cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out, factor, p=3.0 + ) + + return super()._eta_mu_deviance( + link, factor, cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out + ) + + def log_likelihood(self, y, mu, sample_weight=None, dispersion=None) -> float: + r"""Compute the log likelihood. + + Parameters + ---------- + y : array-like, shape (n_samples,) + Target values. + + mu : array-like, shape (n_samples,) + Predicted mean. + + sample_weight : array-like, shape (n_samples,), optional (default=1) + Sample weights. + + dispersion : float, optional (default=None) + Dispersion parameter :math:`\phi`. Estimated if ``None``. 
+ """ + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + sample_weight = np.ones_like(y) if sample_weight is None else sample_weight + + if dispersion is None: + dispersion = self.dispersion(y, mu, sample_weight) + + return tweedie_log_likelihood(y, sample_weight, mu, 3.0, float(dispersion)) + + def dispersion( # noqa D + self, y, mu, sample_weight=None, ddof=1, method="pearson" + ) -> float: + y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) + + if method == "pearson": + formula = "((y - mu) ** 2) / (mu ** 3)" + if sample_weight is None: + return numexpr.evaluate(formula).sum() / (len(y) - ddof) + else: + formula = f"sample_weight * {formula}" + return numexpr.evaluate(formula).sum() / (sample_weight.sum() - ddof) + + return super().dispersion( + y, mu, sample_weight=sample_weight, ddof=ddof, method=method + ) + + +class GeneralizedHyperbolicSecant(ExponentialDispersionModel): + """A class for the Generalized Hyperbolic Secant (GHS) distribution. + + The GHS distribution models outcomes ``y`` in ``(-∞, +∞)``. + + See the documentation of the superclass, + :class:`~glum.ExponentialDispersionModel`, for details. + """ + + lower_bound = -np.inf + upper_bound = np.inf + include_lower_bound = False + include_upper_bound = False + + def __eq__(self, other): # noqa D + return isinstance(other, self.__class__) + + def unit_variance(self, mu) -> np.ndarray: # noqa D + return 1 + mu**2 + + def unit_variance_derivative(self, mu) -> np.ndarray: # noqa D + return 2 * mu + + def unit_deviance(self, y, mu) -> np.ndarray: # noqa D + return 2 * y * (np.arctan(y) - np.arctan(mu)) + np.log((1 + mu**2) / (1 + y**2)) + + +class BinomialDistribution(ExponentialDispersionModel): + """A class for the Binomial distribution. + + The Binomial distribution models outcomes ``y`` in ``[0, 1]``. + + See the documentation of the superclass, + :class:`~glum.ExponentialDispersionModel`, for details. + """ + + lower_bound = 0 + upper_bound = 1 + include_lower_bound = True + include_upper_bound = True + + def __eq__(self, other): # noqa D + return isinstance(other, self.__class__) + + def unit_variance(self, mu): # noqa D + return mu * (1 - mu) + + def unit_variance_derivative(self, mu): # noqa D + return 1 - 2 * mu + + def unit_deviance(self, y, mu): # noqa D + # see Wooldridge and Papke (1996) for the fractional case + return -2 * (special.xlogy(y, mu) + special.xlogy(1 - y, 1 - mu)) + + def _rowwise_gradient_hessian( + self, link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ): + if isinstance(link, LogitLink): + return binomial_logit_rowwise_gradient_hessian( + y, sample_weight, eta, mu, gradient_rows, hessian_rows + ) + return super()._rowwise_gradient_hessian( + link, y, sample_weight, eta, mu, gradient_rows, hessian_rows + ) + + def _eta_mu_deviance( + self, + link: Link, + factor: float, + cur_eta, + X_dot_d, + y, + sample_weight, + eta_out, + mu_out, + ): + if isinstance(link, LogitLink): + return binomial_logit_eta_mu_deviance( + cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out, factor + ) + + return super()._eta_mu_deviance( + link, factor, cur_eta, X_dot_d, y, sample_weight, eta_out, mu_out + ) + + def log_likelihood(self, y, mu, sample_weight=None, dispersion=1) -> float: + """Compute the log likelihood. + + Parameters + ---------- + y : array-like, shape (n_samples,) + Target values. + + mu : array-like, shape (n_samples,) + Predicted mean. + + sample_weight : array-like, shape (n_samples,), optional (default=1) + Sample weights. 
+ + dispersion : float, optional (default=1) + Ignored. """ + ll = special.xlogy(y, mu) + special.xlogy(1 - y, 1 - mu) + return np.sum(ll) if sample_weight is None else np.dot(ll, sample_weight) + + def dispersion( # noqa D + self, y, mu, sample_weight=None, ddof=1, method="pearson" + ) -> float: y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) if method == "pearson": @@ -1112,14 +1349,14 @@ def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> flo class NegativeBinomialDistribution(ExponentialDispersionModel): r"""A class for the Negative Binomial distribution. - A Negative Binomial distribution with mean :math:`\mu = \mathrm{E}(Y)` is uniquely - defined by its mean-variance relationship + A negative binomial distribution with mean :math:`\mu = \mathrm{E}(Y)` is + uniquely defined by its mean-variance relationship :math:`\mathrm{var}(Y) \propto \mu + \theta * \mu^2`. Parameters ---------- theta : float, optional (default=1.0) - The dispersion parameter from `unit_variance` + The dispersion parameter from the ``unit_variance`` :math:`v(\mu) = \mu + \theta * \mu^2`. For :math:`\theta <= 0`, no distribution exists. @@ -1146,74 +1383,34 @@ def __eq__(self, other): # noqa D @property def theta(self) -> float: - """Return the Negative Binomial theta parameter.""" + """Return the negative binomial theta parameter.""" return self._theta @theta.setter def theta(self, theta): + if not isinstance(theta, (int, float)): - raise TypeError(f"theta must be an int or float, input was {theta}") + raise TypeError(f"Theta must be numeric; got {theta}.") if not theta > 0: - raise ValueError( - f"theta must be strictly positive number, input was {theta}" - ) + raise ValueError(f"Theta must be strictly positive; got was {theta}.") # Prevents upcasting when working with 32-bit data - self._theta = np.float32(theta) - - def unit_variance(self, mu: np.ndarray) -> np.ndarray: - """Compute the unit variance of a Negative Binomial distribution - ``v(mu) = mu + theta * mu^2``. + self._theta = theta if isinstance(theta, int) else np.float32(theta) - Parameters - ---------- - mu : array-like, shape (n_samples,) - Predicted mean. - - Returns - ------- - numpy.ndarray, shape (n_samples,) - """ + def unit_variance(self, mu): # noqa D return mu + self.theta * mu**2 - def unit_variance_derivative(self, mu: np.ndarray) -> np.ndarray: - r"""Compute the derivative of the unit variance of a Negative Binomial distribution. - - Equation: :math:`v(\mu) = 1 + 2 \times \theta \times \mu`. - - Parameters - ---------- - mu : array-like, shape (n_samples,) - Predicted mean. - - Returns - ------- - numpy.ndarray, shape (n_samples,) - """ + def unit_variance_derivative(self, mu): # noqa D return 1 + 2 * self.theta * mu - def deviance(self, y, mu, sample_weight=None) -> float: - """Compute the deviance. - - Parameters - ---------- - y : array-like, shape (n_samples,) - Target values. - - mu : array-like, shape (n_samples,) - Predicted mean. - - sample_weight : array-like, shape (n_samples,), optional (default=1) - Sample weights. 
- """ + def deviance(self, y, mu, sample_weight=None) -> float: # noqa D theta = self.theta y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) sample_weight = np.ones_like(y) if sample_weight is None else sample_weight return negative_binomial_deviance(y, sample_weight, mu, theta=float(theta)) - def unit_deviance(self, y: np.ndarray, mu: np.ndarray) -> np.ndarray: - """Get the deviance of each observation.""" + def unit_deviance(self, y, mu): # noqa D theta = self.theta r = 1.0 / theta @@ -1235,12 +1432,12 @@ def _eta_mu_deviance( self, link: Link, factor: float, - cur_eta: np.ndarray, - X_dot_d: np.ndarray, - y: np.ndarray, - sample_weight: np.ndarray, - eta_out: np.ndarray, - mu_out: np.ndarray, + cur_eta, + X_dot_d, + y, + sample_weight, + eta_out, + mu_out, ): if isinstance(link, LogLink): return negative_binomial_log_eta_mu_deviance( @@ -1280,30 +1477,9 @@ def log_likelihood(self, y, mu, sample_weight=None, dispersion=1) -> float: return negative_binomial_log_likelihood(y, sample_weight, mu, float(theta), 1.0) - def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> float: - r"""Estimate the dispersion parameter :math:`\phi`. - - Parameters - ---------- - y : array-like, shape (n_samples,) - Target values. - - mu : array-like, shape (n_samples,) - Predicted mean. - - sample_weight : array-like, shape (n_samples,), optional (default=None) - Weights or exposure to which variance is inversely proportional. - - ddof : int, optional (default=1) - Degrees of freedom consumed by the model for ``mu``. - - method = {'pearson', 'deviance'}, optional (default='pearson') - Whether to base the estimate on the Pearson residuals or the deviance. - - Returns - ------- - float - """ + def dispersion( # noqa D + self, y, mu, sample_weight=None, ddof=1, method="pearson" + ) -> float: theta = self.theta # noqa: F841 y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) @@ -1321,8 +1497,8 @@ def dispersion(self, y, mu, sample_weight=None, ddof=1, method="pearson") -> flo def guess_intercept( - y: np.ndarray, - sample_weight: np.ndarray, + y, + sample_weight, link: Link, distribution: ExponentialDispersionModel, eta: Union[np.ndarray, float] = None, @@ -1341,51 +1517,77 @@ def guess_intercept( avg_y = np.average(y, weights=sample_weight) if isinstance(link, IdentityLink): - # This is only correct for normal. For other distributions, answer is unknown, - # but assume that we want sum(y) = sum(mu) + + # This is only correct for the normal. 
For other distributions, the + # answer is unknown, but we assume that we want `sum(y) = sum(mu)` + if eta is None: return avg_y + avg_eta = eta if np.isscalar(eta) else np.average(eta, weights=sample_weight) + return avg_y - avg_eta + elif isinstance(link, LogLink): + # This is only correct for Tweedie + log_avg_y = np.log(avg_y) + assert np.isfinite(log_avg_y).all() if eta is None: return log_avg_y + mu = np.exp(eta) + if isinstance(distribution, TweedieDistribution): p = distribution.power + elif isinstance(distribution, NormalDistribution): + p = 0 + elif isinstance(distribution, PoissonDistribution): + p = 1 + elif isinstance(distribution, GammaDistribution): + p = 2 + elif isinstance(distribution, InverseGaussianDistribution): + p = 3 else: p = 1 # Like Poisson + if np.isscalar(mu): first = np.log(y.dot(sample_weight) * mu ** (1 - p)) second = np.log(sample_weight.sum() * mu ** (2 - p)) else: first = np.log((y * mu ** (1 - p)).dot(sample_weight)) second = np.log((mu ** (2 - p)).dot(sample_weight)) + return first - second + elif isinstance(link, LogitLink): + log_odds = np.log(avg_y) - np.log(1 - avg_y) + if eta is None: return log_odds + avg_eta = eta if np.isscalar(eta) else np.average(eta, weights=sample_weight) + return log_odds - avg_eta + else: + return link.link(y.dot(sample_weight)) def get_one_over_variance( distribution: ExponentialDispersionModel, link: Link, - mu: np.ndarray, - eta: np.ndarray, - dispersion, - sample_weight: np.ndarray, + mu, + eta, + dispersion: float, + sample_weight, ): - """ - Get one over the variance. + """Get one over the variance. For Tweedie: ``sigma_inv = sample_weight / (mu ** p)`` during optimization, because ``phi = 1``. diff --git a/src/glum/_functions.pyx b/src/glum/_functions.pyx index 9d805fc9f..6c19a2b0f 100644 --- a/src/glum/_functions.pyx +++ b/src/glum/_functions.pyx @@ -41,9 +41,6 @@ def normal_identity_eta_mu_deviance( for i in prange(n, nogil=True): eta_out[i] = cur_eta[i] + factor * X_dot_d[i] mu_out[i] = eta_out[i] - # Note: deviance is equal to -2 times the true log likelihood to match - # the default calculation using unit_deviance in _distribution.py - # True log likelihood: -1/2 * (y - mu)**2 deviance += weights[i] * (y[i] - mu_out[i]) ** 2 return deviance @@ -114,7 +111,6 @@ def poisson_log_eta_mu_deviance( for i in prange(n, nogil=True): eta_out[i] = cur_eta[i] + factor * X_dot_d[i] mu_out[i] = exp(eta_out[i]) - # True log likelihood: y * eta - mu - lgamma(1 + y) deviance += weights[i] * (y[i] * eta_out[i] - mu_out[i]) return -2 * deviance @@ -183,7 +179,6 @@ def gamma_log_eta_mu_deviance( for i in prange(n, nogil=True): eta_out[i] = cur_eta[i] + factor * X_dot_d[i] mu_out[i] = exp(eta_out[i]) - # True log likelihood: -(y / mu + eta) deviance += weights[i] * (y[i] / mu_out[i] + eta_out[i]) return 2 * deviance @@ -238,6 +233,92 @@ def gamma_deviance( return 2 * D +def inv_gaussian_log_eta_mu_deviance( + const_floating1d cur_eta, + const_floating1d X_dot_d, + const_floating1d y, + const_floating1d weights, + floating[:] eta_out, + floating[:] mu_out, + floating factor +): + cdef int n = cur_eta.shape[0] + cdef int i # loop counter + cdef floating sq_err # helper + cdef floating deviance = 0.0 # output + + for i in prange(n, nogil=True): + + eta_out[i] = cur_eta[i] + factor * X_dot_d[i] + mu_out[i] = exp(eta_out[i]) + + sq_err = (y[i] / mu_out[i] - 1) ** 2 + + deviance += weights[i] * sq_err / y[i] + + return deviance + +def inv_gaussian_log_rowwise_gradient_hessian( + const_floating1d y, + const_floating1d weights, + 
const_floating1d eta, + const_floating1d mu, + floating[:] gradient_rows_out, + floating[:] hessian_rows_out +): + cdef int n = eta.shape[0] + cdef int i # loop counter + + cdef floating inv_mu, inv_mu2 + + for i in prange(n, nogil=True): + + inv_mu = 1 / mu[i] + inv_mu2 = inv_mu ** 2 + + gradient_rows_out[i] = 2 * weights[i] * (inv_mu - y[i] * inv_mu2) + hessian_rows_out[i] = 2 * weights[i] * (inv_mu - 2 * y[i] * inv_mu2) + +def inv_gaussian_log_likelihood( + const_floating1d y, + const_floating1d weights, + const_floating1d mu, + floating dispersion, +): + cdef int n = y.shape[0] # loop length + cdef int i # loop counter + cdef floating sum_weights # helper + cdef floating ll = 0.0 # output + + cdef floating sq_err # helper + cdef floating inv_dispersion = 1 / (2 * dispersion) # helper + + for i in prange(n, nogil=True): + + sq_err = (y[i] / mu[i] - 1) ** 2 + + ll -= weights[i] * (inv_dispersion * sq_err / y[i] + log(y[i]) * 3 / 2) + sum_weights -= weights[i] + + return ll + sum_weights * log(inv_dispersion / M_PI) + +def inv_gaussian_deviance( + const_floating1d y, + const_floating1d weights, + const_floating1d mu, + floating dispersion, +): + cdef int i # loop counter + cdef int n = y.shape[0] # loop length + cdef floating sq_err # helper + cdef floating D = 0.0 # output + + for i in prange(n, nogil=True): + sq_err = (y[i] / mu[i] - 1) ** 2 + D += weights[i] * sq_err / y[i] + + return D + def tweedie_log_eta_mu_deviance( const_floating1d cur_eta, const_floating1d X_dot_d, @@ -451,7 +532,6 @@ def negative_binomial_log_eta_mu_deviance( for i in prange(n, nogil=True): eta_out[i] = cur_eta[i] + factor * X_dot_d[i] mu_out[i] = exp(eta_out[i]) - # True log likelihood: y * log(y / mu) - (y + r) * log((y + r) / (mu + r)) deviance += weights[i] * (-y[i] * eta_out[i] + (y[i] + r) * log(mu_out[i] + r)) return 2 * deviance diff --git a/src/glum/_glm.py b/src/glum/_glm.py index a92df0c46..b61449033 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -19,14 +19,18 @@ import re import sys import warnings -from collections.abc import Iterable, Sequence -from itertools import chain +from collections.abc import Iterable, Mapping, Sequence from typing import Any, NamedTuple, Optional, Union, cast import numpy as np import pandas as pd +import scipy.sparse as sps import scipy.sparse.linalg as splinalg import tabmat as tm +from formulaic import Formula, FormulaSpec +from formulaic.parser import DefaultFormulaParser +from formulaic.utils.constraints import LinearConstraintParser +from formulaic.utils.context import capture_context from scipy import linalg, sparse, stats from sklearn.base import BaseEstimator, RegressorMixin from sklearn.utils import check_array @@ -59,7 +63,13 @@ _least_squares_solver, _trust_constr_solver, ) -from ._util import _align_df_categories, _positional_args_deprecated, _safe_toarray +from ._util import ( + _add_missing_categories, + _align_df_categories, + _expand_categorical_penalties, + _is_contiguous, + _safe_toarray, +) _float_itemsize_to_dtype = {8: np.float64, 4: np.float32, 2: np.float16} @@ -92,8 +102,8 @@ class WaldTestResult(NamedTuple): def check_array_tabmat_compliant(mat: ArrayLike, drop_first: int = False, **kwargs): to_copy = kwargs.get("copy", False) - if isinstance(mat, pd.DataFrame) and any(mat.dtypes == "category"): - mat = tm.from_pandas(mat, drop_first=drop_first) + if isinstance(mat, pd.DataFrame): + raise RuntimeError("DataFrames should have been converted by this point.") if isinstance(mat, tm.SplitMatrix): kwargs.update({"ensure_min_features": 0}) @@ 
-116,10 +126,15 @@ def check_array_tabmat_compliant(mat: ArrayLike, drop_first: int = False, **kwar ) original_type = type(mat) - res = check_array(mat, **kwargs) + if isinstance(mat, (tm.DenseMatrix, tm.SparseMatrix)): + res = check_array(mat.unpack(), **kwargs) + else: + res = check_array(mat, **kwargs) if res is not mat and original_type in (tm.DenseMatrix, tm.SparseMatrix): - res = original_type(res) # type: ignore + res = original_type( + res, column_names=mat.column_names, term_names=mat.term_names # type: ignore + ) return res @@ -218,18 +233,49 @@ def _check_offset( return offset -def _name_categorical_variables( - categories: tuple[str], column_name: str, drop_first: bool -): - new_names = [ - f"{column_name}__{category}" for category in categories[int(drop_first) :] - ] - if len(new_names) == 0: - raise ValueError( - f"Categorical column: {column_name}, contains only one category. " - + "This should be dropped from the feature matrix." - ) - return new_names +def _parse_formula( + formula: FormulaSpec, include_intercept: bool = True +) -> tuple[Optional[Formula], Formula]: + """ + Parse and transform the formula for use in a GeneralizedLinearRegressor. + + The left-hand side and right-hand side of the formula are separated. If an + intercept is present, it will be removed from the right-hand side, and a + boolean flag to indicate whether or not an intercept should be added to + the model will be returned. + + Parameters + ---------- + formula : formulaic.FormulaSpec + The formula to parse. + include_intercept: bool, default True + Whether to include an intercept column. + + Returns + ------- + tuple[formulaic.Formula, formulaic.Formula] + The left-hand side and right-hand sides of the formula. + """ + if isinstance(formula, str): + parser = DefaultFormulaParser(include_intercept=include_intercept) + terms = parser.get_terms(formula) + elif isinstance(formula, Formula): + terms = formula + else: + raise TypeError("formula must be a string or Formula object.") + + if hasattr(terms, "lhs"): + lhs_terms = terms.lhs + if len(lhs_terms) != 1: + raise ValueError( + "formula must have exactly one term on the left-hand side." + ) + rhs_terms = terms.rhs + else: + lhs_terms = None + rhs_terms = terms + + return lhs_terms, rhs_terms def check_bounds( @@ -453,23 +499,19 @@ def get_family( def get_link(link: Union[str, Link], family: ExponentialDispersionModel) -> Link: """ - For the Tweedie distribution, this code follows actuarial best practices regarding - link functions. Note that these links are sometimes not canonical: - - identity for normal (``p=0``); + For the Tweedie distribution, this code follows actuarial best practices + regarding link functions. Note that these links are sometimes not canonical: + - identity for normal (``p = 0``); - no convention for ``p < 0``, so let's leave it as identity; - log otherwise. """ if isinstance(link, Link): return link - if link == "auto": - if isinstance(family, TweedieDistribution): - if family.power <= 0: + + if (link is None) or (link == "auto"): + if tweedie_representation := family.to_tweedie(safe=False): + if tweedie_representation.power <= 0: return IdentityLink() - if family.power < 1: - raise ValueError( - "For 0 < p < 1, no Tweedie distribution exists. " - "Please choose a different distribution." - ) return LogLink() if isinstance(family, GeneralizedHyperbolicSecant): return IdentityLink() @@ -482,16 +524,20 @@ def get_link(link: Union[str, Link], family: ExponentialDispersionModel) -> Link "Please set link manually, i.e. 
not to 'auto'. " f"Got (link='auto', family={family.__class__.__name__})." ) - if link == "identity": - return IdentityLink() - if link == "log": - return LogLink() - if link == "logit": - return LogitLink() - if link == "cloglog": - return CloglogLink() - if link[:7] == "tweedie": - return TweedieLink(float(link[7:])) + + mapping = { + "cloglog": CloglogLink(), + "identity": IdentityLink(), + "log": LogLink(), + "logit": LogitLink(), + "tweedie": TweedieLink(1.5), + } + + if link in mapping: + return mapping[link] + if custom_tweedie := re.search(r"tweedie\s?\((.+)\)", link): + return TweedieLink(float(custom_tweedie.group(1))) + raise ValueError( "The link must be an instance of class Link or an element of " "['auto', 'identity', 'log', 'logit', 'cloglog', 'tweedie']; " @@ -675,19 +721,13 @@ def is_pos_semidef(p: Union[sparse.spmatrix, np.ndarray]) -> Union[bool, np.bool return np.all(eigenvalues >= epsneg) -def _group_sum(groups: np.ndarray, data: np.ndarray): +def _group_sum(groups: np.ndarray, data: tm.MatrixBase): """Sum over groups.""" ngroups = len(np.unique(groups)) out = np.empty((ngroups, data.shape[1])) - if sparse.issparse(data) or isinstance( - data, (tm.SplitMatrix, tm.CategoricalMatrix) - ): - eye_n = np.eye(ngroups)[:, groups] - for i in range(data.shape[1]): - out[:, i] = (eye_n @ data.getcol(i)).ravel() - else: - for i in range(data.shape[1]): - out[:, i] = np.bincount(groups, weights=data[:, i]) + eye_n = sps.eye(ngroups, format="csc")[:, groups] + for i in range(data.shape[1]): + out[:, i] = _safe_toarray(eye_n @ data.getcol(i).unpack()).ravel() return out @@ -700,6 +740,7 @@ class GeneralizedLinearRegressorBase(BaseEstimator, RegressorMixin): def __init__( self, + *, l1_ratio: float = 0, P1: Optional[Union[str, np.ndarray]] = "identity", P2: Optional[Union[str, np.ndarray, sparse.spmatrix]] = "identity", @@ -731,6 +772,11 @@ def __init__( drop_first: bool = False, robust: bool = True, expected_information: bool = False, + formula: Optional[FormulaSpec] = None, + interaction_separator: str = ":", + categorical_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", ): self.l1_ratio = l1_ratio self.P1 = P1 @@ -763,6 +809,11 @@ def __init__( self.drop_first = drop_first self.robust = robust self.expected_information = expected_information + self.formula = formula + self.interaction_separator = interaction_separator + self.categorical_format = categorical_format + self.cat_missing_method = cat_missing_method + self.cat_missing_name = cat_missing_name @property def family_instance(self) -> ExponentialDispersionModel: @@ -836,6 +887,44 @@ def _get_start_coef( return coef + def _convert_from_pandas( + self, df: pd.DataFrame, context: Optional[Mapping[str, Any]] = None + ) -> tm.MatrixBase: + """Convert a pandas data frame to a tabmat matrix.""" + if hasattr(self, "X_model_spec_"): + return self.X_model_spec_.get_model_matrix(df, context=context) + + cat_missing_method_after_alignment = getattr(self, "cat_missing_method", "fail") + + if hasattr(self, "feature_dtypes_"): + df = _align_df_categories( + df, + self.feature_dtypes_, + getattr(self, "has_missing_category_", {}), + cat_missing_method_after_alignment, + ) + if cat_missing_method_after_alignment == "convert": + df = _add_missing_categories( + df=df, + dtypes=self.feature_dtypes_, + feature_names=self.feature_names_, + cat_missing_name=self.cat_missing_name, + categorical_format=self.categorical_format, + ) + # there should be no missing categories after this + 
cat_missing_method_after_alignment = "fail" + + X = tm.from_pandas( + df, + drop_first=self.drop_first, + categorical_format=getattr( # convention prior to v3 + self, "categorical_format", "{name}__{category}" + ), + cat_missing_method=cat_missing_method_after_alignment, + ) + + return X + def _set_up_for_fit(self, y: np.ndarray) -> None: ####################################################################### # 1. input validation # @@ -862,7 +951,9 @@ def _set_up_for_fit(self, y: np.ndarray) -> None: elif (self.lower_bounds is None) and (self.upper_bounds is None): if np.all(np.asarray(self.l1_ratio) == 0): self._solver = "irls-ls" - elif getattr(self, "alpha", 1) == 0 and not self.alpha_search: + elif ( + hasattr(self, "alpha") and self.alpha == 0 and not self.alpha_search + ): self._solver = "irls-ls" else: self._solver = "irls-cd" @@ -1123,9 +1214,8 @@ def _solve_regularization_path( return self.coef_path_ - @_positional_args_deprecated() def report_diagnostics( - self, full_report: bool = False, custom_columns: Optional[Iterable] = None + self, *, full_report: bool = False, custom_columns: Optional[Iterable] = None ) -> None: """Print diagnostics to ``stdout``. @@ -1139,7 +1229,9 @@ def report_diagnostics( custom_columns : iterable, optional (default=None) Print only the specified columns. """ - diagnostics = self.get_formatted_diagnostics(full_report, custom_columns) + diagnostics = self.get_formatted_diagnostics( + full_report=full_report, custom_columns=custom_columns + ) if isinstance(diagnostics, str): print(diagnostics) return @@ -1150,11 +1242,10 @@ def report_diagnostics( with pd.option_context("display.max_rows", None, "display.max_columns", None): print(diagnostics) - @_positional_args_deprecated() def get_formatted_diagnostics( - self, full_report: bool = False, custom_columns: Optional[Iterable] = None + self, *, full_report: bool = False, custom_columns: Optional[Iterable] = None ) -> Union[str, pd.DataFrame]: - """Get formatted diagnostics; can be printed with _report_diagnostics. + """Get formatted diagnostics which can be printed with report_diagnostics. Parameters ---------- @@ -1208,13 +1299,14 @@ def _find_alpha_index(self, alpha): f"{self._alphas}. Consider specifying the index directly via 'alpha_index'." ) - @_positional_args_deprecated(("X", "offset")) def linear_predictor( self, X: ArrayLike, offset: Optional[ArrayLike] = None, + *, alpha_index: Optional[Union[int, Sequence[int]]] = None, alpha: Optional[Union[float, Sequence[float]]] = None, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """Compute the linear predictor, ``X * coef_ + intercept_``. @@ -1239,6 +1331,14 @@ def linear_predictor( Sets the alpha(s) to use in case ``alpha_search`` is ``True``. Incompatible with ``alpha_index`` (see above). + context : Optional[Union[int, Mapping[str, Any]]], default=None + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. By default, + no context is added. Set ``context=0`` to make the calling scope + available. 
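Usage note (illustrative, not part of the diff): the new ``context`` argument only matters for models specified via a formula that references objects which are not columns of the data frame. A minimal sketch, assuming hypothetical data ``df`` and a user-defined transform ``cap`` living in the calling scope:

```python
import numpy as np
import pandas as pd
from glum import GeneralizedLinearRegressor

df = pd.DataFrame({"y": np.random.rand(100), "x": np.random.rand(100)})

def cap(col, at):
    # custom transform referenced inside the formula
    return np.minimum(col, at)

model = GeneralizedLinearRegressor(formula="y ~ cap(x, 0.5)")
model.fit(df, context=0)                     # context=0 exposes `cap` to the formula
eta = model.linear_predictor(df, context=0)
mu = model.predict(df, context=0)
```

Passing an explicit mapping such as ``context={"cap": cap}`` works as well and avoids capturing the whole calling scope.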
+ Returns ------- array, shape (n_samples, n_alphas) @@ -1253,6 +1353,12 @@ def linear_predictor( elif alpha is not None: alpha_index = [self._find_alpha_index(a) for a in alpha] # type: ignore + if isinstance(X, pd.DataFrame): + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + X = self._convert_from_pandas(X, context=captured_context) + X = check_array_tabmat_compliant( X, accept_sparse=["csr", "csc", "coo"], @@ -1284,14 +1390,15 @@ def linear_predictor( return xb - @_positional_args_deprecated(unchanged_args=("X", "sample_weight", "offset")) def predict( self, X: ShapedArrayLike, sample_weight: Optional[ArrayLike] = None, offset: Optional[ArrayLike] = None, + *, alpha_index: Optional[Union[int, Sequence[int]]] = None, alpha: Optional[Union[float, Sequence[float]]] = None, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """Predict using GLM with feature matrix ``X``. @@ -1319,23 +1426,25 @@ def predict( Sets the alpha(s) to use in case ``alpha_search`` is ``True``. Incompatible with ``alpha_index`` (see above). + context : Optional[Union[int, Mapping[str, Any]]], default=None + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. By default, + no context is added. Set ``context=0`` to make the calling scope + available. + Returns ------- array, shape (n_samples, n_alphas) Predicted values times ``sample_weight``. """ - if isinstance(X, pd.DataFrame) and hasattr(self, "feature_dtypes_"): - X = _align_df_categories(X, self.feature_dtypes_) + if isinstance(X, pd.DataFrame): + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + X = self._convert_from_pandas(X, context=captured_context) - X = check_array_tabmat_compliant( - X, - accept_sparse=["csr", "csc", "coo"], - dtype="numeric", - copy=self._should_copy_X(), - ensure_2d=True, - allow_nd=False, - drop_first=getattr(self, "drop_first", False), - ) eta = self.linear_predictor( X, offset=offset, alpha_index=alpha_index, alpha=alpha ) @@ -1347,19 +1456,20 @@ def predict( sample_weight = _check_weights(sample_weight, X.shape[0], X.dtype) return mu * sample_weight - @_positional_args_deprecated(("X", "y", "sample_weight", "offset"), 0) def coef_table( self, - confidence_level=0.95, X=None, y=None, - mu=None, - offset=None, sample_weight=None, + offset=None, + *, + confidence_level=0.95, + mu=None, dispersion=None, robust=None, clusters: np.ndarray = None, expected_information=None, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """Get a table of of the regression coefficients. @@ -1394,6 +1504,13 @@ def coef_table( Whether to use the expected or observed information matrix. Only relevant when computing robust standard errors. If not specified, the model's ``expected_information`` attribute is used. + context : Optional[Union[int, Mapping[str, Any]]], default=None + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. By default, + no context is added. Set ``context=0`` to make the calling scope + available. 
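For illustration (hypothetical fitted ``model`` plus training data ``X`` and ``y``): without data or a previously stored covariance matrix, ``coef_table`` falls back to a plain series of point estimates; with data it adds standard errors and confidence bounds at the requested level.

```python
# Point estimates only (no covariance matrix available).
model.coef_table()

# Full table with standard errors and confidence intervals.
model.coef_table(X, y, confidence_level=0.99, robust=True)
```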
Returns ------- @@ -1407,6 +1524,9 @@ def coef_table( names = self.feature_names_ beta = self.coef_ + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) if (X is None) and not hasattr(self, "covariance_matrix_"): return pd.Series(beta, index=names, name="coef") @@ -1420,6 +1540,7 @@ def coef_table( robust=robust, clusters=clusters, expected_information=expected_information, + context=captured_context, ) significance_level = 1 - confidence_level @@ -1442,21 +1563,24 @@ def coef_table( index=names, ) - @_positional_args_deprecated(("X", "y", "sample_weight", "offset"), 0) def wald_test( self, + X=None, + y=None, + sample_weight=None, + offset=None, + *, R: Optional[np.ndarray] = None, features: Optional[Union[str, list[str]]] = None, + terms: Optional[Union[str, list[str]]] = None, + formula: Optional[str] = None, r: Optional[Sequence] = None, - X=None, - y=None, mu=None, - offset=None, - sample_weight=None, dispersion=None, robust=None, clusters: np.ndarray = None, expected_information=None, + context: Optional[Union[int, Mapping[str, Any]]] = None, ) -> WaldTestResult: """Compute the Wald test statistic and p-value for a linear hypothesis. @@ -1465,32 +1589,43 @@ def wald_test( - ``R``: The restriction matrix representing the linear combination of coefficients to test. - ``features``: The name of a feature or a list of features to test. + - ``terms``: The name of a term or a list of terms to test. + - ``formula``: A formula string specifying the hypothesis to test. - The right hand side of the tested hypothesis is specified by ``r``. + The right hand side of the tested hypothesis is specified by ``r``. In the + case of a ``terms``-based test, the null hypothesis is that each coefficient + relating to a term equals the corresponding value in ``r``. Parameters ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features), optional + Training data. Can be omitted if a covariance matrix has already + been computed. + y : array-like, shape (n_samples,), optional + Target values. Can be omitted if a covariance matrix has already + been computed. + sample_weight : array-like, shape (n_samples,), optional, default=None + Individual weights for each sample. + offset : array-like, optional, default=None + Array with additive offsets. R : np.ndarray, optional, default=None The restriction matrix representing the linear combination of coefficients to test. features : Union[str, list[str]], optional, default=None The name of a feature or a list of features to test. + terms : Union[str, list[str]], optional, default=None + The name of a term or a list of terms to test. It can cover one or more + coefficients. In the case of a model based on a formula, a term is one + of the expressions separated by ``+`` signs. Otherwise, a term is one column + in the input data. As categorical variables need not be one-hot encoded in + glum, in their case, the hypothesis to be tested is that the coefficients + of all categories are equal to ``r``. r : np.ndarray, optional, default=None The vector representing the values of the linear combination. If None, the test is for whether the linear combinations of the coefficients are zero. - X : {array-like, sparse matrix}, shape (n_samples, n_features), optional - Training data. Can be omitted if a covariance matrix has already - been computed. - y : array-like, shape (n_samples,), optional - Target values. Can be omitted if a covariance matrix has already - been computed. 
mu : array-like, optional, default=None Array with predictions. Estimated if absent. - offset : array-like, optional, default=None - Array with additive offsets. - sample_weight : array-like, shape (n_samples,), optional, default=None - Individual weights for each sample. dispersion : float, optional, default=None The dispersion parameter. Estimated if absent. robust : boolean, optional, default=None @@ -1503,6 +1638,13 @@ def wald_test( Whether to use the expected or observed information matrix. Only relevant when computing robust standard errors. If not specified, the model's ``expected_information`` attribute is used. + context : Optional[Union[int, Mapping[str, Any]]], default=None + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. By default, + no context is added. Set ``context=0`` to make the calling scope + available. Returns ------- @@ -1510,116 +1652,59 @@ def wald_test( NamedTuple with test statistic, p-value, and degrees of freedom. """ - num_lhs_specs = sum([R is not None, features is not None]) + num_lhs_specs = sum( + [ + R is not None, + features is not None, + terms is not None, + formula is not None, + ] + ) if num_lhs_specs != 1: raise ValueError( - "Exactly one of R or features must be specified. " + "Exactly one of R, features, terms or formula must be specified. " f"Received {num_lhs_specs} specifications." ) - if R is not None: - return self._wald_test_matrix( - R=R, - r=r, - X=X, - y=y, - mu=mu, - offset=offset, - sample_weight=sample_weight, - dispersion=dispersion, - robust=robust, - clusters=clusters, - expected_information=expected_information, - ) + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + + kwargs = { + "X": X, + "y": y, + "sample_weight": sample_weight, + "offset": offset, + "mu": mu, + "dispersion": dispersion, + "robust": robust, + "clusters": clusters, + "expected_information": expected_information, + "context": captured_context, + } + if R is not None: + return self._wald_test_matrix(R=R, r=r, **kwargs) if features is not None: - return self._wald_test_feature_names( - features=features, - values=r, - X=X, - y=y, - mu=mu, - offset=offset, - sample_weight=sample_weight, - dispersion=dispersion, - robust=robust, - clusters=clusters, - expected_information=expected_information, - ) + return self._wald_test_feature_names(features=features, values=r, **kwargs) + if terms is not None: + return self._wald_test_term_names(terms=terms, values=r, **kwargs) + if formula is not None: + if r is not None: + raise ValueError("Cannot specify both formula and r") + return self._wald_test_formula(formula=formula, **kwargs) raise RuntimeError("This should never happen") def _wald_test_matrix( - self, - R: np.ndarray, - r: Optional[np.ndarray] = None, - X=None, - y=None, - mu=None, - offset=None, - sample_weight=None, - dispersion=None, - robust=None, - clusters: np.ndarray = None, - expected_information=None, + self, R: np.ndarray, r: Optional[np.ndarray] = None, **kwargs ) -> WaldTestResult: - """Compute the Wald test statistic and p-value for a linear hypothesis. - - The hypothesis tested is ``R @ coef_ = r``. Under the null hypothesis, - the test statistic follows a chi-squared distribution with ``R.shape[0]`` - degrees of freedom. 
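A sketch of the different ways to pose the same kind of hypothesis with the reworked ``wald_test`` (hypothetical fitted ``model`` with an intercept, data ``X`` and ``y``, and made-up term names; the constraint-string syntax is delegated to formulaic's ``LinearConstraintParser`` and is shown here only for illustration):

```python
import numpy as np

# Joint test that all coefficients belonging to one term are zero.
model.wald_test(X, y, terms="C(region)")

# The same machinery via an explicit restriction matrix (R @ beta = r); with
# fit_intercept=True, the first column of R corresponds to the intercept.
R = np.zeros((1, 1 + len(model.coef_)))
R[0, 1] = 1.0
model.wald_test(X, y, R=R, r=np.array([0.0]))

# Constraints written as a formula string (r must be left unset in this case).
model.wald_test(X, y, formula="age = 0")
```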
- - Parameters - ---------- - R : np.ndarray - The restriction matrix representing the linear combination of coefficients - to test. - r : np.ndarray, optional, default=None - The vector representing the values of the linear combination. - If None, the test is for whether the linear combinations of the coefficients - are zero. - X : {array-like, sparse matrix}, shape (n_samples, n_features), optional - Training data. Can be omitted if a covariance matrix has already - been computed. - y : array-like, shape (n_samples,), optional - Target values. Can be omitted if a covariance matrix has already - been computed. - mu : array-like, optional, default=None - Array with predictions. Estimated if absent. - offset : array-like, optional, default=None - Array with additive offsets. - sample_weight : array-like, shape (n_samples,), optional, default=None - Individual weights for each sample. - dispersion : float, optional, default=None - The dispersion parameter. Estimated if absent. - robust : boolean, optional, default=None - Whether to compute robust standard errors instead of normal ones. - If not specified, the model's ``robust`` attribute is used. - clusters : array-like, optional, default=None - Array with cluster membership. Clustered standard errors are - computed if clusters is not None. - expected_information : boolean, optional, default=None - Whether to use the expected or observed information matrix. - Only relevant when computing robust standard errors. - If not specified, the model's ``expected_information`` attribute is used. - - Returns - ------- - WaldTestResult - NamedTuple with test statistic, p-value, and degrees of freedom. """ - - covariance_matrix = self.covariance_matrix( - X=X, - y=y, - mu=mu, - offset=offset, - sample_weight=sample_weight, - dispersion=dispersion, - robust=robust, - clusters=clusters, - expected_information=expected_information, - ) + Perform a Wald test statistic for a hypothesis specified by constraints + given as ``R @ coef_ = r``. Under the null hypothesis, the test statistic + follows a chi-squared distribution with ``R.shape[0]`` degrees of freedom. + """ + covariance_matrix = self.covariance_matrix(**kwargs) if self.fit_intercept: beta = np.concatenate([[self.intercept_], self.coef_]) @@ -1657,57 +1742,11 @@ def _wald_test_feature_names( self, features: Union[str, list[str]], values: Optional[Sequence] = None, - X=None, - y=None, - mu=None, - offset=None, - sample_weight=None, - dispersion=None, - robust=None, - clusters: np.ndarray = None, - expected_information=None, + **kwargs, ) -> WaldTestResult: - """Compute the Wald test statistic and p-value for a linear hypothesis. - + """ Perform a Wald test for the hypothesis that the coefficients of the features in ``features`` are equal to the values in ``values``. - - Parameters - ---------- - features: Union[str, list[str]] - The name of a feature or a list of features to test. - values: Sequence, optional, default=None - The values to which coefficients are compared. If None, the test is - for whether the coefficients are zero. - X : {array-like, sparse matrix}, shape (n_samples, n_features), optional - Training data. Can be omitted if a covariance matrix has already - been computed. - y : array-like, shape (n_samples,), optional - Target values. Can be omitted if a covariance matrix has already - been computed. - mu : array-like, optional, default=None - Array with predictions. Estimated if absent. - offset : array-like, optional, default=None - Array with additive offsets. 
- sample_weight : array-like, shape (n_samples,), optional, default=None - Individual weights for each sample. - dispersion : float, optional, default=None - The dispersion parameter. Estimated if absent. - robust : boolean, optional, default=None - Whether to compute robust standard errors instead of normal ones. - If not specified, the model's ``robust`` attribute is used. - clusters : array-like, optional, default=None - Array with cluster membership. Clustered standard errors are - computed if clusters is not None. - expected_information : boolean, optional, default=None - Whether to use the expected or observed information matrix. - Only relevant when computing robust standard errors. - If not specified, the model's ``expected_information`` attribute is used. - - Returns - ------- - WaldTestResult - NamedTuple with test statistic, p-value, and degrees of freedom. """ if isinstance(features, str): @@ -1735,33 +1774,83 @@ def _wald_test_feature_names( raise ValueError(f"feature {feature} is not in the model") from None R[i, j] = 1 - return self._wald_test_matrix( - R=R, - r=r, - X=X, - y=y, - mu=mu, - offset=offset, - sample_weight=sample_weight, - dispersion=dispersion, - robust=robust, - clusters=clusters, - expected_information=expected_information, - ) + return self._wald_test_matrix(R=R, r=r, **kwargs) + + def _wald_test_formula(self, formula: str, **kwargs) -> WaldTestResult: + """ + Perform a Wald test for the hypothesis described in ``formula``. + """ + + if self.fit_intercept: + names = ["intercept"] + list(self.feature_names_) + else: + names = self.feature_names_ + + parser = LinearConstraintParser(names) + + R, r = parser.get_matrix(formula) + + return self._wald_test_matrix(R=R, r=r, **kwargs) + + def _wald_test_term_names( + self, terms: Union[str, list[str]], values: Optional[Sequence] = None, **kwargs + ) -> WaldTestResult: + """ + Perform a Wald test for the hypothesis that the coefficients of the + features in ``terms`` are equal to the values in ``terms``. + """ + + if isinstance(terms, str): + terms = [terms] + + if values is not None: + rhs = True + if len(terms) != len(values): + raise ValueError("terms and values must have the same length") + else: + rhs = False + values = [None] * len(terms) + + if self.fit_intercept: + names = np.array(["intercept"] + list(self.term_names_)) + beta = np.concatenate([[self.intercept_], self.coef_]) + else: + names = np.array(self.feature_names_) + beta = self.coef_ + + R_list = [] + r_list = [] + for term, value in zip(terms, values): + R_indices, *_ = np.where(names == term) + num_restrictions = len(R_indices) + if num_restrictions == 0: + raise ValueError(f"term {term} is not in the model") + R_current = np.zeros((num_restrictions, len(beta)), dtype=np.float64) + R_current[np.arange(num_restrictions), R_indices] = 1.0 + R_list.append(R_current) + + if rhs: + r_list.append(np.full(num_restrictions, fill_value=value)) + + R = np.vstack(R_list) + r = np.concatenate(r_list) if rhs else None + + return self._wald_test_matrix(R=R, r=r, **kwargs) - @_positional_args_deprecated(("X", "y", "sample_weight", "offset"), 2) def std_errors( self, X=None, y=None, - mu=None, - offset=None, sample_weight=None, + offset=None, + *, + mu=None, dispersion=None, robust=None, clusters: np.ndarray = None, expected_information=None, store_covariance_matrix=False, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """Calculate standard errors for generalized linear models. 
@@ -1776,12 +1865,12 @@ def std_errors( y : array-like, shape (n_samples,), optional Target values. Can be omitted if a covariance matrix has already been computed. - mu : array-like, optional, default=None - Array with predictions. Estimated if absent. - offset : array-like, optional, default=None - Array with additive offsets. sample_weight : array-like, shape (n_samples,), optional, default=None Individual weights for each sample. + offset : array-like, optional, default=None + Array with additive offsets. + mu : array-like, optional, default=None + Array with predictions. Estimated if absent. dispersion : float, optional, default=None The dispersion parameter. Estimated if absent. robust : boolean, optional, default=None @@ -1797,36 +1886,49 @@ def std_errors( store_covariance_matrix : boolean, optional, default=False Whether to store the covariance matrix in the model instance. If a covariance matrix has already been stored, it will be overwritten. + context : Optional[Union[int, Mapping[str, Any]]], default=None + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. By default, + no context is added. Set ``context=0`` to make the calling scope + available. """ + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + return np.sqrt( self.covariance_matrix( X=X, y=y, - mu=mu, - offset=offset, sample_weight=sample_weight, + offset=offset, + mu=mu, dispersion=dispersion, robust=robust, clusters=clusters, expected_information=expected_information, store_covariance_matrix=store_covariance_matrix, + context=captured_context, ).diagonal() ) - @_positional_args_deprecated(("X", "y", "sample_weight", "offset"), 2) def covariance_matrix( self, X=None, y=None, - mu=None, - offset=None, sample_weight=None, + offset=None, + *, + mu=None, dispersion=None, robust=None, clusters: Optional[np.ndarray] = None, expected_information=None, store_covariance_matrix=False, skip_checks=False, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """Calculate the covariance matrix for generalized linear models. @@ -1835,33 +1937,51 @@ def covariance_matrix( X : {array-like, sparse matrix}, shape (n_samples, n_features), optional Training data. Can be omitted if a covariance matrix has already been computed. + y : array-like, shape (n_samples,), optional Target values. Can be omitted if a covariance matrix has already been computed. + mu : array-like, optional, default=None Array with predictions. Estimated if absent. + offset : array-like, optional, default=None Array with additive offsets. + sample_weight : array-like, shape (n_samples,), optional, default=None Individual weights for each sample. + dispersion : float, optional, default=None The dispersion parameter. Estimated if absent. + robust : boolean, optional, default=None Whether to compute robust standard errors instead of normal ones. If not specified, the model's ``robust`` attribute is used. + clusters : array-like, optional, default=None Array with cluster membership. Clustered standard errors are computed if clusters is not None. + expected_information : boolean, optional, default=None Whether to use the expected or observed information matrix. Only relevant when computing robust standard errors. If not specified, the model's ``expected_information`` attribute is used. 
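Illustrative calls for the standard-error variants controlled by ``robust``, ``clusters`` and ``expected_information`` (hypothetical ``X``, ``y`` and cluster-membership array ``cluster_ids``):

```python
se_nonrobust = model.std_errors(X, y, robust=False)
se_robust = model.std_errors(X, y, robust=True)
se_clustered = model.std_errors(X, y, clusters=cluster_ids)

# Storing the covariance matrix once lets later calls omit X and y entirely.
model.covariance_matrix(X, y, store_covariance_matrix=True)
se_cached = model.std_errors()
```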
+ store_covariance_matrix : boolean, optional, default=False Whether to store the covariance matrix in the model instance. If a covariance matrix has already been stored, it will be overwritten. + skip_checks : boolean, optional, default=False Whether to skip input validation. For internal use only. + context : Optional[Union[int, Mapping[str, Any]]], default=None + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. By default, + no context is added. Set ``context=0`` to make the calling scope + available. + Notes ----- We support three types of covariance matrices: @@ -1905,6 +2025,10 @@ def covariance_matrix( """ self.covariance_matrix_: Union[np.ndarray, None] + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + if robust is None: _robust = getattr(self, "robust", True) else: @@ -1916,8 +2040,7 @@ def covariance_matrix( _expected_information = expected_information if ( - (hasattr(self, "alpha") and self.alpha is None) - or ( + ( hasattr(self, "alpha") and isinstance(self.alpha, (int, float)) and self.alpha > 0 @@ -1931,14 +2054,17 @@ def covariance_matrix( "matrix will be incorrect." ) + cannot_estimate_cov = (y is None) and not hasattr(self, "y_model_spec_") + cannot_estimate_cov |= X is None + if not skip_checks: - if (X is None or y is None) and self.covariance_matrix_ is None: + if cannot_estimate_cov and self.covariance_matrix_ is None: raise ValueError( "Either X and y must be provided or the covariance matrix " "must have been previously computed." ) - if (X is None or y is None) and store_covariance_matrix: + if cannot_estimate_cov and store_covariance_matrix: raise ValueError( "X and y must be provided if 'store_covariance_matrix' is True." ) @@ -1966,8 +2092,12 @@ def covariance_matrix( ) return self.covariance_matrix_ - if isinstance(X, pd.DataFrame) and hasattr(self, "feature_dtypes_"): - X = _align_df_categories(X, self.feature_dtypes_) + if hasattr(self, "y_model_spec_"): + y = self.y_model_spec_.get_model_matrix(X).toarray().ravel() + # This has to go first because X is modified in the next line + + if isinstance(X, pd.DataFrame): + X = self._convert_from_pandas(X, context=captured_context) X, y = check_X_y_tabmat_compliant( X, @@ -2048,10 +2178,7 @@ def covariance_matrix( / (sum_weights - self.n_features_in_ - int(self.fit_intercept)) ) else: - if isinstance(gradient, tm.SplitMatrix): - inner_part = gradient.sandwich(np.ones_like(y)) - else: - inner_part = gradient.T @ gradient + inner_part = gradient.sandwich(np.ones_like(y, dtype=X.dtype)) correction = sum_weights / ( sum_weights - self.n_features_in_ - int(self.fit_intercept) ) @@ -2086,6 +2213,8 @@ def score( y: ShapedArrayLike, sample_weight: Optional[ArrayLike] = None, offset: Optional[ArrayLike] = None, + *, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """Compute :math:`D^2`, the percentage of deviance explained. @@ -2112,6 +2241,14 @@ def score( offset : array-like, shape (n_samples,), optional (default=None) + context : Optional[Union[int, Mapping[str, Any]]], default=None + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. 
By default, + no context is added. Set ``context=0`` to make the calling scope + available. + Returns ------- float @@ -2120,8 +2257,12 @@ def score( # Note, default score defined in RegressorMixin is R^2 score. # TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + sample_weight = _check_weights(sample_weight, y.shape[0], y.dtype) - mu = self.predict(X, offset=offset) + mu = self.predict(X, offset=offset, context=captured_context) family = get_family(self.family) dev = family.deviance(y, mu, sample_weight=sample_weight) y_mean = np.average(y, weights=sample_weight) @@ -2228,15 +2369,6 @@ def _should_copy_X(self): # If self.copy_X is False, check for data of wrong dtype and error if it exists. return self.copy_X or False - def _is_contiguous(self, X): - if isinstance(X, np.ndarray): - return X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] - elif isinstance(X, pd.DataFrame): - return self._is_contiguous(X.values) - else: - # If not a numpy array or pandas data frame, we assume it is contiguous. - return True - def _set_up_and_check_fit_args( self, X: ArrayLike, @@ -2245,6 +2377,7 @@ def _set_up_and_check_fit_args( offset: Optional[VectorLike], solver: str, force_all_finite, + context: Optional[Mapping[str, Any]] = None, ) -> tuple[ tm.MatrixBase, np.ndarray, @@ -2261,76 +2394,94 @@ def _set_up_and_check_fit_args( P2 = self.P2 copy_X = self._should_copy_X() + drop_first = getattr(self, "drop_first", False) if isinstance(X, pd.DataFrame): - self.feature_dtypes_ = X.dtypes.to_dict() - - if any(X.dtypes == "category"): - self.feature_names_ = list( - chain.from_iterable( - ( - _name_categorical_variables( - dtype.categories, - column, - getattr(self, "drop_first", False), - ) - if isinstance(dtype, pd.CategoricalDtype) - else [column] - ) - for column, dtype in zip(X.columns, X.dtypes) - ) + if hasattr(self, "formula") and self.formula is not None: + lhs, rhs = _parse_formula( + self.formula, include_intercept=self.fit_intercept ) - def _expand_categorical_penalties(penalty, X, drop_first): - """ - If P1 or P2 has the same shape as X before expanding the - categoricals, we assume that the penalty at the location of - the categorical is the same for all levels. - """ - if isinstance(penalty, str): - return penalty - if not sparse.issparse(penalty): - penalty = np.asanyarray(penalty) - - if penalty.shape[0] == X.shape[1]: - if penalty.ndim == 2: - raise ValueError( - "When the penalty is two dimensional, it has " - "to have the same length as the number of " - "columns of X, after the categoricals " - "have been expanded." - ) - return np.array( - list( - chain.from_iterable( - ( - [ - elmt - for _ in dtype.categories[int(drop_first) :] - ] - if isinstance(dtype, pd.CategoricalDtype) - else [elmt] - ) - for elmt, dtype in zip(penalty, X.dtypes) - ) - ) + if lhs is not None: + if y is not None: + raise ValueError( + "`y` is not allowed when using a two-sided formula. " + "Either set `y=None` or use a one-sided formula." 
) - else: - return penalty - P1 = _expand_categorical_penalties( - self.P1, X, getattr(self, "drop_first", False) - ) - P2 = _expand_categorical_penalties( - self.P2, X, getattr(self, "drop_first", False) + y = tm.from_formula( + formula=lhs, + data=X, + include_intercept=False, + context=context, + ) + + self.y_model_spec_ = y.model_spec + y = y.toarray().ravel() + + X = tm.from_formula( + formula=rhs, + data=X, + include_intercept=False, + ensure_full_rank=self.drop_first, + categorical_format=self.categorical_format, + cat_missing_method=self.cat_missing_method, + interaction_separator=self.interaction_separator, + add_column_for_intercept=False, + context=context, ) - X = tm.from_pandas(X, drop_first=getattr(self, "drop_first", False)) + intercept = "1" in X.model_spec.terms + if intercept != self.fit_intercept: + raise ValueError( + f"The formula sets the intercept to {intercept}, " + f"contradicting fit_intercept={self.fit_intercept}. " + "You should use fit_intercept to specify the intercept." + ) + + self.X_model_spec_ = X.model_spec + self.feature_names_ = list(X.model_spec.column_names) + + self.term_names_ = [ + term + for term, _, cols in X.model_spec.structure + for _ in range(len(cols)) + ] + else: - self.feature_names_ = X.columns - X = tm.from_pandas(X) + # Maybe TODO: expand categorical penalties with formulas + + self.feature_dtypes_ = X.dtypes.to_dict() - if not self._is_contiguous(X): + self.has_missing_category_ = { + col: (getattr(self, "cat_missing_method", "fail") == "convert") + and X[col].isna().any() + for col, dtype in self.feature_dtypes_.items() + if isinstance(dtype, pd.CategoricalDtype) + } + + if any(X.dtypes == "category"): + P1 = _expand_categorical_penalties( + self.P1, X, drop_first, self.has_missing_category_ + ) + P2 = _expand_categorical_penalties( + self.P2, X, drop_first, self.has_missing_category_ + ) + + X = tm.from_pandas( + X, + drop_first=drop_first, + categorical_format=getattr( # convention prior to v3 + self, "categorical_format", "{name}__{category}" + ), + cat_missing_method=getattr(self, "cat_missing_method", "fail"), + cat_missing_name=getattr(self, "cat_missing_name", "(MISSING)"), + ) + + if y is None: + raise ValueError("y cannot be None when not using a two-sided formula.") + + if not _is_contiguous(X): if self.copy_X is not None and not self.copy_X: raise ValueError( "The X matrix is noncontiguous and copy_X = False." @@ -2399,10 +2550,10 @@ def _expand_categorical_penalties(penalty, X, drop_first): ####################################################################### # 2b. convert to wrapper matrix types ####################################################################### - if sparse.issparse(X) and not isinstance(X, tm.SparseMatrix): - X = tm.SparseMatrix(X) - elif isinstance(X, np.ndarray): - X = tm.DenseMatrix(X) + X = tm.as_tabmat(X) + + self.feature_names_ = X.get_names(type="column", missing_prefix="_col_") + self.term_names_ = X.get_names(type="term", missing_prefix="_col_") return X, y, sample_weight, offset, weights_sum, P1, P2 @@ -2447,11 +2598,11 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): alpha : {float, array-like}, optional (default=None) Constant that multiplies the penalty terms and thus determines the regularization strength. If ``alpha_search`` is ``False`` (the default), - then ``alpha`` must be a scalar or None (equivalent to ``alpha=1.0``). + then ``alpha`` must be a scalar or None (equivalent to ``alpha=0``). 
If ``alpha_search`` is ``True``, then ``alpha`` must be an iterable or ``None``. See ``alpha_search`` to find how the regularization path is set if ``alpha`` is ``None``. See the notes for the exact mathematical - meaning of this parameter. ``alpha = 0`` is equivalent to unpenalized + meaning of this parameter. ``alpha=0`` is equivalent to unpenalized GLMs. In this case, the design matrix ``X`` must have full column rank (no collinearities). @@ -2647,10 +2798,11 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): set ``verbose`` to any positive number for verbosity. scale_predictors: bool, optional (default=False) - If ``True``, estimate a scaled model where all predictors have a - standard deviation of 1. This can result in better estimates if - predictors are on very different scales (for example, centimeters and - kilometers). + If ``True``, scale all predictors to have standard deviation one. + Should be set to ``True`` if ``alpha > 0`` and if you want coefficients + to be penalized equally. + + Reported coefficient estimates are always at the original scale. Advanced developer note: Internally, predictors are always rescaled for computational reasons, but this only affects results if @@ -2679,8 +2831,11 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): drop_first : bool, optional (default = False) If ``True``, drop the first column when encoding categorical variables. - Set this to True when alpha=0 and solver='auto' to prevent an error due to a singular - feature matrix. + Set this to True when ``alpha=0`` and ``solver='auto'`` to prevent an error + due to a singular feature matrix. In the case of using a formula with + interactions, setting this argument to ``True`` ensures structural + full-rankness (it is equivalent to ``ensure_full_rank`` in formulaic and + tabmat). robust : bool, optional (default = False) If true, then robust standard errors are computed by default. @@ -2689,6 +2844,32 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): If true, then the expected information matrix is computed by default. Only relevant when computing robust standard errors. + formula : formulaic.FormulaSpec + A formula accepted by formulaic. It can either be a one-sided formula, in + which case ``y`` must be specified in ``fit``, or a two-sided formula, in + which case ``y`` must be ``None``. + + interaction_separator: str, default=":" + The separator between the names of interacted variables. + + categorical_format : str, optional, default='{name}[{category}]' + Format string for categorical features. The format string should + contain the placeholder ``{name}`` for the feature name and + ``{category}`` for the category name. Only used if ``X`` is a pandas + DataFrame. + + cat_missing_method: str {'fail'|'zero'|'convert'}, default='fail' + How to handle missing values in categorical columns. Only used if ``X`` + is a pandas data frame. + - if 'fail', raise an error if there are missing values + - if 'zero', missing values will represent all-zero indicator columns. + - if 'convert', missing values will be converted to the ``cat_missing_name`` + category. + + cat_missing_name: str, default='(MISSING)' + Name of the category to which missing values will be converted if + ``cat_missing_method='convert'``. Only used if ``X`` is a pandas data frame. 
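Putting the new constructor arguments together, a hypothetical specification could look as follows (``num_claims``, ``age``, ``region`` and ``df`` are made up for the example):

```python
from glum import GeneralizedLinearRegressor

model = GeneralizedLinearRegressor(
    family="poisson",
    alpha=0.05,
    formula="num_claims ~ age + C(region)",   # two-sided formula, so y=None in fit
    interaction_separator=":",
    categorical_format="{name}[{category}]",
    cat_missing_method="convert",             # missing levels become their own category
    cat_missing_name="(MISSING)",
)
_ = model.fit(df)  # df: pandas DataFrame containing all referenced columns
```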
+ Attributes ---------- coef_ : numpy.array, shape (n_features,) @@ -2714,10 +2895,6 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. - For ``alpha > 0``, the feature matrix ``X`` should be standardized in order - to penalize features equally strong. Call - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - If the target ``y`` is a ratio, appropriate sample weights ``s`` should be provided. As an example, consider Poisson distributed counts ``z`` (integers) and weights ``s = exposure`` (time, money, persons years, ...). @@ -2737,9 +2914,9 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ - @_positional_args_deprecated() def __init__( self, + *, alpha=None, l1_ratio=0, P1: Optional[Union[str, np.ndarray]] = "identity", @@ -2773,11 +2950,12 @@ def __init__( drop_first: bool = False, robust: bool = True, expected_information: bool = False, + formula: Optional[FormulaSpec] = None, + interaction_separator: str = ":", + categorical_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", ): - if alpha is None: - warnings.warn( - "The default value of `alpha` will become `0` in 3.0.0.", FutureWarning - ) self.alphas = alphas self.alpha = alpha super().__init__( @@ -2812,6 +2990,11 @@ def __init__( drop_first=drop_first, robust=robust, expected_information=expected_information, + formula=formula, + interaction_separator=interaction_separator, + categorical_format=categorical_format, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, ) def _validate_hyperparameters(self) -> None: @@ -2852,17 +3035,18 @@ def _validate_hyperparameters(self) -> None: ) super()._validate_hyperparameters() - @_positional_args_deprecated(("X", "y", "sample_weight", "offset")) def fit( self, X: ArrayLike, - y: ArrayLike, + y: Optional[ArrayLike] = None, sample_weight: Optional[ArrayLike] = None, offset: Optional[ArrayLike] = None, + *, store_covariance_matrix: bool = False, clusters: Optional[np.ndarray] = None, # TODO: take out weights_sum (or use it properly) weights_sum: Optional[float] = None, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """Fit a Generalized Linear Model. @@ -2905,6 +3089,14 @@ def fit( Array with cluster membership. Clustered standard errors are computed if clusters is not None. + context : Optional[Union[int, Mapping[str, Any]]], default=None + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. By default, + no context is added. Set ``context=0`` to make the calling scope + available. + weights_sum: float, optional (default=None) Returns @@ -2914,6 +3106,10 @@ def fit( self._validate_hyperparameters() + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + # NOTE: This function checks if all the entries in X and y are # finite. That can be expensive. But probably worthwhile. 
( @@ -2931,6 +3127,7 @@ def fit( offset, solver=self.solver, force_all_finite=self.force_all_finite, + context=captured_context, ) assert isinstance(X, tm.MatrixBase) assert isinstance(y, np.ndarray) @@ -3058,7 +3255,7 @@ def fit( self.coef_ = self.coef_path_[-1] else: if self.alpha is None: - _alpha = 1.0 + _alpha = 0.0 else: _alpha = self.alpha if _alpha > 0 and self.l1_ratio > 0 and self._solver != "irls-cd": @@ -3116,6 +3313,7 @@ def _compute_information_criteria( X: ShapedArrayLike, y: ShapedArrayLike, sample_weight: Optional[ArrayLike] = None, + context: Optional[Mapping[str, Any]] = None, ): """ Computes and stores the model's degrees of freedom, the 'aic', 'aicc' @@ -3137,25 +3335,12 @@ def _compute_information_criteria( [3] Park, M.Y., 2006. Generalized linear models with regularization; Stanford Universty. """ - - # we require that the log_likelihood be defined - model_err_str = ( - "The computation of the information criteria has only " - + "been defined for models with a Binomial likelihood, Negative " - + "Binomial likelihood or a Tweedie likelihood with power <= 2." - ) - if not isinstance( - self.family_instance, - (BinomialDistribution, TweedieDistribution, NegativeBinomialDistribution), - ): - raise NotImplementedError(model_err_str) - - # the log_likelihood has not been implemented for the InverseGaussianDistribution - if ( - isinstance(self.family_instance, TweedieDistribution) - and self.family_instance.power > 2 - ): - raise NotImplementedError(model_err_str) + if not hasattr(self.family_instance, "log_likelihood"): + raise NotImplementedError( + "The family instance does not define a `log_likelihood` method, so " + "information criteria cannot be computed. Compatible families include " + "the binomial, negative binomial and Tweedie (power<=2 or power=3)." + ) ddof = np.sum(np.abs(self.coef_) > np.finfo(self.coef_.dtype).eps) k_params = ddof + self.fit_intercept @@ -3163,16 +3348,16 @@ def _compute_information_criteria( if nobs != self._num_obs: raise ValueError( - "The same dataset that was used for training should " - + "also be used for the computation of information " - + "criteria" + "The same dataset that was used for training should also be used for " + "the computation of information criteria." ) - mu = self.predict(X) + mu = self.predict(X, context=context) ll = self.family_instance.log_likelihood(y, mu, sample_weight=sample_weight) aic = -2 * ll + 2 * k_params bic = -2 * ll + np.log(nobs) * k_params + if nobs > k_params + 1: aicc = aic + 2 * k_params * (k_params + 1) / (nobs - k_params - 1) else: @@ -3183,7 +3368,12 @@ def _compute_information_criteria( return True def aic( - self, X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None + self, + X: ArrayLike, + y: ArrayLike, + sample_weight: Optional[ArrayLike] = None, + *, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """ Akaike's information criteria. Computed as: @@ -3202,12 +3392,30 @@ def aic( Same data as used in 'fit' sample_weight : array-like, shape (n_samples,), optional (default=None) - Same data as used in 'fit' + Same data as used in 'fit' + + context : Optional[Union[int, Mapping[str, Any]]], default=None + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. By default, + no context is added. Set ``context=0`` to make the calling scope + available. 
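Illustrative usage of the information criteria (hypothetical ``X_train``/``y_train``, which must be exactly the data used in ``fit``; the family has to implement ``log_likelihood``, e.g. binomial, negative binomial, or Tweedie with ``power <= 2`` or ``power = 3``):

```python
aic = model.aic(X_train, y_train)
aicc = model.aicc(X_train, y_train)   # raises unless n_obs > n_params + 1
bic = model.bic(X_train, y_train)
```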
""" - return self._get_info_criteria("aic", X, y, sample_weight) + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + return self._get_info_criteria( + "aic", X, y, sample_weight, context=captured_context + ) def aicc( - self, X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None + self, + X: ArrayLike, + y: ArrayLike, + sample_weight: Optional[ArrayLike] = None, + *, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """ Second-order Akaike's information criteria (or small sample AIC). @@ -3229,8 +3437,21 @@ def aicc( sample_weight : array-like, shape (n_samples,), optional (default=None) Same data as used in 'fit' + + context : Optional[Union[int, Mapping[str, Any]]], default=None + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. By default, + no context is added. Set ``context=0`` to make the calling scope + available. """ - aicc = self._get_info_criteria("aicc", X, y, sample_weight) + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + aicc = self._get_info_criteria( + "aicc", X, y, sample_weight, context=captured_context + ) if not aicc: raise ValueError( "Model degrees of freedom should be more than training datapoints." @@ -3238,7 +3459,12 @@ def aicc( return aicc def bic( - self, X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None + self, + X: ArrayLike, + y: ArrayLike, + sample_weight: Optional[ArrayLike] = None, + *, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): """ Bayesian information criterion. Computed as: @@ -3259,8 +3485,21 @@ def bic( sample_weight : array-like, shape (n_samples,), optional (default=None) Same data as used in 'fit' + + context : Optional[Union[int, Mapping[str, Any]]], default=None + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. By default, + no context is added. Set ``context=0`` to make the calling scope + available. 
""" - return self._get_info_criteria("bic", X, y, sample_weight) + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + return self._get_info_criteria( + "bic", X, y, sample_weight, context=captured_context + ) def _get_info_criteria( self, @@ -3268,11 +3507,12 @@ def _get_info_criteria( X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None, + context: Optional[Mapping[str, Any]] = None, ): check_is_fitted(self, "coef_") if not hasattr(self, "_info_criteria"): - self._compute_information_criteria(X, y, sample_weight) + self._compute_information_criteria(X, y, sample_weight, context=context) if ( self.alpha is None or (self.alpha is not None and self.alpha > 0) diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index 4fbb3d469..04328d371 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -1,7 +1,10 @@ import copy -from typing import Optional, Union +from collections.abc import Mapping +from typing import Any, Optional, Union import numpy as np +from formulaic import FormulaSpec +from formulaic.utils.context import capture_context from joblib import Parallel, delayed from sklearn.model_selection._split import check_cv @@ -18,7 +21,7 @@ setup_p2, ) from ._link import Link, LogLink -from ._util import _positional_args_deprecated, _safe_lin_pred +from ._util import _safe_lin_pred class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase): @@ -244,6 +247,23 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase): drop_first : bool, optional (default = False) If ``True``, drop the first column when encoding categorical variables. + Set this to True when alpha=0 and solver='auto' to prevent an error due to a + singular feature matrix. In the case of using a formula with interactions, + setting this argument to ``True`` ensures structural full-rankness (it is + equivalent to ``ensure_full_rank`` in formulaic and tabmat). + + formula : FormulaSpec + A formula accepted by formulaic. It can either be a one-sided formula, in + which case ``y`` must be specified in ``fit``, or a two-sided formula, in + which case ``y`` must be ``None``. + + interaction_separator: str, default=":" + The separator between the names of interacted variables. + + categorical_format: str, default="{name}[T.{category}]" + The format string used to generate the names of categorical variables. + Has to include the placeholders ``{name}`` and ``{category}``. + Only used if ``formula`` is not ``None``. Attributes ---------- @@ -281,11 +301,29 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase): expected_information : bool, optional (default = False) If true, then the expected information matrix is computed by default. Only relevant when computing robust standard errors. + + categorical_format : str, optional (default = "{name}[{category}]") + Format string for categorical features. The format string should + contain the placeholder ``{name}`` for the feature name and + ``{category}`` for the category name. Only used if ``X`` is a pandas + DataFrame. + + cat_missing_method: str {'fail'|'zero'|'convert'}, default='fail' + How to handle missing values in categorical columns. Only used if ``X`` + is a pandas data frame. + - if 'fail', raise an error if there are missing values + - if 'zero', missing values will represent all-zero indicator columns. + - if 'convert', missing values will be converted to the ``cat_missing_name`` + category. 
+ + cat_missing_name: str, default='(MISSING)' + Name of the category to which missing values will be converted if + ``cat_missing_method='convert'``. Only used if ``X`` is a pandas data frame. """ - @_positional_args_deprecated() def __init__( self, + *, l1_ratio=0, P1="identity", P2="identity", @@ -319,6 +357,11 @@ def __init__( drop_first: bool = False, robust: bool = True, expected_information: bool = False, + formula: Optional[FormulaSpec] = None, + interaction_separator: str = ":", + categorical_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", ): self.alphas = alphas self.cv = cv @@ -354,6 +397,11 @@ def __init__( drop_first=drop_first, robust=robust, expected_information=expected_information, + formula=formula, + interaction_separator=interaction_separator, + categorical_format=categorical_format, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, ) def _validate_hyperparameters(self) -> None: @@ -372,15 +420,16 @@ def _validate_hyperparameters(self) -> None: ) super()._validate_hyperparameters() - @_positional_args_deprecated(("X", "y", "sample_weight", "offset")) def fit( self, X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None, offset: Optional[ArrayLike] = None, + *, store_covariance_matrix: bool = False, clusters: Optional[np.ndarray] = None, + context: Optional[Union[int, Mapping[str, Any]]] = None, ): r""" Choose the best model along a 'regularization path' by cross-validation. @@ -423,9 +472,21 @@ def fit( Array with cluster membership. Clustered standard errors are computed if clusters is not None. + context : Optional[Union[int, Mapping[str, Any]]], default=None + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. By default, + no context is added. Set ``context=0`` to make the calling scope + available. + """ self._validate_hyperparameters() + captured_context = capture_context( + context + 1 if isinstance(context, int) else context + ) + ( X, y, @@ -441,6 +502,7 @@ def fit( offset, solver=self.solver, force_all_finite=self.force_all_finite, + context=captured_context, ) ######### @@ -480,7 +542,7 @@ def fit( else: _stype = ["csc", "csr"] - def fit_path( + def _fit_path( self, train_idx, test_idx, @@ -613,7 +675,7 @@ def _get_deviance(coef): return intercept_path_, coef_path_, deviance_path_ jobs = ( - delayed(fit_path)( + delayed(_fit_path)( self, train_idx=train_idx, test_idx=test_idx, diff --git a/src/glum/_link.py b/src/glum/_link.py index 8a19b731f..179afac96 100644 --- a/src/glum/_link.py +++ b/src/glum/_link.py @@ -1,21 +1,20 @@ import warnings from abc import ABCMeta, abstractmethod +from typing import Callable import numpy as np from scipy import special -from ._util import _asanyarray - class Link(metaclass=ABCMeta): - """Abstract base class for Link functions.""" + """Abstract base class for link functions.""" @abstractmethod def link(self, mu): - """Compute the link function ``g(mu)``. + """Compute the link function. - The link function links the mean, ``mu ≡ E(Y)``, to the linear predictor - ``X * w``, i.e. ``g(mu)`` is equal to the linear predictor. + The link function ``g`` links the mean, ``mu ≡ E(Y)``, to the linear + predictor, ``X * w``, so that ``g(mu)`` is equal to the linear predictor. 
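As a side note, a user-defined link only needs to provide the five abstract methods of ``Link``. A minimal sketch of a square-root link (mathematically the same as ``TweedieLink(0.5)``), assuming ``Link`` can be imported from the top-level ``glum`` namespace:

```python
import numpy as np
from glum import Link  # assumption: Link is re-exported at package level

class SqrtLink(Link):
    """Illustrative square-root link: g(mu) = sqrt(mu), h(eta) = eta ** 2."""

    def link(self, mu):
        return np.sqrt(mu)

    def derivative(self, mu):
        return 0.5 / np.sqrt(mu)

    def inverse(self, lin_pred):
        return lin_pred ** 2

    def inverse_derivative(self, lin_pred):
        return 2.0 * lin_pred

    def inverse_derivative2(self, lin_pred):
        return 2.0 * np.ones_like(lin_pred)
```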
Parameters ---------- @@ -26,7 +25,7 @@ def link(self, mu): @abstractmethod def derivative(self, mu): - """Compute the derivative of the link ``g'(mu)``. + """Compute the derivative of the link function. Parameters ---------- @@ -37,11 +36,11 @@ def derivative(self, mu): @abstractmethod def inverse(self, lin_pred): - """Compute the inverse link function ``h(lin_pred)``. + """Compute the inverse link function. - Gives the inverse relationship between linear predictor, - ``lin_pred ≡ X * w``, and the mean, ``mu ≡ E(Y)``, i.e. - ``h(lin_pred) = mu``. + The inverse link function ``h`` gives the inverse relationship between + the linear predictor, ``X * w``, and the mean, ``mu ≡ E(Y)``, so that + ``h(X * w) = mu``. Parameters ---------- @@ -52,7 +51,7 @@ def inverse(self, lin_pred): @abstractmethod def inverse_derivative(self, lin_pred): - """Compute the derivative of the inverse link function ``h'(lin_pred)``. + """Compute the derivative of the inverse link function. Parameters ---------- @@ -63,7 +62,7 @@ def inverse_derivative(self, lin_pred): @abstractmethod def inverse_derivative2(self, lin_pred): - """Compute second derivative of the inverse link function ``h''(lin_pred)``. + """Compute second derivative of the inverse link function. Parameters ---------- @@ -72,137 +71,154 @@ def inverse_derivative2(self, lin_pred): """ pass + def to_tweedie(self, safe=True): + """Return the Tweedie representation of a link function if it exists.""" + if hasattr(self, "__tweedie_repr__"): + return self.__tweedie_repr__() + if safe: + raise ValueError("This link function has no Tweedie representation.") + return None -class IdentityLink(Link): - """The identity link function ``g(x) = x``.""" - def __eq__(self, other): # noqa D - return isinstance(other, self.__class__) +def catch_p(fun) -> Callable: + """Ensure that linear predictors are compatible with the Tweedie power parameter.""" - def link(self, mu): - """Return mu (identity link). + def _to_return(*args, **kwargs): + with np.errstate(invalid="raise"): + try: + result = fun(*args, **kwargs) + except FloatingPointError as e: + raise ValueError( + "Your linear predictors are not supported for power " + f"{args[0].power}. For negative linear predictors, consider using " + "a log link instead." + ) from e + return result - See superclass documentation. + return _to_return - Parameters - ---------- - mu: array-like - """ - return _asanyarray(mu) - def derivative(self, mu): - """Get the derivative of the identity link, a vector of ones. +class TweedieLink(Link): + """The Tweedie link function ``x^(1-p)`` if ``p≠1`` and ``log(x)`` if ``p=1``. - See superclass documentation. + See the documentation of the superclass, :class:`~glum.Link`, for details. + """ - Parameters - ---------- - mu: array-like - """ - return 1.0 if np.isscalar(mu) else np.ones_like(mu) + def __init__(self, power): + self.power = power - def inverse(self, lin_pred): - """Compute the inverse link function ``h(lin_pred)``. 
+ def __eq__(self, other): # noqa D + return isinstance(other, self.__class__) and (self.power == other.power) + + def __tweedie__repr__(self): # noqa D + return self.__class__(self.power) + + @property + def power(self) -> float: # noqa D + return self._power + + @power.setter + def power(self, power): + if not isinstance(power, (int, float, np.number)): + raise TypeError(f"The power parameter must be numeric; got {power}.") + if (power > 0) and (power < 1): + raise ValueError("For `0 1 - eps50) or np.any(inv_logit < eps50): - warnings.warn( - "Computing sigmoid function gave results too close to 0 or 1. Clipping." - ) + warnings.warn("Sigmoid function too close to 0 or 1. Clipping.") return np.clip(inv_logit, eps50, 1 - eps50) - return inv_logit - def inverse_derivative(self, lin_pred): - """Compute the derivative of the inverse link function ``h'(lin_pred)``. + return inv_logit - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - ep = special.expit(_asanyarray(lin_pred)) + def inverse_derivative(self, lin_pred): # noqa D + ep = special.expit(lin_pred) return ep * (1.0 - ep) - def inverse_derivative2(self, lin_pred): - """Compute second derivative of the inverse link function ``h''(lin_pred)``. - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - ep = special.expit(_asanyarray(lin_pred)) + def inverse_derivative2(self, lin_pred): # noqa D + ep = special.expit(lin_pred) return ep * (1.0 - ep) * (1.0 - 2 * ep) -def catch_p(fun): - """ - Decorate ``fun``, ensuring that the given linear predictor is compatible with the \ - relevant Tweedie power parameter. - - Parameters - ---------- - fun: TweedieLink method - - Returns - ------- - Callable - """ - - def _to_return(*args, **kwargs): - with np.errstate(invalid="raise"): - try: - result = fun(*args, **kwargs) - except FloatingPointError as e: - raise ValueError( - f"Your linear predictors are not supported for p={args[0].p}. For " - + "negative linear predictors, consider using a log link instead." - ) from e - return result - - return _to_return - - -class TweedieLink(Link): - """The Tweedie link function ``x^(1-p)`` if ``p≠1`` and ``log(x)`` if ``p=1``.""" - - def __init__(self, p): - self.p = p - - def __eq__(self, other): # noqa D - return isinstance(other, self.__class__) and (self.p == other.p) - - def link(self, mu): - """Get the Tweedie canonical link. - - See superclass documentation. - - Parameters - ---------- - mu: array-like - """ - if self.p == 0: - return _asanyarray(mu) - if self.p == 1: - return np.log(_asanyarray(mu)) - return _asanyarray(mu) ** (1 - self.p) - - def derivative(self, mu): - """Get the derivative of the Tweedie link. - - See superclass documentation. - - Parameters - ---------- - mu: array-like - """ - if self.p == 0: - return 1.0 if np.isscalar(mu) else np.ones_like(mu) - if self.p == 1: - return 1 / _asanyarray(mu) - return (1 - self.p) * _asanyarray(mu) ** (-self.p) - - @catch_p - def inverse(self, lin_pred): - """Get the inverse of the Tweedie link. - - See superclass documentation. - - Parameters - ---------- - mu: array-like - """ - if self.p == 0: - return _asanyarray(lin_pred) - if self.p == 1: - return np.exp(_asanyarray(lin_pred)) - return _asanyarray(lin_pred) ** (1 / (1 - self.p)) - - @catch_p - def inverse_derivative(self, lin_pred): - """Compute the derivative of the inverse Tweedie link function ``h'(lin_pred)``. 
- - Parameters - ---------- - lin_pred : array-like, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - if self.p == 0: - return 1.0 if np.isscalar(lin_pred) else np.ones_like(lin_pred) - if self.p == 1: - return np.exp(_asanyarray(lin_pred)) - return (1 / (1 - self.p)) * _asanyarray(lin_pred) ** (self.p / (1 - self.p)) - - @catch_p - def inverse_derivative2(self, lin_pred): - """Compute second derivative of the inverse Tweedie link function \ - ``h''(lin_pred)``. - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - if self.p == 0: - return 0.0 if np.isscalar(lin_pred) else np.zeros_like(lin_pred) - if self.p == 1: - return np.exp(_asanyarray(lin_pred)) - - result = _asanyarray(lin_pred) ** ((2 * self.p - 1) / (1 - self.p)) - result *= self.p / (1 - self.p) ** 2 - - return result - - class CloglogLink(Link): """The complementary log-log link function ``log(-log(-p))``.""" def __eq__(self, other): # noqa D return isinstance(other, self.__class__) - def link(self, mu): - """Get the logit function of ``mu``. - - See superclass documentation. - - Parameters - ---------- - mu: array-like - - Returns - ------- - numpy.ndarray - """ - mu = _asanyarray(mu) + def link(self, mu): # noqa D return np.log(-np.log1p(-mu)) - def derivative(self, mu): - """Get the derivative of the cloglog link. - - See superclass documentation. - - Parameters - ---------- - mu: array-like - - Returns - ------- - array-like - """ - mu = _asanyarray(mu) + def derivative(self, mu): # noqa D return 1.0 / ((mu - 1) * (np.log1p(-mu))) - def inverse(self, lin_pred): - """Get the inverse of the cloglog link. - - See superclass documentation. - - Note: since passing a very large value might result in an output of one, - this function bounds the output to be between ``[50*eps, 1 - 50*eps]``, - where ``eps`` is floating point epsilon. + def inverse(self, lin_pred): # noqa D - Parameters - ---------- - lin_pred: array-like - - Returns - ------- - array-like - """ - lin_pred = _asanyarray(lin_pred) + lin_pred = lin_pred inv_cloglog = -np.expm1(-np.exp(lin_pred)) eps50 = 50 * np.finfo(inv_cloglog.dtype).eps + if np.any(inv_cloglog > 1 - eps50) or np.any(inv_cloglog < eps50): - warnings.warn( - "Computing sigmoid function gave results too close to 0 or 1. Clipping." - ) + warnings.warn("Sigmoid function too close to 0 or 1. Clipping.") return np.clip(inv_cloglog, eps50, 1 - eps50) - return inv_cloglog - def inverse_derivative(self, lin_pred): - """Compute the derivative of the inverse link function ``h'(lin_pred)``. + return inv_cloglog - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - lin_pred = _asanyarray(lin_pred) + def inverse_derivative(self, lin_pred): # noqa D return np.exp(lin_pred - np.exp(lin_pred)) - def inverse_derivative2(self, lin_pred): - """Compute second derivative of the inverse link function ``h''(lin_pred)``. - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. 
- """ - lin_pred = _asanyarray(lin_pred) + def inverse_derivative2(self, lin_pred): # noqa D # TODO: check if numerical stability can be improved return np.exp(np.exp(lin_pred) - lin_pred) * np.expm1(lin_pred) diff --git a/src/glum/_util.py b/src/glum/_util.py index 294815edb..83d30fe06 100644 --- a/src/glum/_util.py +++ b/src/glum/_util.py @@ -1,5 +1,6 @@ import logging import warnings +from collections.abc import Sequence from functools import wraps from typing import Union @@ -16,7 +17,9 @@ def _asanyarray(x, **kwargs): return x if pd.api.types.is_scalar(x) else np.asanyarray(x, **kwargs) -def _align_df_categories(df, dtypes) -> pd.DataFrame: +def _align_df_categories( + df, dtypes, has_missing_category, cat_missing_method +) -> pd.DataFrame: """Align data types for prediction. This function checks that categorical columns have same categories in the @@ -27,6 +30,8 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame: ---------- df : pandas.DataFrame dtypes : Dict[str, Union[str, type, pandas.core.dtypes.base.ExtensionDtype]] + has_missing_category : Dict[str, bool] + missing_method : str """ if not isinstance(df, pd.DataFrame): raise TypeError(f"Expected `pandas.DataFrame'; got {type(df)}.") @@ -48,6 +53,58 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame: changed_dtypes[column] = df[column].cat.set_categories( dtypes[column].categories ) + else: + continue + + if cat_missing_method == "convert" and not has_missing_category[column]: + unseen_categories = set(df[column].unique()) + unseen_categories = unseen_categories - set(dtypes[column].categories) + else: + unseen_categories = set(df[column].dropna().unique()) + unseen_categories = unseen_categories - set(dtypes[column].categories) + + if unseen_categories: + raise ValueError( + f"Column {column} contains unseen categories: {unseen_categories}." + ) + + if changed_dtypes: + df = df.assign(**changed_dtypes) + + return df + + +def _add_missing_categories( + df, + dtypes, + feature_names: Sequence[str], + categorical_format: str, + cat_missing_name: str, +) -> pd.DataFrame: + if not isinstance(df, pd.DataFrame): + raise TypeError(f"Expected `pandas.DataFrame'; got {type(df)}.") + + changed_dtypes = {} + + categorical_dtypes = [ + column + for column, dtype in dtypes.items() + if isinstance(dtype, pd.CategoricalDtype) and (column in df) + ] + + for column in categorical_dtypes: + if ( + categorical_format.format(name=column, category=cat_missing_name) + in feature_names + ): + if cat_missing_name in df[column].cat.categories: + raise ValueError( + f"Missing category {cat_missing_name} already exists in {column}." + ) + _logger.info(f"Adding missing category {cat_missing_name} to {column}.") + changed_dtypes[column] = df[column].cat.add_categories(cat_missing_name) + if df[column].isnull().any(): + changed_dtypes[column] = changed_dtypes[column].fillna(cat_missing_name) if changed_dtypes: df = df.assign(**changed_dtypes) @@ -55,6 +112,55 @@ def _align_df_categories(df, dtypes) -> pd.DataFrame: return df +def _expand_categorical_penalties( + penalty, X, drop_first, has_missing_category +) -> np.ndarray: + """Determine penalty matrices ``P1`` or ``P2`` after expanding categorical columns. + + If ``P1`` or ``P2`` has the same shape as ``X`` before expanding categorical + columns, we assume that the penalty at the location of categorical columns + is the same for all levels. 
+ """ + if isinstance(penalty, str): + return penalty + if not sparse.issparse(penalty): + penalty = np.asanyarray(penalty) + + if penalty.shape[0] == X.shape[1]: + + if penalty.ndim == 2: + raise ValueError( + "When the penalty is two-dimensional, it must have the " + "same length as the number of columns in the design " + "matrix `X` after expanding categorical columns." + ) + + expanded_penalty = [] # type: ignore + + for element, (column, dt) in zip(penalty, X.dtypes.items()): + if isinstance(dt, pd.CategoricalDtype): + length = len(dt.categories) + has_missing_category[column] - drop_first + expanded_penalty.extend(element for _ in range(length)) + else: + expanded_penalty.append(element) + + return np.array(expanded_penalty) + + else: + + return penalty + + +def _is_contiguous(X) -> bool: + if isinstance(X, np.ndarray): + return X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] + elif isinstance(X, pd.DataFrame): + return _is_contiguous(X.values) + else: + # If not a numpy array or pandas data frame, we assume it is contiguous. + return True + + def _safe_lin_pred( X: Union[MatrixBase, StandardizedMatrix], coef: np.ndarray, @@ -88,7 +194,7 @@ def _safe_sandwich_dot( """ result = X.sandwich(d, rows, cols) if isinstance(result, sparse.dia_matrix): - result = result.A + result = result.toarray() if intercept: dim = result.shape[0] + 1 diff --git a/src/glum_benchmarks/data/simulated_glm.py b/src/glum_benchmarks/data/simulated_glm.py index 19a3175d0..9d5c087cb 100644 --- a/src/glum_benchmarks/data/simulated_glm.py +++ b/src/glum_benchmarks/data/simulated_glm.py @@ -104,7 +104,7 @@ def simulate_glm_data( # Creating sparse component sparse_feature_names = [f"sparse{i}" for i in range(sparse_features)] - X_sparse = sps.random(n_rows, sparse_features, density=sparse_density).A + X_sparse = sps.random(n_rows, sparse_features, density=sparse_density).toarray() X_sparse = pd.DataFrame(data=X_sparse, columns=sparse_feature_names) coefs_sparse = rand.choice([0, 1, -1], size=sparse_features) coefs_sparse = pd.Series(data=coefs_sparse, index=sparse_feature_names) diff --git a/tests/glm/golden_master/simulation_gm.json b/tests/glm/golden_master/simulation_gm.json index 3c69f59bf..660413de6 100644 --- a/tests/glm/golden_master/simulation_gm.json +++ b/tests/glm/golden_master/simulation_gm.json @@ -1,6 +1,6 @@ { "normal": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.5027665204024282, 0.23449539956055546, @@ -36,7 +36,7 @@ "intercept_": 3.026490229054092, "n_iter_": 1 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.5012056522046088, 0.23528722263235485, @@ -72,7 +72,7 @@ "intercept_": 2.0279948791150764, "n_iter_": 1 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.49784759015593427, 0.23166926058137094, @@ -108,7 +108,7 @@ "intercept_": 2.981778440705444, "n_iter_": 1 }, - "default": { + "regularization": { "coef_": [ 0.4985676422254175, 0.22818569911229844, @@ -1478,7 +1478,7 @@ } }, "poisson": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.9604408672344522, 0.4432562524921413, @@ -1514,7 +1514,7 @@ "intercept_": 1.8189178943867188, "n_iter_": 6 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.9817372866211753, 0.49117907395980553, @@ -1550,7 +1550,7 @@ "intercept_": 1.157828764208921, "n_iter_": 6 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.9693196874148616, 0.46707910961062293, @@ -1586,7 +1586,7 @@ "intercept_": 1.8396971485658087, "n_iter_": 6 
}, - "default": { + "regularization": { "coef_": [ 0.9821298947770232, 0.4937841900606277, @@ -2812,7 +2812,7 @@ } }, "gamma": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.4866808417045077, 0.1370793228217412, @@ -2848,7 +2848,7 @@ "intercept_": 5.268950639816242, "n_iter_": 4 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.48972345202083134, 0.24707128799109493, @@ -2884,7 +2884,7 @@ "intercept_": 2.512993119536852, "n_iter_": 4 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.5107634971640694, 0.1783139942111257, @@ -2920,7 +2920,7 @@ "intercept_": 5.272870219406924, "n_iter_": 4 }, - "default": { + "regularization": { "coef_": [ 0.4966531683982075, 0.24896254652599858, @@ -4146,7 +4146,7 @@ } }, "tweedie_p=1.5": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.8740584736837378, 0.39026903329437757, @@ -4182,7 +4182,7 @@ "intercept_": 2.8380327257627473, "n_iter_": 4 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.8592854961617753, 0.42694459825027725, @@ -4218,7 +4218,7 @@ "intercept_": 1.6496674803774887, "n_iter_": 4 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.8763610403720393, 0.4023951463085115, @@ -4254,7 +4254,7 @@ "intercept_": 2.7855262434295343, "n_iter_": 4 }, - "default": { + "regularization": { "coef_": [ 0.860178238544325, 0.43000049156945763, @@ -5480,7 +5480,7 @@ } }, "binomial": { - "default_weights_offset": { + "regularization_weights_offset": { "coef_": [ 0.0645115293284631, 0.03563706184469416, @@ -5516,7 +5516,7 @@ "intercept_": 3.3761974509366994, "n_iter_": 3 }, - "default_weights": { + "regularization_weights": { "coef_": [ 0.06396142685405831, 0.03544619397195947, @@ -5552,7 +5552,7 @@ "intercept_": 2.007458821879875, "n_iter_": 2 }, - "default_offset": { + "regularization_offset": { "coef_": [ 0.059850128940604715, 0.029620907232596274, @@ -5588,7 +5588,7 @@ "intercept_": 3.4202998674202676, "n_iter_": 3 }, - "default": { + "regularization": { "coef_": [ 0.05979957149348005, 0.03233408720147587, diff --git a/tests/glm/test_cv_glm.py b/tests/glm/test_cv_glm.py index 605039ff9..c8dd2923e 100644 --- a/tests/glm/test_cv_glm.py +++ b/tests/glm/test_cv_glm.py @@ -48,8 +48,8 @@ def test_normal_elastic_net_comparison(l1_ratio, fit_intercept, convert_x_fn): y = y[0:n_samples] X, T = X[0:n_samples], X[n_samples:] - x_arr = X if isinstance(X, np.ndarray) else X.A - t_arr = T if isinstance(T, np.ndarray) else T.A + x_arr = X if isinstance(X, np.ndarray) else X.toarray() + t_arr = T if isinstance(T, np.ndarray) else T.toarray() elastic_net = ElasticNetCV( l1_ratio=l1_ratio, n_alphas=n_alphas, diff --git a/tests/glm/test_distribution.py b/tests/glm/test_distribution.py index 6ac2eef5e..e58fd4c04 100644 --- a/tests/glm/test_distribution.py +++ b/tests/glm/test_distribution.py @@ -56,9 +56,9 @@ def test_family_bounds(family, expected): def test_tweedie_distribution_power(): with pytest.raises(ValueError, match="no distribution exists"): TweedieDistribution(power=0.5) - with pytest.raises(TypeError, match="must be an int or float"): + with pytest.raises(TypeError, match="must be numeric"): TweedieDistribution(power=1j) - with pytest.raises(TypeError, match="must be an int or float"): + with pytest.raises(TypeError, match="must be numeric"): dist = TweedieDistribution() dist.power = 1j @@ -89,11 +89,11 @@ def test_tweedie_distribution_parsing(): def test_negative_binomial_distribution_alpha(): - with pytest.raises(ValueError, 
match="must be strictly positive number"): + with pytest.raises(ValueError, match="must be strictly positive"): NegativeBinomialDistribution(theta=-0.5) - with pytest.raises(TypeError, match="must be an int or float"): + with pytest.raises(TypeError, match="must be numeric"): NegativeBinomialDistribution(theta=1j) - with pytest.raises(TypeError, match="must be an int or float"): + with pytest.raises(TypeError, match="must be numeric"): dist = NegativeBinomialDistribution() dist.theta = 1j @@ -119,16 +119,24 @@ def test_negative_binomial_distribution_parsing(): def test_equality(): - assert TweedieDistribution(1) == TweedieDistribution(1) - assert TweedieDistribution(1) == PoissonDistribution() - assert TweedieDistribution(2) == GammaDistribution() - assert PoissonDistribution() == PoissonDistribution() - assert TweedieDistribution(1) != TweedieDistribution(1.5) - assert TweedieDistribution(1) != BinomialDistribution() assert BinomialDistribution() == BinomialDistribution() - assert NegativeBinomialDistribution(1) == NegativeBinomialDistribution(1) - assert NegativeBinomialDistribution(1) != NegativeBinomialDistribution(1.5) + assert GammaDistribution() == GammaDistribution() assert NegativeBinomialDistribution(1) != BinomialDistribution() + assert NegativeBinomialDistribution(1) != NegativeBinomialDistribution(1.5) + assert NegativeBinomialDistribution(1) == NegativeBinomialDistribution(1) + assert NormalDistribution() == NormalDistribution() + assert PoissonDistribution() == PoissonDistribution() + assert TweedieDistribution(0) != NormalDistribution() + assert TweedieDistribution(0) == NormalDistribution().to_tweedie() + assert TweedieDistribution(1) != BinomialDistribution() + assert TweedieDistribution(1) != PoissonDistribution() + assert TweedieDistribution(1) == PoissonDistribution().to_tweedie() + assert TweedieDistribution(1) != TweedieDistribution(1.5) + assert TweedieDistribution(1) == TweedieDistribution(1) + assert TweedieDistribution(2) != GammaDistribution() + assert TweedieDistribution(2) == GammaDistribution().to_tweedie() + assert TweedieDistribution(3) != InverseGaussianDistribution() + assert TweedieDistribution(3) == InverseGaussianDistribution().to_tweedie() @pytest.mark.parametrize( @@ -234,7 +242,7 @@ def test_hessian_matrix(family, link, true_hessian): dispersion = 0.5 rng = np.random.RandomState(42) X = tm.DenseMatrix(rng.randn(10, 5)) - lin_pred = np.dot(X, coef) + lin_pred = X.matvec(coef) mu = link.inverse(lin_pred) sample_weight = rng.randn(10) ** 2 + 1 _, hessian_rows = family.rowwise_gradient_hessian( @@ -257,7 +265,7 @@ def test_hessian_matrix(family, link, true_hessian): for i in range(coef.shape[0]): def f(coef): - this_eta = X.dot(coef) + this_eta = X.matvec(coef) this_mu = link.inverse(this_eta) yv = mu if true_hessian: @@ -296,7 +304,6 @@ def test_poisson_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -7.390977 (df=1) regressor = GeneralizedLinearRegressor( - alpha=0, family="poisson", fit_intercept=False, gradient_tol=1e-8, @@ -345,7 +352,6 @@ def test_gamma_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -7.057068 (df=2) regressor = GeneralizedLinearRegressor( - alpha=0, family="gamma", fit_intercept=False, gradient_tol=1e-8, @@ -393,7 +399,6 @@ def test_gaussian_deviance_dispersion_loglihood(family, weighted): # logLik(glm_model) # -7.863404 (df=2) regressor = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=False, gradient_tol=1e-8, @@ -441,7 +446,6 @@ def 
test_tweedie_deviance_dispersion_loglihood(weighted): # logLiktweedie(glm_model) # -8.35485 regressor = GeneralizedLinearRegressor( - alpha=0, family=TweedieDistribution(1.5), fit_intercept=False, gradient_tol=1e-8, @@ -490,7 +494,6 @@ def test_binomial_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -3.365058 (df=1) regressor = GeneralizedLinearRegressor( - alpha=0, family="binomial", fit_intercept=False, gradient_tol=1e-8, @@ -535,7 +538,6 @@ def test_negative_binomial_deviance_dispersion_loglihood(weighted): # logLik(glm_model) # -4.187887 (df=1) regressor = GeneralizedLinearRegressor( - alpha=0, family="negative.binomial", fit_intercept=False, gradient_tol=1e-8, diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 20f6bd876..c4dcdd222 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -5,11 +5,14 @@ import warnings from typing import Any, Optional, Union +import formulaic import numpy as np import pandas as pd import pytest import statsmodels.api as sm +import statsmodels.formula.api as smf import tabmat as tm +from formulaic import Formula from numpy.testing import assert_allclose from scipy import optimize, sparse from sklearn.base import clone @@ -33,7 +36,12 @@ TweedieDistribution, guess_intercept, ) -from glum._glm import GeneralizedLinearRegressor, _unstandardize, is_pos_semidef +from glum._glm import ( + GeneralizedLinearRegressor, + _parse_formula, + _unstandardize, + is_pos_semidef, +) from glum._link import IdentityLink, Link, LogitLink, LogLink GLM_SOLVERS = ["irls-ls", "lbfgs", "irls-cd", "trust-constr"] @@ -45,7 +53,7 @@ def get_small_x_y( - estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV] + estimator: Union[GeneralizedLinearRegressor, GeneralizedLinearRegressorCV], ) -> tuple[np.ndarray, np.ndarray]: if isinstance(estimator, GeneralizedLinearRegressor): n_rows = 1 @@ -195,7 +203,7 @@ def test_gradient_tol_setting(estimator, kwargs, solver, gradient_tol): ) def test_glm_family_argument(f, fam, y, X): """Test GLM family argument set as string.""" - glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + glm = GeneralizedLinearRegressor(family=f).fit(X, y) assert isinstance(glm._family_instance, fam.__class__) @@ -354,6 +362,46 @@ def test_P1_P2_expansion_with_categoricals(): np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) +def test_P1_P2_expansion_with_categoricals_missings(): + rng = np.random.default_rng(42) + X = pd.DataFrame( + data={ + "dense": np.linspace(0, 10, 60), + "cat": pd.Categorical(rng.integers(5, size=60)).remove_categories(0), + } + ) + y = rng.normal(size=60) + + mdl1 = GeneralizedLinearRegressor( + alpha=1.0, + l1_ratio=0.01, + P1=[1, 2, 2, 2, 2, 2], + P2=[2, 1, 1, 1, 1, 1], + cat_missing_method="convert", + ) + mdl1.fit(X, y) + + mdl2 = GeneralizedLinearRegressor( + alpha=1.0, + l1_ratio=0.01, + P1=[1, 2], + P2=[2, 1], + cat_missing_method="convert", + ) + mdl2.fit(X, y) + np.testing.assert_allclose(mdl1.coef_, mdl2.coef_) + + mdl3 = GeneralizedLinearRegressor( + alpha=1.0, + l1_ratio=0.01, + P1=[1, 2], + P2=sparse.diags([2, 1, 1, 1, 1, 1]), + cat_missing_method="convert", + ) + mdl3.fit(X, y) + np.testing.assert_allclose(mdl1.coef_, mdl3.coef_) + + @pytest.mark.parametrize( "estimator", [GeneralizedLinearRegressor, GeneralizedLinearRegressorCV] ) @@ -382,7 +430,10 @@ def test_glm_fit_intercept_argument(estimator, fit_intercept): ) def test_glm_solver_argument(estimator, solver, l1_ratio, y, X): """Test GLM for invalid solver argument.""" - glm = estimator(solver=solver, 
l1_ratio=l1_ratio) + kwargs = {"solver": solver, "l1_ratio": l1_ratio} + if estimator == GeneralizedLinearRegressor: + kwargs["alpha"] = 1.0 + glm = estimator(**kwargs) with pytest.raises(ValueError): glm.fit(X, y) @@ -431,7 +482,10 @@ def test_glm_warm_start_argument(estimator, warm_start): def test_glm_warm_start_with_constant_column(estimator): X, y = make_regression() X[:, 0] = 0 - glm = estimator(warm_start=True) + kwargs = {"warm_start": True} + if estimator == GeneralizedLinearRegressor: + kwargs["alpha"] = 1.0 + glm = estimator(**kwargs) glm.fit(X, y) glm.fit(X, y) @@ -530,7 +584,6 @@ def test_glm_identity_regression(solver, fit_intercept, offset, convert_x_fn): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) + (0 if offset is None else offset) glm = GeneralizedLinearRegressor( - alpha=0, family="normal", link="identity", fit_intercept=fit_intercept, @@ -563,7 +616,9 @@ def test_get_diagnostics( glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept, solver=solver) res = glm.fit(X, y) - diagnostics = res.get_formatted_diagnostics(full_report, custom_columns) + diagnostics = res.get_formatted_diagnostics( + full_report=full_report, custom_columns=custom_columns + ) if solver in ("lbfgs", "trust-constr"): assert diagnostics == "solver does not report diagnostics" else: @@ -625,7 +680,7 @@ def test_report_diagnostics( f = io.StringIO() with redirect_stdout(f): - res.report_diagnostics(full_report, custom_columns) + res.report_diagnostics(full_report=full_report, custom_columns=custom_columns) printed = f.getvalue() # Something should be printed assert len(printed) > 0 @@ -650,7 +705,6 @@ def test_x_not_modified_inplace(solver, fit_intercept, offset, convert_x_fn): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) + (0 if offset is None else offset) glm = GeneralizedLinearRegressor( - alpha=0, family="normal", link="identity", fit_intercept=fit_intercept, @@ -667,7 +721,7 @@ def test_x_not_modified_inplace(solver, fit_intercept, offset, convert_x_fn): if isinstance(X, np.ndarray): np.testing.assert_almost_equal(X, X_before) else: - np.testing.assert_almost_equal(X.A, X_before.A) + np.testing.assert_almost_equal(X.toarray(), X_before.toarray()) @pytest.mark.parametrize("solver", GLM_SOLVERS) @@ -692,7 +746,6 @@ def test_glm_identity_regression_categorical_data(solver, offset, convert_x_fn): y = np.dot(x_mat, coef) + (0 if offset is None else offset) glm = GeneralizedLinearRegressor( - alpha=0, family="normal", link="identity", fit_intercept=False, @@ -700,7 +753,7 @@ def test_glm_identity_regression_categorical_data(solver, offset, convert_x_fn): gradient_tol=1e-7, ) X = convert_x_fn(x_mat) - np.testing.assert_almost_equal(X.A if hasattr(X, "A") else X, x_mat) + np.testing.assert_almost_equal(X.toarray() if hasattr(X, "toarray") else X, x_mat) res = glm.fit(X, y, offset=offset) assert_allclose(res.coef_, coef, rtol=1e-6) @@ -731,7 +784,6 @@ def test_glm_log_regression(family, solver, tol, fit_intercept, offset): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef) + (0 if offset is None else offset)) glm = GeneralizedLinearRegressor( - alpha=0, family=family, link="log", fit_intercept=fit_intercept, @@ -1205,7 +1257,6 @@ def test_binomial_cloglog_unregularized(solver): sm_fit = sm_glm.fit() glum_glm = GeneralizedLinearRegressor( - alpha=0, family="binomial", link="cloglog", solver=solver, @@ -1267,11 +1318,11 @@ def test_binomial_enet(alpha): @pytest.mark.parametrize( "params", [ - {"solver": "irls-ls"}, - 
{"solver": "lbfgs"}, - {"solver": "trust-constr"}, - {"solver": "irls-cd", "selection": "cyclic"}, - {"solver": "irls-cd", "selection": "random"}, + {"solver": "irls-ls", "alpha": 1.0}, + {"solver": "lbfgs", "alpha": 1.0}, + {"solver": "trust-constr", "alpha": 1.0}, + {"solver": "irls-cd", "selection": "cyclic", "alpha": 1.0}, + {"solver": "irls-cd", "selection": "random", "alpha": 1.0}, ], ids=lambda params: ", ".join(f"{key}={val}" for key, val in params.items()), ) @@ -1283,7 +1334,7 @@ def test_solver_equivalence(params, use_offset, regression_data): offset = np.random.random(len(y)) else: offset = None - est_ref = GeneralizedLinearRegressor(random_state=2) + est_ref = GeneralizedLinearRegressor(random_state=2, alpha=1.0) est_ref.fit(X, y, offset=offset) est_2 = GeneralizedLinearRegressor(**params) @@ -1398,24 +1449,24 @@ def _arrays_share_data(arr1: np.ndarray, arr2: np.ndarray) -> bool: assert _arrays_share_data(X.mat.indptr, M.indptr) else: # Check that the underlying data pointer is the same - assert _arrays_share_data(X.mat, M) + assert _arrays_share_data(X.mat.unpack(), M.unpack()) np.testing.assert_almost_equal(col_means, col_mults) # After standardization, all the columns will have the same values. # To check that, just convert to dense first. if use_sparse: - Xdense = X.A + Xdense = X.toarray() else: Xdense = X for i in range(1, NC): if scale_predictors: if isinstance(Xdense, tm.StandardizedMatrix): - one, two = Xdense.A[:, 0], Xdense.A[:, i] + one, two = Xdense.toarray()[:, 0], Xdense.toarray()[:, i] else: one, two = Xdense[:, 0], Xdense[:, i] else: if isinstance(Xdense, tm.StandardizedMatrix): - one, two = (i + 1) * Xdense.A[:, 0], Xdense.A[:, i] + one, two = (i + 1) * Xdense.toarray()[:, 0], Xdense.toarray()[:, i] else: one, two = (i + 1) * Xdense[:, 0], Xdense[:, i] np.testing.assert_almost_equal(one, two) @@ -1720,7 +1771,7 @@ def test_passing_noncontiguous_as_X(): "X, feature_names", [ (pd.DataFrame({"x1": np.arange(5), "x2": 2}), np.array(["x1", "x2"])), - (pd.DataFrame({"x1": np.arange(5), "x2": 2}).to_numpy(), None), + (pd.DataFrame({"x1": np.arange(5), "x2": 2}).to_numpy(), ["_col_0", "_col_1"]), ( pd.DataFrame({"x1": pd.Categorical(np.arange(5)), "x2": 2}), np.array(["x1__0", "x1__1", "x1__2", "x1__3", "x1__4", "x2"]), @@ -1734,13 +1785,122 @@ def test_passing_noncontiguous_as_X(): ), np.array(["x1__0", "x1__1", "x1__2", "x1__3", "x1__4", "x2__2"]), ), + ( + tm.SplitMatrix( + [ + tm.CategoricalMatrix( + np.arange(5), column_name_format="{name}__{category}" + ), + tm.DenseMatrix(np.ones((5, 1))), + ] + ), + np.array( + [ + "_col_0-4__0", + "_col_0-4__1", + "_col_0-4__2", + "_col_0-4__3", + "_col_0-4__4", + "_col_5", + ] + ), + ), ], ) -def test_feature_names(X, feature_names): - model = GeneralizedLinearRegressor(family="poisson").fit(X, np.arange(5)) +def test_feature_names_underscores(X, feature_names): + model = GeneralizedLinearRegressor( + family="poisson", categorical_format="{name}__{category}", alpha=1.0 + ).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, "feature_names_", None), feature_names) +@pytest.mark.parametrize( + "X, feature_names", + [ + (pd.DataFrame({"x1": np.arange(5), "x2": 2}), np.array(["x1", "x2"])), + (pd.DataFrame({"x1": np.arange(5), "x2": 2}).to_numpy(), ["_col_0", "_col_1"]), + ( + pd.DataFrame({"x1": pd.Categorical(np.arange(5)), "x2": 2}), + np.array(["x1[0]", "x1[1]", "x1[2]", "x1[3]", "x1[4]", "x2"]), + ), + ( + pd.DataFrame( + { + "x1": pd.Categorical(np.arange(5)), + "x2": pd.Categorical([2, 2, 2, 2, 2]), + } + 
), + np.array(["x1[0]", "x1[1]", "x1[2]", "x1[3]", "x1[4]", "x2[2]"]), + ), + ( + tm.SplitMatrix( + [ + tm.CategoricalMatrix( + np.arange(5), column_name_format="{name}[{category}]" + ), + tm.DenseMatrix(np.ones((5, 1))), + ] + ), + np.array( + [ + "_col_0-4[0]", + "_col_0-4[1]", + "_col_0-4[2]", + "_col_0-4[3]", + "_col_0-4[4]", + "_col_5", + ] + ), + ), + ], +) +def test_feature_names_brackets(X, feature_names): + model = GeneralizedLinearRegressor( + family="poisson", categorical_format="{name}[{category}]", alpha=1.0 + ).fit(X, np.arange(5)) + np.testing.assert_array_equal(getattr(model, "feature_names_", None), feature_names) + + +@pytest.mark.parametrize( + "X, term_names", + [ + (pd.DataFrame({"x1": np.arange(5), "x2": 2}), np.array(["x1", "x2"])), + (pd.DataFrame({"x1": np.arange(5), "x2": 2}).to_numpy(), ["_col_0", "_col_1"]), + ( + pd.DataFrame({"x1": pd.Categorical(np.arange(5)), "x2": 2}), + np.array(["x1", "x1", "x1", "x1", "x1", "x2"]), + ), + ( + pd.DataFrame( + { + "x1": pd.Categorical(np.arange(5)), + "x2": pd.Categorical([2, 2, 2, 2, 2]), + } + ), + np.array(["x1", "x1", "x1", "x1", "x1", "x2"]), + ), + ( + tm.SplitMatrix( + [tm.CategoricalMatrix(np.arange(5)), tm.DenseMatrix(np.ones((5, 1)))] + ), + np.array( + [ + "_col_0-4", + "_col_0-4", + "_col_0-4", + "_col_0-4", + "_col_0-4", + "_col_5", + ] + ), + ), + ], +) +def test_term_names(X, term_names): + model = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X, np.arange(5)) + np.testing.assert_array_equal(getattr(model, "term_names_", None), term_names) + + @pytest.mark.parametrize( "X, dtypes", [ @@ -1753,7 +1913,7 @@ def test_feature_names(X, feature_names): ], ) def test_feature_dtypes(X, dtypes): - model = GeneralizedLinearRegressor(family="poisson").fit(X, np.arange(5)) + model = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X, np.arange(5)) np.testing.assert_array_equal(getattr(model, "feature_dtypes_", None), dtypes) @@ -1776,12 +1936,12 @@ def test_categorical_types(k, n): # use categorical types X_cat = pd.DataFrame({"group": pd.Categorical(group, categories=categories)}) - model_cat = GeneralizedLinearRegressor(family="poisson").fit(X_cat, y) + model_cat = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X_cat, y) pred_cat = model_cat.predict(X_cat) # use one-hot encoding X_oh = pd.get_dummies(X_cat, dtype=float) - model_oh = GeneralizedLinearRegressor(family="poisson").fit(X_oh, y) + model_oh = GeneralizedLinearRegressor(family="poisson", alpha=1.0).fit(X_oh, y) pred_oh = model_oh.predict(X_oh) # check predictions @@ -1832,7 +1992,7 @@ def test_verbose(regression_data, capsys): def test_ols_std_errors(regression_data): X, y = regression_data - mdl = GeneralizedLinearRegressor(alpha=0, family="normal") + mdl = GeneralizedLinearRegressor(family="normal") mdl.fit(X=X, y=y) mdl_sm = sm.OLS(endog=y, exog=sm.add_constant(X)) @@ -1875,9 +2035,9 @@ def test_array_std_errors(regression_data, family, fit_intercept): sm_family = sm.families.Gaussian() dispersion = None - mdl = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=fit_intercept - ).fit(X=X, y=y) + mdl = GeneralizedLinearRegressor(family=family, fit_intercept=fit_intercept).fit( + X=X, y=y + ) if fit_intercept: mdl_sm = sm.GLM(endog=y, exog=sm.add_constant(X), family=sm_family) @@ -1909,7 +2069,7 @@ def test_array_std_errors(regression_data, family, fit_intercept): def test_sparse_std_errors(regression_data): X, y = regression_data sp_X = sparse.csc_matrix(X) - mdl = GeneralizedLinearRegressor(alpha=0, 
family="normal") + mdl = GeneralizedLinearRegressor(family="normal") mdl.fit(X=X, y=y) actual1 = mdl.std_errors(X=sp_X, y=y, robust=False) @@ -1951,9 +2111,7 @@ def test_inputtype_std_errors(regression_data, categorical, split, fit_intercept tm.CategoricalMatrix(pd.Categorical(group, categories=categories)), ] ) - mdl = GeneralizedLinearRegressor( - alpha=0, family="normal", fit_intercept=fit_intercept - ) + mdl = GeneralizedLinearRegressor(family="normal", fit_intercept=fit_intercept) mdl.fit(X=X, y=y) if isinstance(X, tm.MatrixBase): X_sm = X.toarray() @@ -1985,14 +2143,14 @@ def test_inputtype_std_errors(regression_data, categorical, split, fit_intercept @pytest.mark.parametrize("fit_intercept", [True, False]) -@pytest.mark.parametrize("significance_level", [0.01, 0.05]) -def test_coef_table(regression_data, fit_intercept, significance_level): +@pytest.mark.parametrize("confidence_level", [0.95, 0.99]) +def test_coef_table(regression_data, fit_intercept, confidence_level): X, y = regression_data colnames = ["dog", "cat", "bat", "cow", "eel", "fox", "bee", "owl", "pig", "rat"] X_df = pd.DataFrame(X, columns=colnames) mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=fit_intercept + family="gaussian", fit_intercept=fit_intercept ).fit(X=X_df, y=y) if fit_intercept: @@ -2003,7 +2161,7 @@ def test_coef_table(regression_data, fit_intercept, significance_level): # Make the covariance matrices the same to focus on the coefficient table mdl.covariance_matrix_ = mdl_sm.fit(cov_type="nonrobust").cov_params() - our_table = mdl.coef_table() + our_table = mdl.coef_table(confidence_level=confidence_level) if fit_intercept: colnames = ["intercept"] + colnames @@ -2014,7 +2172,9 @@ def test_coef_table(regression_data, fit_intercept, significance_level): np.testing.assert_allclose(our_table["t_value"], fit_sm.tvalues, rtol=1e-8) np.testing.assert_allclose(our_table["p_value"], fit_sm.pvalues, atol=1e-8) np.testing.assert_allclose( - our_table[["ci_lower", "ci_upper"]], fit_sm.conf_int(), rtol=1e-8 + our_table[["ci_lower", "ci_upper"]], + fit_sm.conf_int(alpha=1 - confidence_level), + rtol=1e-8, ) @@ -2053,9 +2213,9 @@ def test_wald_test_matrix(regression_data, family, fit_intercept, R, r): sm_family = sm.families.Gaussian() dispersion = None - mdl = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=fit_intercept - ).fit(X=X, y=y) + mdl = GeneralizedLinearRegressor(family=family, fit_intercept=fit_intercept).fit( + X=X, y=y + ) if fit_intercept: mdl_sm = sm.GLM(endog=y, exog=sm.add_constant(X), family=sm_family) @@ -2127,9 +2287,9 @@ def test_wald_test_matrix(regression_data, family, fit_intercept, R, r): def test_wald_test_matrix_public(regression_data, R, r): X, y = regression_data - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True - ).fit(X=X, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True).fit( + X=X, y=y, store_covariance_matrix=True + ) assert mdl._wald_test_matrix(R, r) == mdl.wald_test(R=R, r=r) @@ -2150,9 +2310,9 @@ def test_wald_test_matrix_public(regression_data, R, r): def test_wald_test_matrix_fixed_cov(regression_data, R, r): X, y = regression_data - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=False - ).fit(X=X, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=False).fit( + X=X, y=y, store_covariance_matrix=True + ) mdl_sm = sm.GLM(endog=y, exog=X, 
family=sm.families.Gaussian()) # Use the same covariance matrix for both so that we can use tighter tolerances @@ -2195,9 +2355,9 @@ def test_wald_test_feature_names(regression_data, names, R, r): X, y = regression_data X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) - mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True - ).fit(X=X_df, y=y, store_covariance_matrix=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True).fit( + X=X_df, y=y, store_covariance_matrix=True + ) feature_names_results = mdl._wald_test_feature_names(names, r) if r is not None: @@ -2236,16 +2396,232 @@ def test_wald_test_feature_names_public(regression_data, names, r): X, y = regression_data X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True).fit( + X=X_df, y=y, store_covariance_matrix=True + ) + + assert mdl._wald_test_feature_names(names, r) == mdl.wald_test(features=names, r=r) + + +@pytest.mark.parametrize( + "names, R, r, r_feat", + [ + pytest.param(["col_1"], np.array([[0, 1] + 5 * [0]]), None, None, id="single"), + pytest.param( + ["col_1", "col_2"], + np.array([[0, 1, 0] + 4 * [0], [0, 0, 1] + 4 * [0]]), + None, + None, + id="multiple", + ), + pytest.param( + ["term_3"], + np.hstack( + ( + np.zeros((4, 3)), + np.eye(4), + ) + ), + None, + None, + id="multifeature", + ), + pytest.param( + ["term_3"], + np.hstack( + ( + np.zeros((4, 3)), + np.eye(4), + ) + ), + [1], + [1] * 4, + id="rhs_not_zero", + ), + pytest.param( + ["intercept", "col_1"], + np.array([[1, 0] + 5 * [0], [0, 1] + 5 * [0]]), + [1, 2], + [1, 2], + id="intercept", + ), + ], +) +def test_wald_test_term_names(regression_data, names, R, r, r_feat): + X, y = regression_data + X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) + X_df = X_df[["col_1", "col_2"]].assign(term_3=pd.cut(X_df["col_3"], bins=5)) + mdl = GeneralizedLinearRegressor( - alpha=0, family="gaussian", fit_intercept=True + family="gaussian", fit_intercept=True, drop_first=True ).fit(X=X_df, y=y, store_covariance_matrix=True) - assert mdl._wald_test_feature_names(names, r) == mdl.wald_test(features=names, r=r) + term_names_results = mdl._wald_test_term_names(names, r) + + if r is not None: + r_feat = np.array(r_feat) # wald_test_matrix expects an optional numpy array + matrix_results = mdl._wald_test_matrix(R, r_feat) + + np.testing.assert_equal( + term_names_results.test_statistic, matrix_results.test_statistic + ) + np.testing.assert_equal(term_names_results.p_value, matrix_results.p_value) + assert term_names_results.df == matrix_results.df + + +@pytest.mark.parametrize( + "names, R, r, r_feat", + [ + pytest.param(["col_1"], np.array([[0, 1] + 5 * [0]]), None, None, id="single"), + pytest.param( + ["col_1", "col_2"], + np.array([[0, 1, 0] + 4 * [0], [0, 0, 1] + 4 * [0]]), + None, + None, + id="multiple", + ), + pytest.param( + ["term_3"], + np.hstack( + ( + np.zeros((4, 3)), + np.eye(4), + ) + ), + None, + None, + id="multifeature", + ), + pytest.param( + ["term_3"], + np.hstack( + ( + np.zeros((4, 3)), + np.eye(4), + ) + ), + [1], + [1] * 4, + id="rhs_not_zero", + ), + pytest.param( + ["intercept", "col_1"], + np.array([[1, 0] + 5 * [0], [0, 1] + 5 * [0]]), + [1, 2], + [1, 2], + id="intercept", + ), + ], +) +def test_wald_test_term_names_public(regression_data, names, R, r, r_feat): + X, y = regression_data + X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) + X_df = 
X_df[["col_1", "col_2"]].assign(term_3=pd.cut(X_df["col_3"], bins=5)) + + mdl = GeneralizedLinearRegressor( + family="gaussian", fit_intercept=True, drop_first=True + ).fit(X=X_df, y=y, store_covariance_matrix=True) + + term_names_results = mdl.wald_test(terms=names, r=r) + + if r is not None: + r_feat = np.array(r_feat) # wald_test_matrix expects an optional numpy array + matrix_results = mdl._wald_test_matrix(R, r_feat) + + np.testing.assert_equal( + term_names_results.test_statistic, matrix_results.test_statistic + ) + np.testing.assert_equal(term_names_results.p_value, matrix_results.p_value) + assert term_names_results.df == matrix_results.df + + +@pytest.mark.parametrize( + "formula, R, r_feat", + [ + pytest.param("col_0 = 0", np.array([[0, 1] + 9 * [0]]), None, id="single"), + pytest.param( + "col_0 = 0, col_1 = 0", + np.array([[0, 1, 0] + 8 * [0], [0, 0, 1] + 8 * [0]]), + None, + id="multiple", + ), + pytest.param( + "intercept = 1, col_0 = 2", + np.array([[1, 0] + 9 * [0], [0, 1] + 9 * [0]]), + [1, 2], + id="intercept", + ), + ], +) +def test_wald_test_formula(regression_data, formula, R, r_feat): + X, y = regression_data + X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) + + mdl = GeneralizedLinearRegressor( + family="gaussian", fit_intercept=True, drop_first=True + ).fit(X=X_df, y=y, store_covariance_matrix=True) + + term_names_results = mdl._wald_test_formula(formula) + + if r_feat is not None: + r_feat = np.array(r_feat) # wald_test_matrix expects an optional numpy array + matrix_results = mdl._wald_test_matrix(R, r_feat) + + np.testing.assert_equal( + term_names_results.test_statistic, matrix_results.test_statistic + ) + np.testing.assert_equal(term_names_results.p_value, matrix_results.p_value) + assert term_names_results.df == matrix_results.df + + +@pytest.mark.parametrize( + "formula, R, r_feat", + [ + pytest.param("col_0 = 0", np.array([[0, 1] + 9 * [0]]), None, id="single"), + pytest.param( + "col_0 = 0, col_1 = 0", + np.array([[0, 1, 0] + 8 * [0], [0, 0, 1] + 8 * [0]]), + None, + id="multiple", + ), + pytest.param( + "col_0 + col_1 = 2 * col_2 - 1", + np.array([[0, 1, 1, -2] + 7 * [0]]), + [-1], + id="combination", + ), + pytest.param( + "intercept = 1, col_0 = 2", + np.array([[1, 0] + 9 * [0], [0, 1] + 9 * [0]]), + [1, 2], + id="intercept", + ), + ], +) +def test_wald_test_formula_public(regression_data, formula, R, r_feat): + X, y = regression_data + X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) + + mdl = GeneralizedLinearRegressor( + family="gaussian", fit_intercept=True, drop_first=True + ).fit(X=X_df, y=y, store_covariance_matrix=True) + + term_names_results = mdl.wald_test(formula=formula) + + if r_feat is not None: + r_feat = np.array(r_feat) # wald_test_matrix expects an optional numpy array + matrix_results = mdl._wald_test_matrix(R, r_feat) + + np.testing.assert_equal( + term_names_results.test_statistic, matrix_results.test_statistic + ) + np.testing.assert_equal(term_names_results.p_value, matrix_results.p_value) + assert term_names_results.df == matrix_results.df def test_wald_test_raise_on_wrong_input(regression_data): X, y = regression_data - mdl = GeneralizedLinearRegressor(alpha=0, family="gaussian", fit_intercept=True) + mdl = GeneralizedLinearRegressor(family="gaussian", fit_intercept=True) mdl.fit(X=X, y=y) with pytest.raises(ValueError): @@ -2260,7 +2636,6 @@ def test_wald_test_raise_on_wrong_input(regression_data): @pytest.mark.parametrize("weighted", [False, True]) def 
test_score_method(as_data_frame, offset, weighted): regressor = GeneralizedLinearRegressor( - alpha=0, family="normal", fit_intercept=False, gradient_tol=1e-8, @@ -2294,7 +2669,7 @@ def test_score_method(as_data_frame, offset, weighted): def test_information_criteria(regression_data): X, y = regression_data - regressor = GeneralizedLinearRegressor(family="gaussian", alpha=0) + regressor = GeneralizedLinearRegressor(family="gaussian") regressor.fit(X, y) llf = regressor.family_instance.log_likelihood(y, regressor.predict(X)) @@ -2310,7 +2685,6 @@ def test_information_criteria(regression_data): ) -@pytest.mark.skip(reason="Skip while future warnings are raised.") @pytest.mark.filterwarnings("ignore: There is no") def test_information_criteria_raises_correct_warnings_and_errors(regression_data): X, y = regression_data @@ -2350,23 +2724,22 @@ def test_drop_first_allows_alpha_equals_0(): rng = np.random.default_rng(42) y = np.random.normal(size=10) X = pd.DataFrame(data={"cat": pd.Categorical(rng.integers(2, size=10))}) - regressor = GeneralizedLinearRegressor(alpha=0, drop_first=True) + regressor = GeneralizedLinearRegressor(drop_first=True) regressor.fit(X, y) - regressor = GeneralizedLinearRegressor(alpha=0) # default is False + regressor = GeneralizedLinearRegressor() # default is False with pytest.raises(np.linalg.LinAlgError): regressor.fit(X, y) -def test_error_on_distinct_categorical_column(): +def test_dropping_distinct_categorical_column(): y = np.random.normal(size=10) - X = pd.DataFrame(data={"cat": pd.Categorical(np.ones(10))}) - regressor = GeneralizedLinearRegressor(alpha=0, drop_first=True) - with pytest.raises(ValueError): - regressor.fit(X, y) - - regressor = GeneralizedLinearRegressor(alpha=0) + X = pd.DataFrame(data={"cat": pd.Categorical(np.ones(10)), "num": np.ones(10)}) + regressor = GeneralizedLinearRegressor(drop_first=True) regressor.fit(X, y) + assert regressor.coef_.shape == (1,) + assert regressor.feature_names_ == ["num"] + assert regressor.term_names_ == ["num"] def test_P1_P2_with_drop_first(): @@ -2399,7 +2772,6 @@ def test_store_covariance_matrix( regressor = GeneralizedLinearRegressor( family="gaussian", - alpha=0, robust=robust, expected_information=expected_information, ) @@ -2420,10 +2792,52 @@ def test_store_covariance_matrix( ) +@pytest.mark.parametrize( + "formula", ["y ~ col_1 + col_2", "col_1 + col_2"], ids=["two-sided", "one-sided"] +) +def test_store_covariance_matrix_formula(regression_data, formula): + X, y = regression_data + df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) + + if "~" in formula: + df["y"] = y + y = None + + regressor = GeneralizedLinearRegressor( + formula=formula, + family="gaussian", + ) + regressor.fit(df, y, store_covariance_matrix=True) + + np.testing.assert_array_almost_equal( + regressor.covariance_matrix(df, y), + regressor.covariance_matrix(), + ) + + np.testing.assert_array_almost_equal( + regressor.std_errors(df, y), + regressor.std_errors(), + ) + + +def test_store_covariance_matrix_formula_errors(regression_data): + X, y = regression_data + df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) + formula = "col_1 + col_2" + + regressor = GeneralizedLinearRegressor( + formula=formula, + family="gaussian", + ) + regressor.fit(df, y) + with pytest.raises(ValueError, match="Either X and y must be provided"): + regressor.covariance_matrix(df) + + def test_store_covariance_matrix_errors(regression_data): X, y = regression_data - regressor = GeneralizedLinearRegressor(family="gaussian", 
alpha=0) + regressor = GeneralizedLinearRegressor(family="gaussian") regressor.fit(X, y, store_covariance_matrix=False) with pytest.raises(ValueError, match="Either X and y must be provided"): @@ -2516,3 +2930,361 @@ def test_store_covariance_matrix_cv( new_covariance_matrix, stored_covariance_matrix, ) + + +@pytest.mark.parametrize( + "input, expected", + [ + pytest.param( + "y ~ x1 + x2", + (["y"], ["1", "x1", "x2"]), + id="implicit_intercept", + ), + pytest.param( + "y ~ x1 + x2 + 1", + (["y"], ["1", "x1", "x2"]), + id="explicit_intercept", + ), + pytest.param( + "y ~ x1 + x2 - 1", + (["y"], ["x1", "x2"]), + id="no_intercept", + ), + pytest.param( + "y ~ ", + (["y"], ["1"]), + id="empty_rhs", + ), + ], +) +def test_parse_formula(input, expected): + lhs_exp, rhs_exp = expected + lhs, rhs = _parse_formula(input) + assert list(lhs) == lhs_exp + assert list(rhs) == rhs_exp + + formula = Formula(input) + lhs, rhs = _parse_formula(formula) + assert list(lhs) == lhs_exp + assert list(rhs) == rhs_exp + + +@pytest.mark.parametrize( + "input, error", + [ + pytest.param("y1 + y2 ~ x1 + x2", ValueError, id="multiple_lhs"), + pytest.param([["y"], ["x1", "x2"]], TypeError, id="wrong_type"), + ], +) +def test_parse_formula_invalid(input, error): + with pytest.raises(error): + _parse_formula(input) + + +@pytest.fixture +def get_mixed_data(): + nrow = 10 + np.random.seed(0) + return pd.DataFrame( + { + "y": np.random.rand(nrow), + "x1": np.random.rand(nrow), + "x2": np.random.rand(nrow), + "c1": np.random.choice(["a", "b", "c"], nrow), + "c2": np.random.choice(["d", "e"], nrow), + } + ) + + +@pytest.mark.parametrize( + "formula", + [ + pytest.param("y ~ x1 + x2", id="numeric"), + pytest.param("y ~ c1", id="categorical"), + pytest.param("y ~ c1 * c2", id="categorical_interaction"), + pytest.param("y ~ x1 + x2 + c1 + c2", id="numeric_categorical"), + pytest.param("y ~ x1 * c1 * c2", id="numeric_categorical_interaction"), + ], +) +@pytest.mark.parametrize( + "drop_first", [True, False], ids=["drop_first", "no_drop_first"] +) +@pytest.mark.parametrize( + "fit_intercept", [True, False], ids=["intercept", "no_intercept"] +) +def test_formula(get_mixed_data, formula, drop_first, fit_intercept): + """Model with formula and model with externally constructed model matrix should match.""" + data = get_mixed_data + + model_formula = GeneralizedLinearRegressor( + family="normal", + drop_first=drop_first, + formula=formula, + fit_intercept=fit_intercept, + categorical_format="{name}[T.{category}]", + alpha=1.0, + ).fit(data) + + if fit_intercept: + # full rank check must consider presence of intercept + y_ext, X_ext = formulaic.model_matrix( + formula, data, ensure_full_rank=drop_first + ) + X_ext = X_ext.drop(columns="Intercept") + else: + y_ext, X_ext = formulaic.model_matrix( + formula + "-1", data, ensure_full_rank=drop_first + ) + y_ext = y_ext.iloc[:, 0] + + model_ext = GeneralizedLinearRegressor( + family="normal", + drop_first=drop_first, + fit_intercept=fit_intercept, + categorical_format="{name}[T.{category}]", + alpha=1.0, + ).fit(X_ext, y_ext) + + np.testing.assert_almost_equal(model_ext.coef_, model_formula.coef_) + np.testing.assert_array_equal( + model_ext.feature_names_, model_formula.feature_names_ + ) + + +def test_formula_explicit_intercept(get_mixed_data): + data = get_mixed_data + + with pytest.raises(ValueError, match="The formula sets the intercept to False"): + GeneralizedLinearRegressor( + family="normal", + formula="y ~ x1 - 1", + fit_intercept=True, + ).fit(data) + + 
+@pytest.mark.parametrize( + "formula, feature_names, term_names", + [ + pytest.param("y ~ x1 + x2", ["x1", "x2"], ["x1", "x2"], id="numeric"), + pytest.param( + "y ~ c1", ["c1[T.a]", "c1[T.b]", "c1[T.c]"], 3 * ["c1"], id="categorical" + ), + pytest.param( + "y ~ x1 : c1", + ["x1:c1[T.a]", "x1:c1[T.b]", "x1:c1[T.c]"], + 3 * ["x1:c1"], + id="interaction", + ), + pytest.param( + "y ~ poly(x1, 3)", + ["poly(x1, 3)[1]", "poly(x1, 3)[2]", "poly(x1, 3)[3]"], + 3 * ["poly(x1, 3)"], + id="function", + ), + ], +) +def test_formula_names_formulaic_style( + get_mixed_data, formula, feature_names, term_names +): + data = get_mixed_data + model_formula = GeneralizedLinearRegressor( + family="normal", + drop_first=False, + formula=formula, + categorical_format="{name}[T.{category}]", + interaction_separator=":", + alpha=1.0, + ).fit(data) + + np.testing.assert_array_equal(model_formula.feature_names_, feature_names) + np.testing.assert_array_equal(model_formula.term_names_, term_names) + + +@pytest.mark.parametrize( + "formula, feature_names, term_names", + [ + pytest.param("y ~ x1 + x2", ["x1", "x2"], ["x1", "x2"], id="numeric"), + pytest.param( + "y ~ c1", ["c1__a", "c1__b", "c1__c"], 3 * ["c1"], id="categorical" + ), + pytest.param( + "y ~ x1 : c1", + ["x1__x__c1__a", "x1__x__c1__b", "x1__x__c1__c"], + 3 * ["x1:c1"], + id="interaction", + ), + pytest.param( + "y ~ poly(x1, 3)", + ["poly(x1, 3)[1]", "poly(x1, 3)[2]", "poly(x1, 3)[3]"], + 3 * ["poly(x1, 3)"], + id="function", + ), + ], +) +def test_formula_names_old_glum_style( + get_mixed_data, formula, feature_names, term_names +): + data = get_mixed_data + model_formula = GeneralizedLinearRegressor( + family="normal", + drop_first=False, + formula=formula, + categorical_format="{name}__{category}", + interaction_separator="__x__", + alpha=1.0, + ).fit(data) + + np.testing.assert_array_equal(model_formula.feature_names_, feature_names) + np.testing.assert_array_equal(model_formula.term_names_, term_names) + + +@pytest.mark.parametrize( + "formula", + [ + pytest.param("y ~ x1 + x2", id="numeric"), + pytest.param("y ~ c1", id="categorical"), + pytest.param("y ~ c1 * c2", id="categorical_interaction"), + ], +) +@pytest.mark.parametrize( + "fit_intercept", [True, False], ids=["intercept", "no_intercept"] +) +def test_formula_against_smf(get_mixed_data, formula, fit_intercept): + data = get_mixed_data + model_formula = GeneralizedLinearRegressor( + family="normal", + drop_first=True, + formula=formula, + fit_intercept=fit_intercept, + ).fit(data) + + if fit_intercept: + beta_formula = np.concatenate([[model_formula.intercept_], model_formula.coef_]) + else: + beta_formula = model_formula.coef_ + + formula_smf = formula + "- 1" if not fit_intercept else formula + model_smf = smf.glm(formula_smf, data, family=sm.families.Gaussian()).fit() + + np.testing.assert_almost_equal(beta_formula, model_smf.params) + + +def test_formula_context(get_mixed_data): + data = get_mixed_data + x_context = np.arange(len(data), dtype=float) # noqa: F841 + formula = "y ~ x1 + x2 + x_context" + + model_formula = GeneralizedLinearRegressor( + family="normal", + drop_first=True, + formula=formula, + fit_intercept=True, + ) + # default is to add nothing to context + with pytest.raises(formulaic.errors.FactorEvaluationError): + model_formula.fit(data) + + # set context to 0 to capture calling scope + model_formula = GeneralizedLinearRegressor( + family="normal", + drop_first=True, + formula=formula, + fit_intercept=True, + ).fit(data, context=0) + + model_smf = smf.glm(formula, 
data, family=sm.families.Gaussian()).fit() + + np.testing.assert_almost_equal( + np.concatenate([[model_formula.intercept_], model_formula.coef_]), + model_smf.params, + ) + np.testing.assert_almost_equal( + model_formula.predict(data, context=0), model_smf.predict(data) + ) + + +@pytest.mark.parametrize( + "formula", + [ + pytest.param("y ~ x1 + x2", id="numeric"), + pytest.param("y ~ c1", id="categorical"), + pytest.param("y ~ c1 * c2", id="categorical_interaction"), + ], +) +@pytest.mark.parametrize( + "fit_intercept", [True, False], ids=["intercept", "no_intercept"] +) +def test_formula_predict(get_mixed_data, formula, fit_intercept): + data = get_mixed_data + data_unseen = data.copy() + data_unseen.loc[data_unseen["c1"] == "b", "c1"] = "c" + model_formula = GeneralizedLinearRegressor( + family="normal", + drop_first=True, + formula=formula, + fit_intercept=fit_intercept, + ).fit(data) + + formula_smf = formula + "- 1" if not fit_intercept else formula + model_smf = smf.glm(formula_smf, data, family=sm.families.Gaussian()).fit() + + yhat_formula = model_formula.predict(data_unseen) + yhat_smf = model_smf.predict(data_unseen) + + np.testing.assert_almost_equal(yhat_formula, yhat_smf) + + +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +@pytest.mark.parametrize("unseen_missing", [False, True]) +@pytest.mark.parametrize("formula", [None, "cat_1 + cat_2"]) +def test_cat_missing(cat_missing_method, unseen_missing, formula): + X = pd.DataFrame( + { + "cat_1": pd.Categorical([1, 2, pd.NA, 2, 1]), + "cat_2": pd.Categorical([1, 2, pd.NA, 1, 2]), + } + ) + if unseen_missing: + X = X.dropna() + X_unseen = pd.DataFrame( + { + "cat_1": pd.Categorical([1, pd.NA]), + "cat_2": pd.Categorical([1, 2]), + } + ) + y = np.array(X.index) + + model = GeneralizedLinearRegressor( + family="normal", + cat_missing_method=cat_missing_method, + drop_first=False, + formula=formula, + fit_intercept=False, + alpha=1.0, + ) + if cat_missing_method == "fail" and not unseen_missing: + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + model.fit(X, y) + else: + model.fit(X, y) + feature_names = ["cat_1[1]", "cat_1[2]", "cat_2[1]", "cat_2[2]"] + + if cat_missing_method == "convert" and not unseen_missing: + feature_names.insert(2, "cat_1[(MISSING)]") + feature_names.append("cat_2[(MISSING)]") + + np.testing.assert_array_equal(model.feature_names_, feature_names) + assert len(model.coef_) == len(feature_names) + + if cat_missing_method == "fail" and unseen_missing: + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + model.predict(X_unseen) + elif cat_missing_method == "convert" and unseen_missing: + with pytest.raises(ValueError, match="contains unseen categories"): + model.predict(X_unseen) + else: + model.predict(X_unseen) diff --git a/tests/glm/test_golden_master.py b/tests/glm/test_golden_master.py index ac00f91b2..4f70c6a7e 100644 --- a/tests/glm/test_golden_master.py +++ b/tests/glm/test_golden_master.py @@ -100,25 +100,33 @@ def expected_all(): gm_model_parameters = { - "default": {}, # default params - "half-regularization": {"alpha": 0.5}, # regularization (other than alpha = 1) - "elastic-net": {"l1_ratio": 0.5}, # elastic-net - "lasso": {"l1_ratio": 1}, # lasso + # TODO add an unregularized case + "regularization": {"alpha": 1.0}, # default prior to v3 + "half-regularization": {"alpha": 0.5}, # regularization with alpha = 0.5 + "elastic-net": {"l1_ratio": 0.5, "alpha": 1.0}, # elastic-net + "lasso": 
{"l1_ratio": 1, "alpha": 1.0}, # lasso "variable_p1": { "l1_ratio": 1, "P1": np.arange(30) / 10, + "alpha": 1.0, }, # lasso with variable penalty "variable_p2": { "l1_ratio": 0, "P2": _make_P2(), + "alpha": 1.0, }, # ridge with Tikhonov regularization "variable_p1_p2": { "l1_ratio": 0.5, "P1": np.arange(30) / 10, "P2": _make_P2(), + "alpha": 1.0, }, # elastic net with P1 and P2 variable penalty - "fit_intercept": {"fit_intercept": False}, # do not fit the intercept - "bounds": {"lower_bounds": np.full(30, 0), "upper_bounds": np.full(30, 0.4)}, + "fit_intercept": {"fit_intercept": False, "alpha": 1.0}, # do not fit the intercept + "bounds": { + "lower_bounds": np.full(30, 0), + "upper_bounds": np.full(30, 0.4), + "alpha": 1.0, + }, } @@ -207,13 +215,13 @@ def test_gm_storage(distribution, data_all_storage, expected_all): model = fit_model( data=data, family=distribution, - model_parameters={}, + model_parameters={"alpha": 1.0}, use_weights=False, use_offset=False, cv=False, ) - run_name = "default" + run_name = "regularization" expected = expected_all[distribution][run_name] assert_gm_allclose(model, expected) @@ -226,7 +234,7 @@ def test_gm_custom_link(family_link, use_weights, use_offset, data_all, expected """Currently only testing log-linear model.""" distribution, link = family_link data = data_all[distribution] - model_parameters = {"link": link} + model_parameters = {"link": link, "alpha": 1.0} model = fit_model( data=data, family=distribution, @@ -257,9 +265,7 @@ def test_gm_approx_hessian( distribution, use_weights, use_offset, data_all, expected_all ): data = data_all[distribution] - model_parameters = { - "hessian_approx": 0.1, - } + model_parameters = {"hessian_approx": 0.1, "alpha": 1.0} model = fit_model( data=data, family=distribution, @@ -269,7 +275,7 @@ def test_gm_approx_hessian( cv=False, ) - run_name = "default" + run_name = "regularization" if use_weights: run_name = f"{run_name}_weights" if use_offset: @@ -445,7 +451,7 @@ def run_and_store_golden_master( for use_offset in [True, False]: gm_dict = run_and_store_golden_master( distribution=dist, - model_parameters={"link": link}, + model_parameters={"link": link, "alpha": 1.0}, run_name=f"custom-{dist}-{link}", use_weights=use_weights, use_offset=use_offset, diff --git a/tests/glm/test_link.py b/tests/glm/test_link.py index 02052afbb..7c4095d5d 100644 --- a/tests/glm/test_link.py +++ b/tests/glm/test_link.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from glum._link import CloglogLink, Link, LogitLink, LogLink, TweedieLink +from glum._link import CloglogLink, IdentityLink, Link, LogitLink, LogLink, TweedieLink @pytest.mark.parametrize("link", Link.__subclasses__()) @@ -34,9 +34,13 @@ def test_link_properties(link): def test_equality(): - assert TweedieLink(1.5) == TweedieLink(1.5) - assert TweedieLink(1) != LogLink() + assert IdentityLink() == IdentityLink() + assert LogitLink() == LogitLink() assert LogLink() == LogLink() - assert TweedieLink(1.5) != TweedieLink(2.5) + assert TweedieLink(0) != IdentityLink() + assert TweedieLink(0) == IdentityLink().to_tweedie() assert TweedieLink(1.5) != LogitLink() - assert LogitLink() == LogitLink() + assert TweedieLink(1.5) != TweedieLink(2.5) + assert TweedieLink(1.5) == TweedieLink(1.5) + assert TweedieLink(1) != LogLink() + assert TweedieLink(1) == LogLink().to_tweedie() diff --git a/tests/glm/test_utils.py b/tests/glm/test_utils.py index 1960b8484..614717502 100644 --- a/tests/glm/test_utils.py +++ b/tests/glm/test_utils.py @@ -2,7 +2,7 @@ import pandas as pd import 
diff --git a/tests/glm/test_link.py b/tests/glm/test_link.py
index 02052afbb..7c4095d5d 100644
--- a/tests/glm/test_link.py
+++ b/tests/glm/test_link.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest

-from glum._link import CloglogLink, Link, LogitLink, LogLink, TweedieLink
+from glum._link import CloglogLink, IdentityLink, Link, LogitLink, LogLink, TweedieLink


 @pytest.mark.parametrize("link", Link.__subclasses__())
@@ -34,9 +34,13 @@ def test_link_properties(link):


 def test_equality():
-    assert TweedieLink(1.5) == TweedieLink(1.5)
-    assert TweedieLink(1) != LogLink()
+    assert IdentityLink() == IdentityLink()
+    assert LogitLink() == LogitLink()
     assert LogLink() == LogLink()
-    assert TweedieLink(1.5) != TweedieLink(2.5)
+    assert TweedieLink(0) != IdentityLink()
+    assert TweedieLink(0) == IdentityLink().to_tweedie()
     assert TweedieLink(1.5) != LogitLink()
-    assert LogitLink() == LogitLink()
+    assert TweedieLink(1.5) != TweedieLink(2.5)
+    assert TweedieLink(1.5) == TweedieLink(1.5)
+    assert TweedieLink(1) != LogLink()
+    assert TweedieLink(1) == LogLink().to_tweedie()
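The reworked `test_equality` spells out the intended semantics: links of different classes never compare equal, even when they are mathematically equivalent, and the explicit `to_tweedie()` conversion is the way to obtain a comparable `TweedieLink`. Restated outside the test harness (same objects as in the test, no new API assumed):

```python
from glum._link import IdentityLink, LogLink, TweedieLink

# Equality is class-aware: TweedieLink(0) is the identity link mathematically,
# but it is not the same type as IdentityLink(), so they compare unequal.
assert TweedieLink(0) != IdentityLink()

# Converting explicitly yields a TweedieLink that does compare equal.
assert IdentityLink().to_tweedie() == TweedieLink(0)
assert LogLink().to_tweedie() == TweedieLink(1)
```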
dtype=pd.CategoricalDtype(["a", "b"]), + ) + + if has_missings: + pd.testing.assert_frame_equal( + _align_df_categories( + df[["x5", "x6", "x7", "x8"]], + dtypes, + has_missing_category, + missing_method, + ), + expected, + ) + else: + with pytest.raises(ValueError, match="contains unseen categories"): + _align_df_categories( + df[["x5", "x6", "x7", "x8"]], + dtypes, + has_missing_category, + missing_method, + ) + + +def test_align_df_categories_raise_on_unseen(df): + dtypes = {column: pd.CategoricalDtype(["a", "b"]) for column in df} + has_missing_category = {column: False for column in df} + missing_method = "fail" + + with pytest.raises(ValueError, match="contains unseen categories"): + _align_df_categories( + df, + dtypes, + has_missing_category, + missing_method, + ) def test_align_df_categories_not_df(): with pytest.raises(TypeError): - _align_df_categories(np.array([[0], [1]]), {"x0": np.float64}) + _align_df_categories(np.array([[0], [1]]), {"x0": np.float64}, {}, "fail") + + +@pytest.fixture() +def df_na(): + return pd.DataFrame( + { + "num": np.array([0, 1], dtype="float64"), + "cat": pd.Categorical(["a", "b"]), + "cat_na": pd.Categorical(["a", pd.NA]), + "cat2": pd.Categorical(["a", "b"]), + } + ) + + +def test_add_missing_categories(df_na): + categorical_format = "{name}[{category}]" + cat_missing_name = "(M)" + dtypes = df_na.dtypes + feature_names = [ + "num", + "num[(M)]", + "cat[a]", + "cat[b]", + "cat[(M)]", + "cat_na[a]", + "cat_na[(M)]", + "cat2[a]", + "cat2[b]", + ] + + expected = pd.DataFrame( + { + "num": np.array([0, 1], dtype="float64"), + "cat": pd.Categorical(["a", "b"], categories=["a", "b", "(M)"]), + "cat_na": pd.Categorical(["a", "(M)"], categories=["a", "(M)"]), + "cat2": pd.Categorical(["a", "b"], categories=["a", "b"]), + } + ) + + pd.testing.assert_frame_equal( + _add_missing_categories( + df=df_na, + dtypes=dtypes, + feature_names=feature_names, + categorical_format=categorical_format, + cat_missing_name=cat_missing_name, + ), + expected, + ) + + +def test_raise_on_existing_missing(df_na): + categorical_format = "{name}[{category}]" + cat_missing_name = "(M)" + dtypes = df_na.dtypes + feature_names = [ + "num", + "num[(M)]", + "cat[a]", + "cat[b]", + "cat[(M)]", + "cat_na[a]", + "cat_na[(M)]", + "cat2[a]", + "cat2[b]", + ] + + df = df_na + df["cat_na"] = df["cat_na"].cat.add_categories("(M)") + df.loc[df.cat_na.isna(), "cat_na"] = "(M)" + + with pytest.raises(ValueError): + _add_missing_categories( + df=df, + dtypes=dtypes, + feature_names=feature_names, + categorical_format=categorical_format, + cat_missing_name=cat_missing_name, + )