Merge branch 'glum-v3' into convert-nas-unseen

MatthiasSchmidtblaicherQC · MatthiasSchmidtblaicherQC · commit b8557583bdd5 · 2024-01-15T09:24:55.000+01:00
diff --git a/README.md b/README.md
@@ -68,7 +68,7 @@ Why did we choose the name `glum`? We wanted a name that had the letters GLM and
 >>>
 >>> _ = model.fit(X=X, y=y)
 >>>
->>> # .report_diagnostics shows details about the steps taken by the iterative solver
+>>> # .get_formatted_diagnostics shows details about the steps taken by the iterative solver.
 >>> diags = model.get_formatted_diagnostics(full_report=True)
 >>> diags[['objective_fct']]
         objective_fct
@@ -79,6 +79,15 @@ n_iter
 3            0.443681
 4            0.443498
 5            0.443497
+>>>
+>>> # Models can also be built with formulas from formulaic.
+>>> model_formula = GeneralizedLinearRegressor(
+...     family='binomial',
+...     l1_ratio=1.0,
+...     alpha=0.001,
+...     formula="bedrooms + np.log(bathrooms + 1) + bs(sqft_living, 3) + C(waterfront)"
+... )
+>>> _ = model_formula.fit(X=house_data.data, y=y)
 
 ```
 
diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml
@@ -35,7 +35,7 @@ requirements:
     - pandas
     - scikit-learn >=0.23
     - scipy
-    - formulaic >=0.4
+    - formulaic >=0.6
     - tabmat >=4.0.0a
 
 test:
diff --git a/docs/tutorials/formula_interface/formula_interface.ipynb b/docs/tutorials/formula_interface/formula_interface.ipynb
@@ -23,7 +23,7 @@
     "Formulas can provide a concise and convenient way to specify many of the usual pre-processing steps, such as converting to categorical types, creating interactions, applying transformations, or even spline interpolation. As an example, consider the following formula:\n",
     "\n",
     "```\n",
-    "{ClaimAmountCut / Exposure} ~ C(DrivAge, missing_method='convert') * C(VehPower, missing_method=\"zero\") + bs(BonusMalus, 3) + 1\n",
+    "{ClaimAmountCut / Exposure} ~ C(DrivAge, missing_method='convert') * C(VehPower, missing_method=\"zero\") + bs(BonusMalus, 3)\n",
     "```\n",
     "\n",
     "Despite its brevity, it describes all of the following:\n",
@@ -32,7 +32,6 @@
     " - If there are missing values in `DrivAge`, they should be treated as a separate category.\n",
     " - On the other hand, missing values in `VehPower` should be treated as all-zero indicators.\n",
     " - The predictors should also include a third degree B-spline interpolation of `BonusMalus`.\n",
-    " - The model should include an intercept.\n",
     "\n",
     "The following chapters demonstrate each of these features in some detail, as well as some additional advantages of using the formula interface."
    ]
@@ -59,6 +58,7 @@
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
     "import pandas as pd\n",
+    "import pytest\n",
     "import scipy.optimize as optimize\n",
     "import scipy.stats\n",
     "from dask_ml.preprocessing import Categorizer\n",
@@ -1261,144 +1261,27 @@
    "source": [
     "### Intercept Term\n",
     "\n",
-    "Just like in the case of the non-formula interface, an intercept term is added by default. This can be disabled by either setting the `fit_intercept` parameter to `False`, or adding `+0` or `-1` to the end of the formula. In the case of conflict, a warning is emitted, and the latter takes precedence."
+    "Just like in the case of the non-formula interface, the presence of an intercept is determined by the `fit_intercept` argument. If the formula specifies a different behavior (e.g., adding `+0` or `-1` while `fit_intercept=True`), an error is raised."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/stanmart/work/glum/src/glum/_glm.py:2354: UserWarning: The formula explicitly sets the intercept to False, overriding fit_intercept=True.\n",
-      "  warnings.warn(\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>intercept</th>\n",
-       "      <th>DrivAge__0</th>\n",
-       "      <th>DrivAge__1</th>\n",
-       "      <th>DrivAge__2</th>\n",
-       "      <th>DrivAge__3</th>\n",
-       "      <th>DrivAge__4</th>\n",
-       "      <th>DrivAge__5</th>\n",
-       "      <th>DrivAge__6</th>\n",
-       "      <th>VehPower__4</th>\n",
-       "      <th>VehPower__5</th>\n",
-       "      <th>...</th>\n",
-       "      <th>DrivAge__4__x__VehPower__8</th>\n",
-       "      <th>DrivAge__5__x__VehPower__8</th>\n",
-       "      <th>DrivAge__6__x__VehPower__8</th>\n",
-       "      <th>DrivAge__0__x__VehPower__9</th>\n",
-       "      <th>DrivAge__1__x__VehPower__9</th>\n",
-       "      <th>DrivAge__2__x__VehPower__9</th>\n",
-       "      <th>DrivAge__3__x__VehPower__9</th>\n",
-       "      <th>DrivAge__4__x__VehPower__9</th>\n",
-       "      <th>DrivAge__5__x__VehPower__9</th>\n",
-       "      <th>DrivAge__6__x__VehPower__9</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>coefficient</th>\n",
-       "      <td>0.0</td>\n",
-       "      <td>1.713298</td>\n",
-       "      <td>0.783505</td>\n",
-       "      <td>0.205914</td>\n",
-       "      <td>0.016085</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.000094</td>\n",
-       "      <td>0.223685</td>\n",
-       "      <td>4.66123</td>\n",
-       "      <td>4.736272</td>\n",
-       "      <td>...</td>\n",
-       "      <td>-0.144927</td>\n",
-       "      <td>0.001657</td>\n",
-       "      <td>0.515373</td>\n",
-       "      <td>0.714834</td>\n",
-       "      <td>-0.325666</td>\n",
-       "      <td>-0.370935</td>\n",
-       "      <td>0.20417</td>\n",
-       "      <td>0.013222</td>\n",
-       "      <td>-0.273913</td>\n",
-       "      <td>0.115693</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>1 rows × 56 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "             intercept  DrivAge__0  DrivAge__1  DrivAge__2  DrivAge__3  \\\n",
-       "coefficient        0.0    1.713298    0.783505    0.205914    0.016085   \n",
-       "\n",
-       "             DrivAge__4  DrivAge__5  DrivAge__6  VehPower__4  VehPower__5  \\\n",
-       "coefficient         0.0    0.000094    0.223685      4.66123     4.736272   \n",
-       "\n",
-       "             ...  DrivAge__4__x__VehPower__8  DrivAge__5__x__VehPower__8  \\\n",
-       "coefficient  ...                   -0.144927                    0.001657   \n",
-       "\n",
-       "             DrivAge__6__x__VehPower__8  DrivAge__0__x__VehPower__9  \\\n",
-       "coefficient                    0.515373                    0.714834   \n",
-       "\n",
-       "             DrivAge__1__x__VehPower__9  DrivAge__2__x__VehPower__9  \\\n",
-       "coefficient                   -0.325666                   -0.370935   \n",
-       "\n",
-       "             DrivAge__3__x__VehPower__9  DrivAge__4__x__VehPower__9  \\\n",
-       "coefficient                     0.20417                    0.013222   \n",
-       "\n",
-       "             DrivAge__5__x__VehPower__9  DrivAge__6__x__VehPower__9  \n",
-       "coefficient                   -0.273913                    0.115693  \n",
-       "\n",
-       "[1 rows x 56 columns]"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "formula_noint = \"PurePremium ~ DrivAge * VehPower - 1\"\n",
     "\n",
-    "t_glm6 = GeneralizedLinearRegressor(\n",
-    "    family=TweedieDist,\n",
-    "    alpha_search=True,\n",
-    "    l1_ratio=1,\n",
-    "    fit_intercept=True,\n",
-    "    formula=formula_noint,\n",
-    "    interaction_separator=\"__x__\",\n",
-    "    categorical_format=\"{name}__{category}\",\n",
-    ")\n",
-    "t_glm6.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n",
-    "\n",
-    "pd.DataFrame(\n",
-    "    {\"coefficient\": np.concatenate(([t_glm6.intercept_], t_glm6.coef_))},\n",
-    "    index=[\"intercept\"] + t_glm6.feature_names_,\n",
-    ").T"
+    "with pytest.raises(ValueError, match=\"The formula sets the intercept to False\"):\n",
+    "    t_glm6 = GeneralizedLinearRegressor(\n",
+    "        family=TweedieDist,\n",
+    "        alpha_search=True,\n",
+    "        l1_ratio=1,\n",
+    "        fit_intercept=True,\n",
+    "        formula=formula_noint,\n",
+    "        interaction_separator=\"__x__\",\n",
+    "        categorical_format=\"{name}__{category}\",\n",
+    "    ).fit(df_train, sample_weight=df[\"Exposure\"].values[train])  # the intercept conflict is only detected in fit()"
    ]
   },
   {
diff --git a/setup.py b/setup.py
@@ -86,7 +86,7 @@
         "pandas",
         "scikit-learn>=0.23",
         "scipy",
-        "formulaic>=0.4",
+        "formulaic>=0.6",
         "tabmat>=4.0.0a",
     ],
     entry_points=None
diff --git a/src/glum/_distribution.py b/src/glum/_distribution.py
@@ -1355,7 +1355,7 @@ def guess_intercept(
             second = np.log((mu ** (2 - p)).dot(sample_weight))
         return first - second
     elif isinstance(link, LogitLink):
-        log_odds = np.log(avg_y) - np.log(np.average(1 - y, weights=sample_weight))
+        log_odds = np.log(avg_y) - np.log(1 - avg_y)
         if eta is None:
             return log_odds
         avg_eta = eta if np.isscalar(eta) else np.average(eta, weights=sample_weight)
diff --git a/src/glum/_glm.py b/src/glum/_glm.py
@@ -244,8 +244,7 @@ def _parse_formula(
     formula : FormulaSpec
         The formula to parse.
     include_intercept: bool, default True
-        Whether to include an intercept column if the formula does not
-        include (``+ 1``) or exclude (``+ 0`` or ``- 1``) it explicitly.
+        Whether to include an intercept column.
 
     Returns
     -------
@@ -2683,11 +2682,11 @@ def _set_up_and_check_fit_args(
 
                 intercept = "1" in X.model_spec.terms
                 if intercept != self.fit_intercept:
-                    warnings.warn(
-                        f"The formula explicitly sets the intercept to {intercept}, "
-                        f"overriding fit_intercept={self.fit_intercept}."
+                    raise ValueError(
+                        f"The formula sets the intercept to {intercept}, "
+                        f"contradicting fit_intercept={self.fit_intercept}. "
+                        "You should use fit_intercept to specify the intercept."
                     )
-                    self.fit_intercept = intercept
 
                 self.X_model_spec_ = X.model_spec
 
@@ -3114,6 +3113,7 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
     expected_information : bool, optional (default = False)
         If true, then the expected information matrix is computed by default.
         Only relevant when computing robust standard errors.
+
     formula : FormulaSpec
         A formula accepted by formulaic. It can either be a one-sided formula, in
         which case ``y`` must be specified in ``fit``, or a two-sided formula, in
@@ -3140,6 +3140,7 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
         - if 'zero', missing values will represent all-zero indicator columns.
         - if 'convert', missing values will be converted to the ``cat_missing_name``
           category.
+
     cat_missing_name: str, default='(MISSING)'
         Name of the category to which missing values will be converted if
         ``cat_missing_method='convert'``.  Only used if ``X`` is a pandas data frame.
diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py