From c2d88b23bbace810ce333f45ba0d3e1647da3cfd Mon Sep 17 00:00:00 2001
From: Matthias Schmidtblaicher <matthias.schmidtblaicher@quantco.com>
Date: Mon, 15 Jan 2024 09:51:48 +0100
Subject: [PATCH] describe more restrictive behavior in tutorial

---
 .../formula_interface/formula_interface.ipynb | 137 +-----------------
 1 file changed, 2 insertions(+), 135 deletions(-)
diff --git a/docs/tutorials/formula_interface/formula_interface.ipynb b/docs/tutorials/formula_interface/formula_interface.ipynb
index acdf50ea..d396adc0 100644
--- a/docs/tutorials/formula_interface/formula_interface.ipynb
+++ b/docs/tutorials/formula_interface/formula_interface.ipynb
@@ -1430,140 +1430,7 @@
    "source": [
     "### Missing Values in Categorical Columns\n",
     "\n",
-    "By default, `glum` raises a `ValueError` when it encounters a missing value in a categorical variable (`\"raise\"` option). However, there are two other options for handling these cases. They can also be treated as if they represented all-zeros indicators (`\"zero\"` option, which is also the way `pandas.get_dummies` works) or missing values can be treated as their own separate category (`\"convert\"` option).\n",
-    "\n",
-    "Similarly to the non-formula-based interface, `glum`'s behavior can be set globally using the `cat_missing_method` parameter during model initialization. However, formulas provide some additional flexibility: the `C` function has a `missing_method` parameter, with which users can select an option on a column-by-column basis. Here is an example of doing that (although our dataset does not have any missing values, so these options have no actual effect in this case):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>intercept</th>\n",
-       "      <th>C(DrivAge, missing_method='zero')[0]</th>\n",
-       "      <th>C(DrivAge, missing_method='zero')[1]</th>\n",
-       "      <th>C(DrivAge, missing_method='zero')[2]</th>\n",
-       "      <th>C(DrivAge, missing_method='zero')[3]</th>\n",
-       "      <th>C(DrivAge, missing_method='zero')[4]</th>\n",
-       "      <th>C(DrivAge, missing_method='zero')[5]</th>\n",
-       "      <th>C(DrivAge, missing_method='zero')[6]</th>\n",
-       "      <th>C(VehPower, missing_method='convert')[4]</th>\n",
-       "      <th>C(VehPower, missing_method='convert')[5]</th>\n",
-       "      <th>C(VehPower, missing_method='convert')[6]</th>\n",
-       "      <th>C(VehPower, missing_method='convert')[7]</th>\n",
-       "      <th>C(VehPower, missing_method='convert')[8]</th>\n",
-       "      <th>C(VehPower, missing_method='convert')[9]</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>coefficient</th>\n",
-       "      <td>0.0</td>\n",
-       "      <td>1.786703</td>\n",
-       "      <td>0.742765</td>\n",
-       "      <td>0.239528</td>\n",
-       "      <td>0.096531</td>\n",
-       "      <td>0.071118</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.201078</td>\n",
-       "      <td>4.637267</td>\n",
-       "      <td>4.679391</td>\n",
-       "      <td>4.863387</td>\n",
-       "      <td>4.77263</td>\n",
-       "      <td>4.749673</td>\n",
-       "      <td>4.970188</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "             intercept  C(DrivAge, missing_method='zero')[0]  \\\n",
-       "coefficient        0.0                              1.786703   \n",
-       "\n",
-       "             C(DrivAge, missing_method='zero')[1]  \\\n",
-       "coefficient                              0.742765   \n",
-       "\n",
-       "             C(DrivAge, missing_method='zero')[2]  \\\n",
-       "coefficient                              0.239528   \n",
-       "\n",
-       "             C(DrivAge, missing_method='zero')[3]  \\\n",
-       "coefficient                              0.096531   \n",
-       "\n",
-       "             C(DrivAge, missing_method='zero')[4]  \\\n",
-       "coefficient                              0.071118   \n",
-       "\n",
-       "             C(DrivAge, missing_method='zero')[5]  \\\n",
-       "coefficient                                   0.0   \n",
-       "\n",
-       "             C(DrivAge, missing_method='zero')[6]  \\\n",
-       "coefficient                              0.201078   \n",
-       "\n",
-       "             C(VehPower, missing_method='convert')[4]  \\\n",
-       "coefficient                                  4.637267   \n",
-       "\n",
-       "             C(VehPower, missing_method='convert')[5]  \\\n",
-       "coefficient                                  4.679391   \n",
-       "\n",
-       "             C(VehPower, missing_method='convert')[6]  \\\n",
-       "coefficient                                  4.863387   \n",
-       "\n",
-       "             C(VehPower, missing_method='convert')[7]  \\\n",
-       "coefficient                                   4.77263   \n",
-       "\n",
-       "             C(VehPower, missing_method='convert')[8]  \\\n",
-       "coefficient                                  4.749673   \n",
-       "\n",
-       "             C(VehPower, missing_method='convert')[9]  \n",
-       "coefficient                                  4.970188  "
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "formula_missing = \"C(DrivAge, missing_method='zero') + C(VehPower, missing_method='convert')\"\n",
-    "\n",
-    "t_glm8 = GeneralizedLinearRegressor(\n",
-    "    family=TweedieDist,\n",
-    "    alpha_search=True,\n",
-    "    l1_ratio=1,\n",
-    "    fit_intercept=False,\n",
-    "    formula=formula_missing,\n",
-    "\n",
-    ")\n",
-    "t_glm8.fit(\n",
-    "    X=df_train, y=df_train[\"PurePremium\"], sample_weight=df[\"Exposure\"].values[train]\n",
-    ")\n",
-    "\n",
-    "pd.DataFrame(\n",
-    "    {\"coefficient\": np.concatenate(([t_glm8.intercept_], t_glm8.coef_))},\n",
-    "    index=[\"intercept\"] + t_glm8.feature_names_,\n",
-    ").T"
+    "By default, `glum` raises a `ValueError` when it encounters a missing value in a categorical variable (`\"raise\"` option). However, there are two other options for handling these cases: missing values can be treated as if they represented all-zeros indicators (`\"zero\"` option, which is also the way `pandas.get_dummies` works), or they can be treated as their own separate category (`\"convert\"` option). The treatment of missing values should be set globally using the `cat_missing_method` parameter during model initialization. Note that the `\"convert\"` option is only valid for the interface without a formula."
    ]
   }
  ],
@@ -1583,7 +1450,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.12.1"
   },
   "orig_nbformat": 4
  },