From c2d88b23bbace810ce333f45ba0d3e1647da3cfd Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Mon, 15 Jan 2024 09:51:48 +0100 Subject: [PATCH] describe more restrictive behavior in tutorial --- .../formula_interface/formula_interface.ipynb | 137 +----------------- 1 file changed, 2 insertions(+), 135 deletions(-) diff --git a/docs/tutorials/formula_interface/formula_interface.ipynb b/docs/tutorials/formula_interface/formula_interface.ipynb index acdf50ea..d396adc0 100644 --- a/docs/tutorials/formula_interface/formula_interface.ipynb +++ b/docs/tutorials/formula_interface/formula_interface.ipynb @@ -1430,140 +1430,7 @@ "source": [ "### Missing Values in Categorical Columns\n", "\n", - "By default, `glum` raises a `ValueError` when it encounters a missing value in a categorical variable (`\"raise\"` option). However, there are two other options for handling these cases. They can also be treated as if they represented all-zeros indicators (`\"zero\"` option, which is also the way `pandas.get_dummies` works) or missing values can be treated as their own separate category (`\"convert\"` option).\n", - "\n", - "Similarly to the non-formula-based interface, `glum`'s behavior can be set globally using the `cat_missing_method` parameter during model initialization. However, formulas provide some additional flexibility: the `C` function has a `missing_method` parameter, with which users can select an option on a column-by-column basis. Here is an example of doing that (although our dataset does not have any missing values, so these options have no actual effect in this case):" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
interceptC(DrivAge, missing_method='zero')[0]C(DrivAge, missing_method='zero')[1]C(DrivAge, missing_method='zero')[2]C(DrivAge, missing_method='zero')[3]C(DrivAge, missing_method='zero')[4]C(DrivAge, missing_method='zero')[5]C(DrivAge, missing_method='zero')[6]C(VehPower, missing_method='convert')[4]C(VehPower, missing_method='convert')[5]C(VehPower, missing_method='convert')[6]C(VehPower, missing_method='convert')[7]C(VehPower, missing_method='convert')[8]C(VehPower, missing_method='convert')[9]
coefficient0.01.7867030.7427650.2395280.0965310.0711180.00.2010784.6372674.6793914.8633874.772634.7496734.970188
\n", - "
" - ], - "text/plain": [ - " intercept C(DrivAge, missing_method='zero')[0] \\\n", - "coefficient 0.0 1.786703 \n", - "\n", - " C(DrivAge, missing_method='zero')[1] \\\n", - "coefficient 0.742765 \n", - "\n", - " C(DrivAge, missing_method='zero')[2] \\\n", - "coefficient 0.239528 \n", - "\n", - " C(DrivAge, missing_method='zero')[3] \\\n", - "coefficient 0.096531 \n", - "\n", - " C(DrivAge, missing_method='zero')[4] \\\n", - "coefficient 0.071118 \n", - "\n", - " C(DrivAge, missing_method='zero')[5] \\\n", - "coefficient 0.0 \n", - "\n", - " C(DrivAge, missing_method='zero')[6] \\\n", - "coefficient 0.201078 \n", - "\n", - " C(VehPower, missing_method='convert')[4] \\\n", - "coefficient 4.637267 \n", - "\n", - " C(VehPower, missing_method='convert')[5] \\\n", - "coefficient 4.679391 \n", - "\n", - " C(VehPower, missing_method='convert')[6] \\\n", - "coefficient 4.863387 \n", - "\n", - " C(VehPower, missing_method='convert')[7] \\\n", - "coefficient 4.77263 \n", - "\n", - " C(VehPower, missing_method='convert')[8] \\\n", - "coefficient 4.749673 \n", - "\n", - " C(VehPower, missing_method='convert')[9] \n", - "coefficient 4.970188 " - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "formula_missing = \"C(DrivAge, missing_method='zero') + C(VehPower, missing_method='convert')\"\n", - "\n", - "t_glm8 = GeneralizedLinearRegressor(\n", - " family=TweedieDist,\n", - " alpha_search=True,\n", - " l1_ratio=1,\n", - " fit_intercept=False,\n", - " formula=formula_missing,\n", - "\n", - ")\n", - "t_glm8.fit(\n", - " X=df_train, y=df_train[\"PurePremium\"], sample_weight=df[\"Exposure\"].values[train]\n", - ")\n", - "\n", - "pd.DataFrame(\n", - " {\"coefficient\": np.concatenate(([t_glm8.intercept_], t_glm8.coef_))},\n", - " index=[\"intercept\"] + t_glm8.feature_names_,\n", - ").T" + "By default, `glum` raises a `ValueError` when it encounters a missing value in a categorical variable (`\"raise\"` option). 
However, there are two other options for handling these cases. Missing values can either be treated as if they represented all-zeros indicators (`\"zero\"` option, which is also the way `pandas.get_dummies` works) or be treated as their own separate category (`\"convert\"` option). The treatment of missing values should be set globally via the `cat_missing_method` parameter during model initialization. The `\"convert\"` option is only valid for the interface without a formula." ] } ], @@ -1583,7 +1450,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.1" }, "orig_nbformat": 4 },