
Commit d5c540a (parent: 030f49d)

Pushing the docs to dev/ for branch: main, commit a3abdbb35d0429ac7f32d6eac4fe0b7e2447c65e

File tree: 1,563 files changed (+6205 / -6070 lines)
Binary files not shown.

dev/_downloads/138e7c706c17949c3098ff8074b03ce7/plot_release_highlights_1_2_0.py

Lines changed: 5 additions & 1 deletion
@@ -42,7 +42,11 @@
 preprocessor = ColumnTransformer(
     [
         ("scaler", StandardScaler(), sepal_cols),
-        ("kbin", KBinsDiscretizer(encode="ordinal"), petal_cols),
+        (
+            "kbin",
+            KBinsDiscretizer(encode="ordinal", quantile_method="averaged_inverted_cdf"),
+            petal_cols,
+        ),
     ],
     verbose_feature_names_out=False,
 ).set_output(transform="pandas")
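This hunk updates the ColumnTransformer from the 1.2 release-highlights example so that KBinsDiscretizer names its quantile estimator explicitly. A minimal self-contained sketch of the updated preprocessor, assuming the iris setup that surrounds this hunk in the full example (sepal_cols and petal_cols holding the iris feature names) and a scikit-learn version that accepts the quantile_method parameter:

from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler

# Assumed context from the full example: iris loaded as a DataFrame, with the
# feature names split into sepal and petal column groups.
X, y = load_iris(as_frame=True, return_X_y=True)
sepal_cols = ["sepal length (cm)", "sepal width (cm)"]
petal_cols = ["petal length (cm)", "petal width (cm)"]

preprocessor = ColumnTransformer(
    [
        ("scaler", StandardScaler(), sepal_cols),
        (
            "kbin",
            # Spelling out quantile_method pins how the bin edges are
            # estimated instead of relying on the library default.
            KBinsDiscretizer(encode="ordinal", quantile_method="averaged_inverted_cdf"),
            petal_cols,
        ),
    ],
    verbose_feature_names_out=False,
).set_output(transform="pandas")

print(preprocessor.fit_transform(X).head())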
Binary files not shown.

dev/_downloads/2e4791a177381a6102b21e44083615c8/plot_poisson_regression_non_normal_loss.ipynb

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import (\n    FunctionTransformer,\n    KBinsDiscretizer,\n    OneHotEncoder,\n    StandardScaler,\n)\n\nlog_scale_transformer = make_pipeline(\n    FunctionTransformer(np.log, validate=False), StandardScaler()\n)\n\nlinear_model_preprocessor = ColumnTransformer(\n    [\n        (\"passthrough_numeric\", \"passthrough\", [\"BonusMalus\"]),\n        (\n            \"binned_numeric\",\n            KBinsDiscretizer(n_bins=10, random_state=0),\n            [\"VehAge\", \"DrivAge\"],\n        ),\n        (\"log_scaled_numeric\", log_scale_transformer, [\"Density\"]),\n        (\n            \"onehot_categorical\",\n            OneHotEncoder(),\n            [\"VehBrand\", \"VehPower\", \"VehGas\", \"Region\", \"Area\"],\n        ),\n    ],\n    remainder=\"drop\",\n)"
+"from sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import (\n    FunctionTransformer,\n    KBinsDiscretizer,\n    OneHotEncoder,\n    StandardScaler,\n)\n\nlog_scale_transformer = make_pipeline(\n    FunctionTransformer(np.log, validate=False), StandardScaler()\n)\n\nlinear_model_preprocessor = ColumnTransformer(\n    [\n        (\"passthrough_numeric\", \"passthrough\", [\"BonusMalus\"]),\n        (\n            \"binned_numeric\",\n            KBinsDiscretizer(\n                n_bins=10, quantile_method=\"averaged_inverted_cdf\", random_state=0\n            ),\n            [\"VehAge\", \"DrivAge\"],\n        ),\n        (\"log_scaled_numeric\", log_scale_transformer, [\"Density\"]),\n        (\n            \"onehot_categorical\",\n            OneHotEncoder(),\n            [\"VehBrand\", \"VehPower\", \"VehGas\", \"Region\", \"Area\"],\n        ),\n    ],\n    remainder=\"drop\",\n)"
 ]
 },
 {
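The only change in the cell source is the KBinsDiscretizer call gaining quantile_method="averaged_inverted_cdf". A short sketch of the likely motivation, under the assumption that on recent scikit-learn versions leaving quantile_method unset with the default strategy="quantile" emits a FutureWarning about the default changing:

import warnings

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.random.RandomState(0).rand(100, 1)

# Assumption: with quantile_method unset, versions in the deprecation cycle
# warn that the default quantile estimator will change.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    KBinsDiscretizer(n_bins=10, encode="ordinal").fit(X)
print([str(w.message) for w in caught])

# Passing the future default explicitly keeps the fit silent and makes the
# bin edges independent of a later change to the library default.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    KBinsDiscretizer(
        n_bins=10, encode="ordinal", quantile_method="averaged_inverted_cdf"
    ).fit(X)
print([str(w.message) for w in caught])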
Binary files not shown.

dev/_downloads/43e84df0b93ff974da370e8da900f2ee/plot_discretization_strategies.py

Lines changed: 6 additions & 1 deletion
@@ -76,7 +76,12 @@
     i += 1
     # transform the dataset with KBinsDiscretizer
     for strategy in strategies:
-        enc = KBinsDiscretizer(n_bins=4, encode="ordinal", strategy=strategy)
+        enc = KBinsDiscretizer(
+            n_bins=4,
+            encode="ordinal",
+            quantile_method="averaged_inverted_cdf",
+            strategy=strategy,
+        )
         enc.fit(X)
         grid_encoded = enc.transform(grid)
 
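As the loop suggests, passing quantile_method alongside every strategy is accepted, even though it only affects how bin edges are computed when strategy="quantile". A minimal sketch comparing the three strategies on a skewed toy feature (the data here is made up):

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

rng = np.random.RandomState(42)
X = rng.exponential(size=(200, 1))  # skewed, so the strategies differ visibly

for strategy in ("uniform", "quantile", "kmeans"):
    enc = KBinsDiscretizer(
        n_bins=4,
        encode="ordinal",
        quantile_method="averaged_inverted_cdf",
        strategy=strategy,
    )
    enc.fit(X)
    # bin_edges_ holds one array of edges per input feature
    print(strategy, np.round(enc.bin_edges_[0], 2))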

Binary files not shown.

dev/_downloads/7341736ba71d0e04b4b71061cfe9b78e/plot_discretization.py

Lines changed: 3 additions & 1 deletion
@@ -44,7 +44,9 @@
 X = X.reshape(-1, 1)
 
 # transform the dataset with KBinsDiscretizer
-enc = KBinsDiscretizer(n_bins=10, encode="onehot")
+enc = KBinsDiscretizer(
+    n_bins=10, encode="onehot", quantile_method="averaged_inverted_cdf"
+)
 X_binned = enc.fit_transform(X)
 
 # predict with original dataset
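With encode="onehot", the discretizer emits one sparse indicator column per bin, which is what lets the linear model later in this example fit a piecewise-constant function of the input. A quick sketch of the shape change on made-up data:

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

rng = np.random.RandomState(0)
X = rng.uniform(-3, 3, size=100).reshape(-1, 1)  # illustrative 1-D feature

enc = KBinsDiscretizer(
    n_bins=10, encode="onehot", quantile_method="averaged_inverted_cdf"
)
X_binned = enc.fit_transform(X)

# one input column becomes n_bins indicator columns, stored sparsely
print(X.shape, "->", X_binned.shape)  # (100, 1) -> (100, 10)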
Binary file not shown.

dev/_downloads/74caedf3eb449b80f3f00e66c1c576bd/plot_discretization_classification.py

Lines changed: 6 additions & 2 deletions
@@ -72,7 +72,9 @@ def get_name(estimator):
     (
         make_pipeline(
             StandardScaler(),
-            KBinsDiscretizer(encode="onehot", random_state=0),
+            KBinsDiscretizer(
+                encode="onehot", quantile_method="averaged_inverted_cdf", random_state=0
+            ),
             LogisticRegression(random_state=0),
         ),
         {
@@ -83,7 +85,9 @@ def get_name(estimator):
     (
         make_pipeline(
             StandardScaler(),
-            KBinsDiscretizer(encode="onehot", random_state=0),
+            KBinsDiscretizer(
+                encode="onehot", quantile_method="averaged_inverted_cdf", random_state=0
+            ),
             LinearSVC(random_state=0),
         ),
         {
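Both hunks update the same scale-bin-classify pattern: standardize, discretize into one-hot indicators, then fit a linear classifier on the indicators. A runnable sketch of the pattern, using a synthetic dataset and split chosen here purely for illustration:

from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler

X, y = make_moons(n_samples=400, noise=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = make_pipeline(
    StandardScaler(),
    KBinsDiscretizer(
        encode="onehot", quantile_method="averaged_inverted_cdf", random_state=0
    ),
    LogisticRegression(random_state=0),
)
clf.fit(X_train, y_train)
print(f"test accuracy: {clf.score(X_test, y_test):.2f}")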
Binary files not shown.

dev/_downloads/86c888008757148890daaf43d664fa71/plot_tweedie_regression_insurance_claims.py

Lines changed: 4 additions & 3 deletions
@@ -239,7 +239,9 @@ def score_estimator(
     [
         (
             "binned_numeric",
-            KBinsDiscretizer(n_bins=10, random_state=0),
+            KBinsDiscretizer(
+                n_bins=10, quantile_method="averaged_inverted_cdf", random_state=0
+            ),
             ["VehAge", "DrivAge"],
         ),
         (
@@ -689,8 +691,7 @@ def lorenz_curve(y_true, y_pred, exposure):
 ax.set(
     title="Lorenz Curves",
     xlabel=(
-        "Cumulative proportion of exposure\n"
-        "(ordered by model from safest to riskiest)"
+        "Cumulative proportion of exposure\n(ordered by model from safest to riskiest)"
     ),
     ylabel="Cumulative proportion of claim amounts",
 )
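The second hunk only merges two adjacent string literals; the rendered axis label is identical. Since this example scores models by a Gini index derived from the Lorenz curve (gini = 1 - 2 * auc(cum_exposure, cum_claims), as in the notebook below), here is a self-contained sketch of that computation on made-up data:

import numpy as np
from sklearn.metrics import auc

rng = np.random.RandomState(0)
# made-up pure premiums, noisy predictions, and exposures
y_true = rng.gamma(shape=1.0, scale=100.0, size=1000)
y_pred = y_true * rng.lognormal(sigma=0.5, size=1000)
exposure = rng.uniform(0.1, 1.0, size=1000)

# order samples by increasing predicted risk, then accumulate
ranking = np.argsort(y_pred)
cum_claims = np.cumsum(y_true[ranking] * exposure[ranking])
cum_claims /= cum_claims[-1]
cum_exposure = np.cumsum(exposure[ranking])
cum_exposure /= cum_exposure[-1]

# the Lorenz curve of a risk-ranking model lies below the diagonal; the
# Gini index rescales the area between the curve and the diagonal
gini = 1 - 2 * auc(cum_exposure, cum_claims)
print(f"Gini index: {gini:.3f}")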
Binary files not shown.

dev/_downloads/a97bf662e52d471b04e1ab480c0ad7f2/plot_tweedie_regression_insurance_claims.ipynb

Lines changed: 2 additions & 2 deletions
@@ -33,7 +33,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import (\n    FunctionTransformer,\n    KBinsDiscretizer,\n    OneHotEncoder,\n    StandardScaler,\n)\n\ndf = load_mtpl2()\n\n\n# Correct for unreasonable observations (that might be data error)\n# and a few exceptionally large claim amounts\ndf[\"ClaimNb\"] = df[\"ClaimNb\"].clip(upper=4)\ndf[\"Exposure\"] = df[\"Exposure\"].clip(upper=1)\ndf[\"ClaimAmount\"] = df[\"ClaimAmount\"].clip(upper=200000)\n# If the claim amount is 0, then we do not count it as a claim. The loss function\n# used by the severity model needs strictly positive claim amounts. This way\n# frequency and severity are more consistent with each other.\ndf.loc[(df[\"ClaimAmount\"] == 0) & (df[\"ClaimNb\"] >= 1), \"ClaimNb\"] = 0\n\nlog_scale_transformer = make_pipeline(\n    FunctionTransformer(func=np.log), StandardScaler()\n)\n\ncolumn_trans = ColumnTransformer(\n    [\n        (\n            \"binned_numeric\",\n            KBinsDiscretizer(n_bins=10, random_state=0),\n            [\"VehAge\", \"DrivAge\"],\n        ),\n        (\n            \"onehot_categorical\",\n            OneHotEncoder(),\n            [\"VehBrand\", \"VehPower\", \"VehGas\", \"Region\", \"Area\"],\n        ),\n        (\"passthrough_numeric\", \"passthrough\", [\"BonusMalus\"]),\n        (\"log_scaled_numeric\", log_scale_transformer, [\"Density\"]),\n    ],\n    remainder=\"drop\",\n)\nX = column_trans.fit_transform(df)\n\n# Insurances companies are interested in modeling the Pure Premium, that is\n# the expected total claim amount per unit of exposure for each policyholder\n# in their portfolio:\ndf[\"PurePremium\"] = df[\"ClaimAmount\"] / df[\"Exposure\"]\n\n# This can be indirectly approximated by a 2-step modeling: the product of the\n# Frequency times the average claim amount per claim:\ndf[\"Frequency\"] = df[\"ClaimNb\"] / df[\"Exposure\"]\ndf[\"AvgClaimAmount\"] = df[\"ClaimAmount\"] / np.fmax(df[\"ClaimNb\"], 1)\n\nwith pd.option_context(\"display.max_columns\", 15):\n    print(df[df.ClaimAmount > 0].head())"
+"from sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import (\n    FunctionTransformer,\n    KBinsDiscretizer,\n    OneHotEncoder,\n    StandardScaler,\n)\n\ndf = load_mtpl2()\n\n\n# Correct for unreasonable observations (that might be data error)\n# and a few exceptionally large claim amounts\ndf[\"ClaimNb\"] = df[\"ClaimNb\"].clip(upper=4)\ndf[\"Exposure\"] = df[\"Exposure\"].clip(upper=1)\ndf[\"ClaimAmount\"] = df[\"ClaimAmount\"].clip(upper=200000)\n# If the claim amount is 0, then we do not count it as a claim. The loss function\n# used by the severity model needs strictly positive claim amounts. This way\n# frequency and severity are more consistent with each other.\ndf.loc[(df[\"ClaimAmount\"] == 0) & (df[\"ClaimNb\"] >= 1), \"ClaimNb\"] = 0\n\nlog_scale_transformer = make_pipeline(\n    FunctionTransformer(func=np.log), StandardScaler()\n)\n\ncolumn_trans = ColumnTransformer(\n    [\n        (\n            \"binned_numeric\",\n            KBinsDiscretizer(\n                n_bins=10, quantile_method=\"averaged_inverted_cdf\", random_state=0\n            ),\n            [\"VehAge\", \"DrivAge\"],\n        ),\n        (\n            \"onehot_categorical\",\n            OneHotEncoder(),\n            [\"VehBrand\", \"VehPower\", \"VehGas\", \"Region\", \"Area\"],\n        ),\n        (\"passthrough_numeric\", \"passthrough\", [\"BonusMalus\"]),\n        (\"log_scaled_numeric\", log_scale_transformer, [\"Density\"]),\n    ],\n    remainder=\"drop\",\n)\nX = column_trans.fit_transform(df)\n\n# Insurances companies are interested in modeling the Pure Premium, that is\n# the expected total claim amount per unit of exposure for each policyholder\n# in their portfolio:\ndf[\"PurePremium\"] = df[\"ClaimAmount\"] / df[\"Exposure\"]\n\n# This can be indirectly approximated by a 2-step modeling: the product of the\n# Frequency times the average claim amount per claim:\ndf[\"Frequency\"] = df[\"ClaimNb\"] / df[\"Exposure\"]\ndf[\"AvgClaimAmount\"] = df[\"ClaimAmount\"] / np.fmax(df[\"ClaimNb\"], 1)\n\nwith pd.option_context(\"display.max_columns\", 15):\n    print(df[df.ClaimAmount > 0].head())"
 ]
 },
 {
@@ -242,7 +242,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.metrics import auc\n\n\ndef lorenz_curve(y_true, y_pred, exposure):\n    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)\n    exposure = np.asarray(exposure)\n\n    # order samples by increasing predicted risk:\n    ranking = np.argsort(y_pred)\n    ranked_exposure = exposure[ranking]\n    ranked_pure_premium = y_true[ranking]\n    cumulative_claim_amount = np.cumsum(ranked_pure_premium * ranked_exposure)\n    cumulative_claim_amount /= cumulative_claim_amount[-1]\n    cumulative_exposure = np.cumsum(ranked_exposure)\n    cumulative_exposure /= cumulative_exposure[-1]\n    return cumulative_exposure, cumulative_claim_amount\n\n\nfig, ax = plt.subplots(figsize=(8, 8))\n\ny_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test)\ny_pred_total = glm_pure_premium.predict(X_test)\n\nfor label, y_pred in [\n    (\"Frequency * Severity model\", y_pred_product),\n    (\"Compound Poisson Gamma\", y_pred_total),\n]:\n    cum_exposure, cum_claims = lorenz_curve(\n        df_test[\"PurePremium\"], y_pred, df_test[\"Exposure\"]\n    )\n    gini = 1 - 2 * auc(cum_exposure, cum_claims)\n    label += \" (Gini index: {:.3f})\".format(gini)\n    ax.plot(cum_exposure, cum_claims, linestyle=\"-\", label=label)\n\n# Oracle model: y_pred == y_test\ncum_exposure, cum_claims = lorenz_curve(\n    df_test[\"PurePremium\"], df_test[\"PurePremium\"], df_test[\"Exposure\"]\n)\ngini = 1 - 2 * auc(cum_exposure, cum_claims)\nlabel = \"Oracle (Gini index: {:.3f})\".format(gini)\nax.plot(cum_exposure, cum_claims, linestyle=\"-.\", color=\"gray\", label=label)\n\n# Random baseline\nax.plot([0, 1], [0, 1], linestyle=\"--\", color=\"black\", label=\"Random baseline\")\nax.set(\n    title=\"Lorenz Curves\",\n    xlabel=(\n        \"Cumulative proportion of exposure\\n\"\n        \"(ordered by model from safest to riskiest)\"\n    ),\n    ylabel=\"Cumulative proportion of claim amounts\",\n)\nax.legend(loc=\"upper left\")\nplt.plot()"
+"from sklearn.metrics import auc\n\n\ndef lorenz_curve(y_true, y_pred, exposure):\n    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)\n    exposure = np.asarray(exposure)\n\n    # order samples by increasing predicted risk:\n    ranking = np.argsort(y_pred)\n    ranked_exposure = exposure[ranking]\n    ranked_pure_premium = y_true[ranking]\n    cumulative_claim_amount = np.cumsum(ranked_pure_premium * ranked_exposure)\n    cumulative_claim_amount /= cumulative_claim_amount[-1]\n    cumulative_exposure = np.cumsum(ranked_exposure)\n    cumulative_exposure /= cumulative_exposure[-1]\n    return cumulative_exposure, cumulative_claim_amount\n\n\nfig, ax = plt.subplots(figsize=(8, 8))\n\ny_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test)\ny_pred_total = glm_pure_premium.predict(X_test)\n\nfor label, y_pred in [\n    (\"Frequency * Severity model\", y_pred_product),\n    (\"Compound Poisson Gamma\", y_pred_total),\n]:\n    cum_exposure, cum_claims = lorenz_curve(\n        df_test[\"PurePremium\"], y_pred, df_test[\"Exposure\"]\n    )\n    gini = 1 - 2 * auc(cum_exposure, cum_claims)\n    label += \" (Gini index: {:.3f})\".format(gini)\n    ax.plot(cum_exposure, cum_claims, linestyle=\"-\", label=label)\n\n# Oracle model: y_pred == y_test\ncum_exposure, cum_claims = lorenz_curve(\n    df_test[\"PurePremium\"], df_test[\"PurePremium\"], df_test[\"Exposure\"]\n)\ngini = 1 - 2 * auc(cum_exposure, cum_claims)\nlabel = \"Oracle (Gini index: {:.3f})\".format(gini)\nax.plot(cum_exposure, cum_claims, linestyle=\"-.\", color=\"gray\", label=label)\n\n# Random baseline\nax.plot([0, 1], [0, 1], linestyle=\"--\", color=\"black\", label=\"Random baseline\")\nax.set(\n    title=\"Lorenz Curves\",\n    xlabel=(\n        \"Cumulative proportion of exposure\\n(ordered by model from safest to riskiest)\"\n    ),\n    ylabel=\"Cumulative proportion of claim amounts\",\n)\nax.legend(loc=\"upper left\")\nplt.plot()"
 ]
 }
 ],
