"""
=================================
Bagging classifiers using sampler
=================================

In this example, we show how
:class:`~imblearn.ensemble.BalancedBaggingClassifier` can be used to create a
large variety of classifiers by passing different samplers.

We will give several examples that have been published in the past years.
"""

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT

# %%
print(__doc__)

# %% [markdown]
# Generate an imbalanced dataset
# ------------------------------
#
# For this example, we will create a synthetic dataset using the function
# :func:`~sklearn.datasets.make_classification`. The problem will be a toy
# classification problem with a ratio of 1:9 between the two classes.

# %%
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=10_000,
    n_features=10,
    weights=[0.1, 0.9],
    class_sep=0.5,
    random_state=0,
)

# %%
import pandas as pd

pd.Series(y).value_counts(normalize=True)

# %% [markdown]
# In the following sections, we will show a couple of algorithms that have
# been proposed over the years. We intend to illustrate how one can reuse the
# :class:`~imblearn.ensemble.BalancedBaggingClassifier` by passing different
# samplers. As a baseline, we first evaluate a classic
# :class:`~sklearn.ensemble.BaggingClassifier` without any resampling.

# %%
from sklearn.model_selection import cross_validate
from sklearn.ensemble import BaggingClassifier

# classic bagging without any resampling, as a point of comparison
bagging = BaggingClassifier()
cv_results = cross_validate(bagging, X, y, scoring="balanced_accuracy")

print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}")

# %% [markdown]
# Exactly Balanced Bagging and Over-Bagging
# -----------------------------------------
#
# The :class:`~imblearn.ensemble.BalancedBaggingClassifier` can be used in
# conjunction with a :class:`~imblearn.under_sampling.RandomUnderSampler` or a
# :class:`~imblearn.over_sampling.RandomOverSampler`. These methods are
# referred to as Exactly Balanced Bagging and Over-Bagging, respectively, and
# were first proposed in [1]_.

# %%
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.under_sampling import RandomUnderSampler

# Exactly Balanced Bagging
ebb = BalancedBaggingClassifier(sampler=RandomUnderSampler())
cv_results = cross_validate(ebb, X, y, scoring="balanced_accuracy")

print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}")
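
# %% [markdown]
# To make the effect of the sampler concrete, we can apply it once by hand (a
# minimal illustration, not part of the bagging procedure itself): the
# majority class is randomly under-sampled until both classes contain the
# same number of samples.

# %%
X_res, y_res = RandomUnderSampler(random_state=0).fit_resample(X, y)
pd.Series(y_res).value_counts()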

# %%
from imblearn.over_sampling import RandomOverSampler

# Over-bagging
over_bagging = BalancedBaggingClassifier(sampler=RandomOverSampler())
cv_results = cross_validate(over_bagging, X, y, scoring="balanced_accuracy")

print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}")

# %% [markdown]
# SMOTE-Bagging
# -------------
#
# Instead of using a :class:`~imblearn.over_sampling.RandomOverSampler`, which
# bootstraps the minority class, an alternative is to use
# :class:`~imblearn.over_sampling.SMOTE` as an over-sampler. This is known as
# SMOTE-Bagging [2]_.

# %%
from imblearn.over_sampling import SMOTE

# SMOTE-Bagging
smote_bagging = BalancedBaggingClassifier(sampler=SMOTE())
cv_results = cross_validate(smote_bagging, X, y, scoring="balanced_accuracy")

print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}")
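
# %% [markdown]
# For intuition, a single call to the sampler (again only an illustration)
# shows that :class:`~imblearn.over_sampling.SMOTE` balances the classes by
# generating new synthetic minority samples instead of duplicating existing
# ones, so the resampled dataset grows.

# %%
X_res, y_res = SMOTE(random_state=0).fit_resample(X, y)
print(pd.Series(y_res).value_counts())
print(f"{X_res.shape[0] - X.shape[0]} synthetic samples were generated")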

# %% [markdown]
# Roughly Balanced Bagging
# ------------------------
#
# While using a :class:`~imblearn.under_sampling.RandomUnderSampler` or a
# :class:`~imblearn.over_sampling.RandomOverSampler` will create exactly the
# desired number of samples, it does not follow the statistical spirit of the
# bagging framework. The authors in [3]_ propose to use a negative binomial
# distribution to compute the number of samples of the majority class to be
# selected and then perform a random under-sampling.
#
# Here, we illustrate this method by implementing a function in charge of the
# resampling and using the :class:`~imblearn.FunctionSampler` to integrate it
# within a :class:`~imblearn.pipeline.Pipeline` and
# :class:`~sklearn.model_selection.cross_validate`.

# %%
from collections import Counter
import numpy as np
from imblearn import FunctionSampler


def roughly_balanced_bagging(X, y, replace=False):
    """Implementation of Roughly Balanced Bagging for a binary problem."""
    # find the minority and majority classes
    class_counts = Counter(y)
    majority_class = max(class_counts, key=class_counts.get)
    minority_class = min(class_counts, key=class_counts.get)

    # compute the number of samples to draw from the majority class using
    # a negative binomial distribution
    n_minority_class = class_counts[minority_class]
    n_majority_resampled = np.random.negative_binomial(n=n_minority_class, p=0.5)

    # draw randomly with or without replacement
    majority_indices = np.random.choice(
        np.flatnonzero(y == majority_class),
        size=n_majority_resampled,
        replace=replace,
    )
    minority_indices = np.random.choice(
        np.flatnonzero(y == minority_class),
        size=n_minority_class,
        replace=replace,
    )
    indices = np.hstack([majority_indices, minority_indices])

    return X[indices], y[indices]

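
# %% [markdown]
# As a small sanity check of the distribution used above: with ``p=0.5``, a
# negative binomial draw with ``n`` set to the minority class size has mean
# ``n * (1 - p) / p = n``. On average, each bootstrap sample is therefore
# balanced, while its exact size still varies from draw to draw.

# %%
n_minority = min(Counter(y).values())
draws = np.random.negative_binomial(n=n_minority, p=0.5, size=10_000)
print(f"Minority class size: {n_minority}")
print(f"Mean of the draws: {draws.mean():.1f}")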

# %%
# Roughly Balanced Bagging
rbb = BalancedBaggingClassifier(
    sampler=FunctionSampler(func=roughly_balanced_bagging, kw_args={"replace": True})
)
cv_results = cross_validate(rbb, X, y, scoring="balanced_accuracy")

print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}")


# %% [markdown]
# .. topic:: References:
#
#    .. [1] R. Maclin, and D. Opitz. "An empirical evaluation of bagging and
#       boosting." AAAI/IAAI 1997 (1997): 546-551.
#
#    .. [2] S. Wang, and X. Yao. "Diversity analysis on imbalanced data sets
#       by using ensemble models." 2009 IEEE Symposium on Computational
#       Intelligence and Data Mining. IEEE, 2009.
#
#    .. [3] S. Hido, H. Kashima, and Y. Takahashi. "Roughly balanced bagging
#       for imbalanced data." Statistical Analysis and Data Mining: The ASA
#       Data Science Journal 2.5-6 (2009): 412-426.