from numbers import Number

import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Literal, Optional, Sequence, Tuple, Union
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

from eis_toolkit import exceptions
from eis_toolkit.prediction.model_utils import _train_and_validate_sklearn_model


@beartype
def gradient_boosting_classifier_train(
    X: Union[np.ndarray, pd.DataFrame],
    y: Union[np.ndarray, pd.Series],
    validation_method: Literal["split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "split",
    metrics: Sequence[Literal["accuracy", "precision", "recall", "f1", "auc"]] = ["accuracy"],
    split_size: float = 0.2,
    cv_folds: int = 5,
    loss: Literal["log_loss", "exponential"] = "log_loss",
    learning_rate: Number = 0.1,
    n_estimators: int = 100,
    max_depth: Optional[int] = 3,
    subsample: Number = 1.0,
    verbose: int = 0,
    random_state: Optional[int] = None,
    **kwargs,
) -> Tuple[GradientBoostingClassifier, dict]:
    """
    Train and optionally validate a Gradient Boosting classifier model using Sklearn.

    Various options and configurations for model performance evaluation are available. The available
    choices are no validation, a single train/validation split, and cross-validation. If validation is
    performed, the metric(s) to calculate can be defined and the validation process configured
    (cross-validation method, number of folds, size of the split). Depending on the details of the
    validation process, the output metrics dictionary can be empty, one-dimensional or nested.

    Args:
        X: Training data.
        y: Target labels.
        validation_method: Validation method to use. "split" divides data into two parts, "kfold_cv"
            performs k-fold cross-validation, "skfold_cv" performs stratified k-fold cross-validation,
            "loo_cv" performs leave-one-out cross-validation and "none" will not validate the model at all
            (in this case, all X and y will be used solely for training).
        metrics: Metrics to use for scoring the model. Defaults to "accuracy".
        split_size: Fraction of the dataset to be used as validation data (the rest is used for training).
            Used only when validation_method is "split". Defaults to 0.2.
        cv_folds: Number of folds used in cross-validation. Used only when validation_method is "kfold_cv"
            or "skfold_cv". Defaults to 5.
        loss: The loss function to be optimized. Defaults to "log_loss" (same as in logistic regression).
        learning_rate: Shrinks the contribution of each tree. Values must be >= 0. Defaults to 0.1.
        n_estimators: The number of boosting stages to run. Gradient boosting is fairly robust to
            over-fitting, so a large number can result in better performance. Values must be >= 1.
            Defaults to 100.
        max_depth: Maximum depth of the individual regression estimators. The maximum depth limits the
            number of nodes in the tree. Values must be >= 1 or None, in which case nodes are expanded
            until all leaves are pure or until all leaves contain less than min_samples_split samples.
            Defaults to 3.
        subsample: The fraction of samples to be used for fitting the individual base learners.
            If smaller than 1.0, this results in Stochastic Gradient Boosting. Subsample interacts with
            the parameter n_estimators. Choosing subsample < 1.0 leads to a reduction of variance and an
            increase in bias. Values must be in the range 0.0 < x <= 1.0. Defaults to 1.0.
        verbose: Specifies if modeling progress and performance should be printed. 0 prints nothing,
            1 prints progress occasionally (frequency depends on the number of trees), and 2 or above
            prints for every tree. Defaults to 0.
        random_state: Seed for random number generation. Defaults to None.
        **kwargs: Additional parameters for Sklearn's GradientBoostingClassifier.

    Returns:
        The trained GradientBoostingClassifier and metric scores as a dictionary.

    Raises:
        InvalidParameterValueException: If some of the numeric parameters are given invalid input values.
    """
    if not learning_rate >= 0:
        raise exceptions.InvalidParameterValueException("Learning rate must be non-negative.")
    if not n_estimators >= 1:
        raise exceptions.InvalidParameterValueException("N-estimators must be at least 1.")
    if max_depth is not None and not max_depth >= 1:
        raise exceptions.InvalidParameterValueException("Max depth must be at least 1 or None.")
    if not (0 < subsample <= 1):
        raise exceptions.InvalidParameterValueException("Subsample must be more than 0 and at most 1.")
    if verbose < 0:
        raise exceptions.InvalidParameterValueException("Verbose must be a non-negative number.")

    model = GradientBoostingClassifier(
        loss=loss,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        subsample=subsample,
        random_state=random_state,
        verbose=verbose,
        **kwargs,
    )

    model, metrics = _train_and_validate_sklearn_model(
        X=X,
        y=y,
        model=model,
        validation_method=validation_method,
        metrics=metrics,
        split_size=split_size,
        cv_folds=cv_folds,
        random_state=random_state,
    )

    return model, metrics


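# Illustrative usage sketch (an assumption for demonstration, not part of the module): train the
# classifier above on a synthetic dataset with stratified 5-fold cross-validation and two metrics.
#
#   from sklearn.datasets import make_classification
#
#   X, y = make_classification(n_samples=200, n_features=8, random_state=42)
#   model, scores = gradient_boosting_classifier_train(
#       X, y, validation_method="skfold_cv", metrics=["accuracy", "f1"], cv_folds=5
#   )
#   print(scores)
#
# As noted in the docstring, the shape of `scores` depends on the validation setup: nested for
# cross-validation (per-fold scores), flat for "split", and empty for "none".

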
@beartype
def gradient_boosting_regressor_train(
    X: Union[np.ndarray, pd.DataFrame],
    y: Union[np.ndarray, pd.Series],
    validation_method: Literal["split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "split",
    metrics: Sequence[Literal["mse", "rmse", "mae", "r2"]] = ["mse"],
    split_size: float = 0.2,
    cv_folds: int = 5,
    loss: Literal["squared_error", "absolute_error", "huber", "quantile"] = "squared_error",
    learning_rate: Number = 0.1,
    n_estimators: int = 100,
    max_depth: Optional[int] = 3,
    subsample: Number = 1.0,
    verbose: int = 0,
    random_state: Optional[int] = None,
    **kwargs,
) -> Tuple[GradientBoostingRegressor, dict]:
    """
    Train and optionally validate a Gradient Boosting regressor model using Sklearn.

    Various options and configurations for model performance evaluation are available. The available
    choices are no validation, a single train/validation split, and cross-validation. If validation is
    performed, the metric(s) to calculate can be defined and the validation process configured
    (cross-validation method, number of folds, size of the split). Depending on the details of the
    validation process, the output metrics dictionary can be empty, one-dimensional or nested.

    Args:
        X: Training data.
        y: Target labels.
        validation_method: Validation method to use. "split" divides data into two parts, "kfold_cv"
            performs k-fold cross-validation, "skfold_cv" performs stratified k-fold cross-validation,
            "loo_cv" performs leave-one-out cross-validation and "none" will not validate the model at all
            (in this case, all X and y will be used solely for training).
        metrics: Metrics to use for scoring the model. Defaults to "mse".
        split_size: Fraction of the dataset to be used as validation data (the rest is used for training).
            Used only when validation_method is "split". Defaults to 0.2.
        cv_folds: Number of folds used in cross-validation. Used only when validation_method is "kfold_cv"
            or "skfold_cv". Defaults to 5.
        loss: The loss function to be optimized. Defaults to "squared_error".
        learning_rate: Shrinks the contribution of each tree. Values must be >= 0. Defaults to 0.1.
        n_estimators: The number of boosting stages to run. Gradient boosting is fairly robust to
            over-fitting, so a large number can result in better performance. Values must be >= 1.
            Defaults to 100.
        max_depth: Maximum depth of the individual regression estimators. The maximum depth limits the
            number of nodes in the tree. Values must be >= 1 or None, in which case nodes are expanded
            until all leaves are pure or until all leaves contain less than min_samples_split samples.
            Defaults to 3.
        subsample: The fraction of samples to be used for fitting the individual base learners.
            If smaller than 1.0, this results in Stochastic Gradient Boosting. Subsample interacts with
            the parameter n_estimators. Choosing subsample < 1.0 leads to a reduction of variance and an
            increase in bias. Values must be in the range 0.0 < x <= 1.0. Defaults to 1.0.
        verbose: Specifies if modeling progress and performance should be printed. 0 prints nothing,
            1 prints progress occasionally (frequency depends on the number of trees), and 2 or above
            prints for every tree. Defaults to 0.
        random_state: Seed for random number generation. Defaults to None.
        **kwargs: Additional parameters for Sklearn's GradientBoostingRegressor.

    Returns:
        The trained GradientBoostingRegressor and metric scores as a dictionary.

    Raises:
        InvalidParameterValueException: If some of the numeric parameters are given invalid input values.
    """
    if not learning_rate >= 0:
        raise exceptions.InvalidParameterValueException("Learning rate must be non-negative.")
    if not n_estimators >= 1:
        raise exceptions.InvalidParameterValueException("N-estimators must be at least 1.")
    if max_depth is not None and not max_depth >= 1:
        raise exceptions.InvalidParameterValueException("Max depth must be at least 1 or None.")
    if not (0 < subsample <= 1):
        raise exceptions.InvalidParameterValueException("Subsample must be more than 0 and at most 1.")
    if verbose < 0:
        raise exceptions.InvalidParameterValueException("Verbose must be a non-negative number.")

    model = GradientBoostingRegressor(
        loss=loss,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        subsample=subsample,
        random_state=random_state,
        verbose=verbose,
        **kwargs,
    )

    model, metrics = _train_and_validate_sklearn_model(
        X=X,
        y=y,
        model=model,
        validation_method=validation_method,
        metrics=metrics,
        split_size=split_size,
        cv_folds=cv_folds,
        random_state=random_state,
    )

    return model, metrics
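

# Illustrative usage sketch (an assumption for demonstration, not part of the module): train the
# regressor above with a single 75/25 train/validation split.
#
#   from sklearn.datasets import make_regression
#
#   X, y = make_regression(n_samples=200, n_features=8, noise=0.5, random_state=42)
#   model, scores = gradient_boosting_regressor_train(
#       X, y, validation_method="split", metrics=["rmse", "r2"], split_size=0.25
#   )
#   print(scores)  # e.g. a flat dictionary with one entry per requested metric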