
Commit 0b84fc9

Merge pull request #210 from GispoCoding/161-add-gradient-boosting-and-random-forests
161 add gradient boosting, logistic regression and random forests
2 parents 014b825 + a9d3c6e commit 0b84fc9

11 files changed: +1098 −0

docs/prediction/gradient_boosting.md (new file, +3)

# Gradient boosting

::: eis_toolkit.prediction.gradient_boosting
docs/prediction/logistic_regression.md (new file, +3)

# Logistic regression

::: eis_toolkit.prediction.logistic_regression

docs/prediction/random_forests.md (new file, +3)

# Random forests

::: eis_toolkit.prediction.random_forests
eis_toolkit/prediction/gradient_boosting.py (new file, +202)

from numbers import Number

import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Literal, Optional, Sequence, Tuple, Union
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

from eis_toolkit import exceptions
from eis_toolkit.prediction.model_utils import _train_and_validate_sklearn_model


@beartype
def gradient_boosting_classifier_train(
    X: Union[np.ndarray, pd.DataFrame],
    y: Union[np.ndarray, pd.Series],
    validation_method: Literal["split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "split",
    metrics: Sequence[Literal["accuracy", "precision", "recall", "f1", "auc"]] = ["accuracy"],
    split_size: float = 0.2,
    cv_folds: int = 5,
    loss: Literal["log_loss", "exponential"] = "log_loss",
    learning_rate: Number = 0.1,
    n_estimators: int = 100,
    max_depth: Optional[int] = 3,
    subsample: Number = 1.0,
    verbose: int = 0,
    random_state: Optional[int] = None,
    **kwargs,
) -> Tuple[GradientBoostingClassifier, dict]:
    """
    Train and optionally validate a Gradient Boosting classifier model using Sklearn.

    Various options and configurations for model performance evaluation are available: no validation,
    a split into train and validation parts, or cross-validation can be chosen. If validation is performed,
    the metric(s) to calculate can be defined and the validation process configured (cross-validation method,
    number of folds, size of the split). Depending on the details of the validation process,
    the output metrics dictionary can be empty, one-dimensional or nested.

    Args:
        X: Training data.
        y: Target labels.
        validation_method: Validation method to use. "split" divides data into two parts, "kfold_cv"
            performs k-fold cross-validation, "skfold_cv" performs stratified k-fold cross-validation,
            "loo_cv" performs leave-one-out cross-validation and "none" will not validate the model at all
            (in this case, all X and y will be used solely for training).
        metrics: Metrics to use for scoring the model. Defaults to "accuracy".
        split_size: Fraction of the dataset to be used as validation data (the rest is used for training).
            Used only when validation_method is "split". Defaults to 0.2.
        cv_folds: Number of folds used in cross-validation. Used only when validation_method is "kfold_cv"
            or "skfold_cv". Defaults to 5.
        loss: The loss function to be optimized. Defaults to "log_loss" (same as in logistic regression).
        learning_rate: Shrinks the contribution of each tree. Values must be >= 0. Defaults to 0.1.
        n_estimators: The number of boosting stages to run. Gradient boosting is fairly robust to over-fitting,
            so a large number can result in better performance. Values must be >= 1. Defaults to 100.
        max_depth: Maximum depth of the individual regression estimators. The maximum depth limits the number
            of nodes in the tree. Values must be >= 1 or None, in which case nodes are expanded until all leaves
            are pure or until all leaves contain less than min_samples_split samples. Defaults to 3.
        subsample: The fraction of samples to be used for fitting the individual base learners.
            If smaller than 1.0 this results in Stochastic Gradient Boosting. Subsample interacts with the
            parameter n_estimators. Choosing subsample < 1.0 leads to a reduction of variance and an increase in bias.
            Values must be in the range 0.0 < x <= 1.0. Defaults to 1.0.
        verbose: Specifies if modeling progress and performance should be printed. 0 doesn't print,
            1 prints occasionally depending on the number of trees, 2 or above prints for every tree.
        random_state: Seed for random number generation. Defaults to None.
        **kwargs: Additional parameters for Sklearn's GradientBoostingClassifier.

    Returns:
        The trained GradientBoostingClassifier and metric scores as a dictionary.

    Raises:
        InvalidParameterValueException: If some of the numeric parameters are given invalid input values.
    """
    if not learning_rate >= 0:
        raise exceptions.InvalidParameterValueException("Learning rate must be non-negative.")
    if not n_estimators >= 1:
        raise exceptions.InvalidParameterValueException("N-estimators must be at least 1.")
    if max_depth is not None and not max_depth >= 1:
        raise exceptions.InvalidParameterValueException("Max depth must be at least 1 or None.")
    if not (0 < subsample <= 1):
        raise exceptions.InvalidParameterValueException("Subsample must be more than 0 and at most 1.")
    if verbose < 0:
        raise exceptions.InvalidParameterValueException("Verbose must be a non-negative number.")

    model = GradientBoostingClassifier(
        loss=loss,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        subsample=subsample,
        random_state=random_state,
        verbose=verbose,
        **kwargs,
    )

    model, metrics = _train_and_validate_sklearn_model(
        X=X,
        y=y,
        model=model,
        validation_method=validation_method,
        metrics=metrics,
        split_size=split_size,
        cv_folds=cv_folds,
        random_state=random_state,
    )

    return model, metrics
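
A minimal usage sketch for the classifier above, assuming eis_toolkit is installed. The dataset and hyperparameter values are illustrative only, and the exact layout of the returned scores dictionary depends on the private helper `_train_and_validate_sklearn_model`, which is not part of this diff. The regressor half of the file continues below.

```python
from sklearn.datasets import load_breast_cancer

from eis_toolkit.prediction.gradient_boosting import gradient_boosting_classifier_train

# Any (X, y) pair of ndarrays or DataFrame/Series works; this dataset is illustrative.
X, y = load_breast_cancer(return_X_y=True)

# Stratified 5-fold cross-validation, scored with two of the supported metrics.
model, scores = gradient_boosting_classifier_train(
    X,
    y,
    validation_method="skfold_cv",
    metrics=["accuracy", "f1"],
    cv_folds=5,
    learning_rate=0.05,
    n_estimators=200,
    random_state=42,
)
print(scores)  # metric name -> score(s); exact structure depends on the validation method
```
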
@beartype
def gradient_boosting_regressor_train(
    X: Union[np.ndarray, pd.DataFrame],
    y: Union[np.ndarray, pd.Series],
    validation_method: Literal["split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "split",
    metrics: Sequence[Literal["mse", "rmse", "mae", "r2"]] = ["mse"],
    split_size: float = 0.2,
    cv_folds: int = 5,
    loss: Literal["squared_error", "absolute_error", "huber", "quantile"] = "squared_error",
    learning_rate: Number = 0.1,
    n_estimators: int = 100,
    max_depth: Optional[int] = 3,
    subsample: Number = 1.0,
    verbose: int = 0,
    random_state: Optional[int] = None,
    **kwargs,
) -> Tuple[GradientBoostingRegressor, dict]:
    """
    Train and optionally validate a Gradient Boosting regressor model using Sklearn.

    Various options and configurations for model performance evaluation are available: no validation,
    a split into train and validation parts, or cross-validation can be chosen. If validation is performed,
    the metric(s) to calculate can be defined and the validation process configured (cross-validation method,
    number of folds, size of the split). Depending on the details of the validation process,
    the output metrics dictionary can be empty, one-dimensional or nested.

    Args:
        X: Training data.
        y: Target labels.
        validation_method: Validation method to use. "split" divides data into two parts, "kfold_cv"
            performs k-fold cross-validation, "skfold_cv" performs stratified k-fold cross-validation,
            "loo_cv" performs leave-one-out cross-validation and "none" will not validate the model at all
            (in this case, all X and y will be used solely for training).
        metrics: Metrics to use for scoring the model. Defaults to "mse".
        split_size: Fraction of the dataset to be used as validation data (the rest is used for training).
            Used only when validation_method is "split". Defaults to 0.2.
        cv_folds: Number of folds used in cross-validation. Used only when validation_method is "kfold_cv"
            or "skfold_cv". Defaults to 5.
        loss: The loss function to be optimized. Defaults to "squared_error".
        learning_rate: Shrinks the contribution of each tree. Values must be >= 0. Defaults to 0.1.
        n_estimators: The number of boosting stages to run. Gradient boosting is fairly robust to over-fitting,
            so a large number can result in better performance. Values must be >= 1. Defaults to 100.
        max_depth: Maximum depth of the individual regression estimators. The maximum depth limits the number
            of nodes in the tree. Values must be >= 1 or None, in which case nodes are expanded until all leaves
            are pure or until all leaves contain less than min_samples_split samples. Defaults to 3.
        subsample: The fraction of samples to be used for fitting the individual base learners.
            If smaller than 1.0 this results in Stochastic Gradient Boosting. Subsample interacts with the
            parameter n_estimators. Choosing subsample < 1.0 leads to a reduction of variance and an increase in bias.
            Values must be in the range 0.0 < x <= 1.0. Defaults to 1.0.
        verbose: Specifies if modeling progress and performance should be printed. 0 doesn't print,
            1 prints occasionally depending on the number of trees, 2 or above prints for every tree.
        random_state: Seed for random number generation. Defaults to None.
        **kwargs: Additional parameters for Sklearn's GradientBoostingRegressor.

    Returns:
        The trained GradientBoostingRegressor and metric scores as a dictionary.

    Raises:
        InvalidParameterValueException: If some of the numeric parameters are given invalid input values.
    """
    if not learning_rate >= 0:
        raise exceptions.InvalidParameterValueException("Learning rate must be non-negative.")
    if not n_estimators >= 1:
        raise exceptions.InvalidParameterValueException("N-estimators must be at least 1.")
    if max_depth is not None and not max_depth >= 1:
        raise exceptions.InvalidParameterValueException("Max depth must be at least 1 or None.")
    if not (0 < subsample <= 1):
        raise exceptions.InvalidParameterValueException("Subsample must be more than 0 and at most 1.")
    if verbose < 0:
        raise exceptions.InvalidParameterValueException("Verbose must be a non-negative number.")

    model = GradientBoostingRegressor(
        loss=loss,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        subsample=subsample,
        random_state=random_state,
        verbose=verbose,
        **kwargs,
    )

    model, metrics = _train_and_validate_sklearn_model(
        X=X,
        y=y,
        model=model,
        validation_method=validation_method,
        metrics=metrics,
        split_size=split_size,
        cv_folds=cv_folds,
        random_state=random_state,
    )

    return model, metrics
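
And a corresponding sketch for the regressor, again with an illustrative dataset and settings; the `subsample < 1.0` choice demonstrates the stochastic gradient boosting option mentioned in the docstring.

```python
from sklearn.datasets import load_diabetes

from eis_toolkit.prediction.gradient_boosting import gradient_boosting_regressor_train

X, y = load_diabetes(return_X_y=True)

# Hold out 20 % of the data for validation and score with MSE and R2.
model, scores = gradient_boosting_regressor_train(
    X,
    y,
    validation_method="split",
    metrics=["mse", "r2"],
    split_size=0.2,
    subsample=0.8,  # < 1.0 switches to stochastic gradient boosting
    random_state=0,
)
```
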
eis_toolkit/prediction/logistic_regression.py (new file, +89)

import numpy as np
import pandas as pd
from beartype import beartype
from beartype.typing import Literal, Optional, Sequence, Tuple, Union
from sklearn.linear_model import LogisticRegression

from eis_toolkit import exceptions
from eis_toolkit.prediction.model_utils import _train_and_validate_sklearn_model


@beartype
def logistic_regression_train(
    X: Union[np.ndarray, pd.DataFrame],
    y: Union[np.ndarray, pd.Series],
    validation_method: Literal["split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "split",
    metrics: Sequence[Literal["accuracy", "precision", "recall", "f1", "auc"]] = ["accuracy"],
    split_size: float = 0.2,
    cv_folds: int = 5,
    penalty: Literal["l1", "l2", "elasticnet", None] = "l2",
    max_iter: int = 100,
    solver: Literal["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"] = "lbfgs",
    verbose: int = 0,
    random_state: Optional[int] = None,
    **kwargs
) -> Tuple[LogisticRegression, dict]:
    """
    Train and optionally validate a Logistic Regression classifier model using Sklearn.

    Various options and configurations for model performance evaluation are available: no validation,
    a split into train and validation parts, or cross-validation can be chosen. If validation is performed,
    the metric(s) to calculate can be defined and the validation process configured (cross-validation method,
    number of folds, size of the split). Depending on the details of the validation process,
    the output metrics dictionary can be empty, one-dimensional or nested.

    The choice of the algorithm depends on the penalty chosen. Supported penalties by solver:
        'lbfgs' - ['l2', None]
        'liblinear' - ['l1', 'l2']
        'newton-cg' - ['l2', None]
        'newton-cholesky' - ['l2', None]
        'sag' - ['l2', None]
        'saga' - ['elasticnet', 'l1', 'l2', None]

    Args:
        X: Training data.
        y: Target labels.
        validation_method: Validation method to use. "split" divides data into two parts, "kfold_cv"
            performs k-fold cross-validation, "skfold_cv" performs stratified k-fold cross-validation,
            "loo_cv" performs leave-one-out cross-validation and "none" will not validate the model at all
            (in this case, all X and y will be used solely for training).
        metrics: Metrics to use for scoring the model. Defaults to "accuracy".
        split_size: Fraction of the dataset to be used as validation data (the rest is used for training).
            Used only when validation_method is "split". Defaults to 0.2.
        cv_folds: Number of folds used in cross-validation. Used only when validation_method is "kfold_cv"
            or "skfold_cv". Defaults to 5.
        penalty: Specifies the norm of the penalty. Defaults to 'l2'.
        max_iter: Maximum number of iterations taken for the solvers to converge. Defaults to 100.
        solver: Algorithm to use in the optimization problem. Defaults to 'lbfgs'.
        verbose: Specifies if modeling progress and performance should be printed. 0 doesn't print,
            values 1 or above will produce prints.
        random_state: Seed for random number generation. Defaults to None.
        **kwargs: Additional parameters for Sklearn's LogisticRegression.

    Returns:
        The trained Logistic Regression classifier and metric scores as a dictionary.

    Raises:
        InvalidParameterValueException: If some of the numeric parameters are given invalid input values.
    """
    if max_iter < 1:
        raise exceptions.InvalidParameterValueException("Max iter must be > 0.")
    if verbose < 0:
        raise exceptions.InvalidParameterValueException("Verbose must be a non-negative number.")

    model = LogisticRegression(
        penalty=penalty, max_iter=max_iter, random_state=random_state, solver=solver, verbose=verbose, **kwargs
    )

    model, metrics = _train_and_validate_sklearn_model(
        X=X,
        y=y,
        model=model,
        validation_method=validation_method,
        metrics=metrics,
        split_size=split_size,
        cv_folds=cv_folds,
        random_state=random_state,
    )

    return model, metrics
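
A usage sketch for `logistic_regression_train`, again with an illustrative dataset. Per the solver/penalty table in the docstring, an L1 penalty is only supported by the "liblinear" and "saga" solvers, so solver and penalty are paired explicitly here.

```python
from sklearn.datasets import load_breast_cancer

from eis_toolkit.prediction.logistic_regression import logistic_regression_train

X, y = load_breast_cancer(return_X_y=True)

# L1 penalty requires "liblinear" or "saga" (see the docstring table above).
model, scores = logistic_regression_train(
    X,
    y,
    validation_method="kfold_cv",
    metrics=["accuracy", "auc"],
    cv_folds=5,
    solver="liblinear",
    penalty="l1",
    max_iter=300,  # raise from the default 100 if convergence warnings appear
    random_state=42,
)
```
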
