
Commit 9becd2b

Clarified naming in modeling

- Renamed test -> validation where validation was meant
- Renamed simple_split -> split
- Renamed simple_split_size -> split_size
1 parent c7512df commit 9becd2b

5 files changed (+123 additions, -115 deletions)
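
For callers, the commit is a pure rename: every `test_method` keyword becomes `validation_method`, the `"simple_split"` option becomes `"split"`, and `simple_split_size` becomes `split_size`. A minimal before/after sketch (the toy data and the 0.25 value are illustrative, not from the commit; the import path is assumed from the file paths below):

    import numpy as np
    from eis_toolkit.prediction.gradient_boosting import gradient_boosting_classifier_train

    X = np.random.rand(100, 3)        # toy features, illustrative only
    y = np.random.randint(0, 2, 100)  # toy binary labels, illustrative only

    # Old call (before this commit):
    #   gradient_boosting_classifier_train(X, y, test_method="simple_split", simple_split_size=0.25)
    # New call (after this commit), same behaviour:
    model, metrics = gradient_boosting_classifier_train(
        X, y,
        validation_method="split",  # was: test_method="simple_split"
        split_size=0.25,            # was: simple_split_size=0.25
    )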

eis_toolkit/prediction/gradient_boosting.py

Lines changed: 29 additions & 29 deletions

@@ -7,16 +7,16 @@
 from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

 from eis_toolkit import exceptions
-from eis_toolkit.prediction.model_utils import _train_and_evaluate_sklearn_model
+from eis_toolkit.prediction.model_utils import _train_and_validate_sklearn_model


 @beartype
 def gradient_boosting_classifier_train(
     X: Union[np.ndarray, pd.DataFrame],
     y: Union[np.ndarray, pd.Series],
-    test_method: Literal["simple_split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "simple_split",
+    validation_method: Literal["split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "split",
     metrics: Sequence[Literal["accuracy", "precision", "recall", "f1", "auc"]] = ["accuracy"],
-    simple_split_size: float = 0.2,
+    split_size: float = 0.2,
     cv_folds: int = 5,
     loss: Literal["log_loss", "exponential"] = "log_loss",
     learning_rate: Number = 0.1,
@@ -31,22 +31,22 @@ def gradient_boosting_classifier_train(
     Train and optionally validate a Gradient Boosting classifier model using Sklearn.

     Various options and configurations for model performance evaluation are available. No validation,
-    simple train-test and cross-validation can be chosen. If validation is performed, metric(s) to
-    calculate can be defined and validation process configured (cross-validation method, number of folds,
-    size of the simple train-test split). Depending on the details of the validation process, the output
-    metrics dictionary can be empty, one-dimensional or nested.
+    split to train and validation parts, and cross-validation can be chosen. If validation is performed,
+    metric(s) to calculate can be defined and validation process configured (cross-validation method,
+    number of folds, size of the split). Depending on the details of the validation process,
+    the output metrics dictionary can be empty, one-dimensional or nested.

     Args:
         X: Training data.
         y: Target labels.
-        test_method: Test / validation method to use. "simple_split" divides data into two parts, "kfold_cv"
+        validation_method: Validation method to use. "split" divides data into two parts, "kfold_cv"
             performs k-fold cross-validation, "skfold_cv" performs stratified k-fold cross-validation,
-            "loo_cv" performs leave-one-out cross-validation and "none" will not test / validate model at all
+            "loo_cv" performs leave-one-out cross-validation and "none" will not validate model at all
            (in this case, all X and y will be used solely for training).
         metrics: Metrics to use for scoring the model. Defaults to "accuracy".
-        simple_split_size: Fraction of the dataset to be used as test data (rest is used for training).
-            Used only when test_method is "simple_split". Defaults to 0.2.
-        cv_folds: Number of folds used in cross-validation. Used only when test_method is "kfold_cv"
+        split_size: Fraction of the dataset to be used as validation data (rest is used for training).
+            Used only when validation_method is "split". Defaults to 0.2.
+        cv_folds: Number of folds used in cross-validation. Used only when validation_method is "kfold_cv"
             or "skfold_cv". Defaults to 5.
         loss: The loss function to be optimized. Defaults to "log_loss" (same as in logistic regression).
         learning_rate: Shrinks the contribution of each tree. Values must be >= 0. Defaults to 0.1.
@@ -92,13 +92,13 @@ def gradient_boosting_classifier_train(
         **kwargs,
     )

-    model, metrics = _train_and_evaluate_sklearn_model(
+    model, metrics = _train_and_validate_sklearn_model(
         X=X,
         y=y,
         model=model,
-        test_method=test_method,
+        validation_method=validation_method,
         metrics=metrics,
-        simple_split_size=simple_split_size,
+        split_size=split_size,
         cv_folds=cv_folds,
         random_state=random_state,
     )
@@ -110,9 +110,9 @@ def gradient_boosting_classifier_train(
 def gradient_boosting_regressor_train(
     X: Union[np.ndarray, pd.DataFrame],
     y: Union[np.ndarray, pd.Series],
-    test_method: Literal["simple_split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "simple_split",
+    validation_method: Literal["split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "split",
     metrics: Sequence[Literal["mse", "rmse", "mae", "r2"]] = ["mse"],
-    simple_split_size: float = 0.2,
+    split_size: float = 0.2,
     cv_folds: int = 5,
     loss: Literal["squared_error", "absolute_error", "huber", "quantile"] = "squared_error",
     learning_rate: Number = 0.1,
@@ -127,22 +127,22 @@ def gradient_boosting_regressor_train(
     Train and optionally validate a Gradient Boosting regressor model using Sklearn.

     Various options and configurations for model performance evaluation are available. No validation,
-    simple train-test and cross-validation can be chosen. If validation is performed, metric(s) to
-    calculate can be defined and validation process configured (cross-validation method, number of folds,
-    size of the simple train-test split). Depending on the details of the validation process, the output
-    metrics dictionary can be empty, one-dimensional or nested.
+    split to train and validation parts, and cross-validation can be chosen. If validation is performed,
+    metric(s) to calculate can be defined and validation process configured (cross-validation method,
+    number of folds, size of the split). Depending on the details of the validation process,
+    the output metrics dictionary can be empty, one-dimensional or nested.

     Args:
         X: Training data.
         y: Target labels.
-        test_method: Test / validation method to use. "simple_split" divides data into two parts, "kfold_cv"
+        validation_method: Validation method to use. "split" divides data into two parts, "kfold_cv"
             performs k-fold cross-validation, "skfold_cv" performs stratified k-fold cross-validation,
-            "loo_cv" performs leave-one-out cross-validation and "none" will not test / validate model at all
+            "loo_cv" performs leave-one-out cross-validation and "none" will not validate model at all
            (in this case, all X and y will be used solely for training).
         metrics: Metrics to use for scoring the model. Defaults to "mse".
-        simple_split_size: Fraction of the dataset to be used as test data (rest is used for training).
-            Used only when test_method is "simple_split". Defaults to 0.2.
-        cv_folds: Number of folds used in cross-validation. Used only when test_method is "kfold_cv"
+        split_size: Fraction of the dataset to be used as validation data (rest is used for training).
+            Used only when validation_method is "split". Defaults to 0.2.
+        cv_folds: Number of folds used in cross-validation. Used only when validation_method is "kfold_cv"
             or "skfold_cv". Defaults to 5.
         loss: The loss function to be optimized. Defaults to "squared_error".
         learning_rate: Shrinks the contribution of each tree. Values must be > 0. Defaults to 0.1.
@@ -188,13 +188,13 @@ def gradient_boosting_regressor_train(
         **kwargs,
     )

-    model, metrics = _train_and_evaluate_sklearn_model(
+    model, metrics = _train_and_validate_sklearn_model(
         X=X,
         y=y,
         model=model,
-        test_method=test_method,
+        validation_method=validation_method,
         metrics=metrics,
-        simple_split_size=simple_split_size,
+        split_size=split_size,
         cv_folds=cv_folds,
         random_state=random_state,
     )
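
Cross-validation is selected the same way as before, only through the renamed keyword. A hedged usage sketch with stratified k-fold validation and two metrics (toy data and values are illustrative, not from the commit); per the docstring above, with several metrics and cross-validation the returned metrics dictionary is nested:

    import numpy as np
    from eis_toolkit.prediction.gradient_boosting import gradient_boosting_classifier_train

    X = np.random.rand(200, 4)        # toy features, illustrative only
    y = np.random.randint(0, 2, 200)  # toy binary labels, illustrative only

    model, metrics = gradient_boosting_classifier_train(
        X, y,
        validation_method="skfold_cv",  # was: test_method="skfold_cv"
        metrics=["accuracy", "f1"],
        cv_folds=5,
    )
    # Nested dict: per-fold scores are collected under keys such as "accuracy_all"
    # (see the model_utils.py diff further down).
    print(metrics)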

eis_toolkit/prediction/logistic_regression.py

Lines changed: 15 additions & 15 deletions

@@ -5,16 +5,16 @@
 from sklearn.linear_model import LogisticRegression

 from eis_toolkit import exceptions
-from eis_toolkit.prediction.model_utils import _train_and_evaluate_sklearn_model
+from eis_toolkit.prediction.model_utils import _train_and_validate_sklearn_model


 @beartype
 def logistic_regression_train(
     X: Union[np.ndarray, pd.DataFrame],
     y: Union[np.ndarray, pd.Series],
-    test_method: Literal["simple_split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "simple_split",
+    validation_method: Literal["split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "split",
     metrics: Sequence[Literal["accuracy", "precision", "recall", "f1", "auc"]] = ["accuracy"],
-    simple_split_size: float = 0.2,
+    split_size: float = 0.2,
     cv_folds: int = 5,
     penalty: Literal["l1", "l2", "elasicnet", None] = "l2",
     max_iter: int = 100,
@@ -27,10 +27,10 @@ def logistic_regression_train(
     Train and optionally validate a Logistic Regression classifier model using Sklearn.

     Various options and configurations for model performance evaluation are available. No validation,
-    simple train-test and cross-validation can be chosen. If validation is performed, metric(s) to
-    calculate can be defined and validation process configured (cross-validation method, number of folds,
-    size of the simple train-test split). Depending on the details of the validation process, the output
-    metrics dictionary can be empty, one-dimensional or nested.
+    split to train and validation parts, and cross-validation can be chosen. If validation is performed,
+    metric(s) to calculate can be defined and validation process configured (cross-validation method,
+    number of folds, size of the split). Depending on the details of the validation process,
+    the output metrics dictionary can be empty, one-dimensional or nested.

     The choice of the algorithm depends on the penalty chosen. Supported penalties by solver:
     'lbfgs' - ['l2', None]
@@ -43,14 +43,14 @@ def logistic_regression_train(
     Args:
         X: Training data.
         y: Target labels.
-        test_method: Test / validation method to use. "simple_split" divides data into two parts, "kfold_cv"
+        validation_method: Validation method to use. "split" divides data into two parts, "kfold_cv"
             performs k-fold cross-validation, "skfold_cv" performs stratified k-fold cross-validation,
-            "loo_cv" performs leave-one-out cross-validation and "none" will not test / validate model at all
+            "loo_cv" performs leave-one-out cross-validation and "none" will not validate model at all
            (in this case, all X and y will be used solely for training).
         metrics: Metrics to use for scoring the model. Defaults to "accuracy".
-        simple_split_size: Fraction of the dataset to be used as test data (rest is used for training).
-            Used only when test_method is "simple_split". Defaults to 0.2.
-        cv_folds: Number of folds used in cross-validation. Used only when test_method is "kfold_cv"
+        split_size: Fraction of the dataset to be used as validation data (rest is used for training).
+            Used only when validation_method is "split". Defaults to 0.2.
+        cv_folds: Number of folds used in cross-validation. Used only when validation_method is "kfold_cv"
             or "skfold_cv". Defaults to 5.
         penalty: Specifies the norm of the penalty. Defaults to 'l2'.
         max_iter: Maximum number of iterations taken for the solvers to converge. Defaults to 100.
@@ -75,13 +75,13 @@ def logistic_regression_train(
         penalty=penalty, max_iter=max_iter, random_state=random_state, solver=solver, verbose=verbose, **kwargs
     )

-    model, metrics = _train_and_evaluate_sklearn_model(
+    model, metrics = _train_and_validate_sklearn_model(
         X=X,
         y=y,
         model=model,
-        test_method=test_method,
+        validation_method=validation_method,
         metrics=metrics,
-        simple_split_size=simple_split_size,
+        split_size=split_size,
         cv_folds=cv_folds,
         random_state=random_state,
     )
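
The rename applies identically here. A hedged sketch of the "no validation" path, where all of X and y are used for training and the returned metrics dictionary is empty (toy data, illustrative only; import path assumed from the file path above):

    import numpy as np
    from eis_toolkit.prediction.logistic_regression import logistic_regression_train

    X = np.random.rand(150, 3)        # toy features, illustrative only
    y = np.random.randint(0, 2, 150)  # toy binary labels, illustrative only

    model, metrics = logistic_regression_train(
        X, y,
        validation_method="none",  # was: test_method="none"; trains on all data, no validation
    )
    assert metrics == {}  # no validation performed, so no scores are returned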

eis_toolkit/prediction/model_utils.py

Lines changed: 21 additions & 21 deletions

@@ -19,7 +19,7 @@

 from eis_toolkit import exceptions

-SIMPLE_SPLIT = "simple_split"
+SPLIT = "split"
 KFOLD_CV = "kfold_cv"
 SKFOLD_CV = "skfold_cv"
 LOO_CV = "loo_cv"
@@ -53,18 +53,18 @@ def load_model(path: Path) -> BaseEstimator:


 @beartype
-def _train_and_evaluate_sklearn_model(
+def _train_and_validate_sklearn_model(
     X: Union[np.ndarray, pd.DataFrame],
     y: Union[np.ndarray, pd.Series],
     model: BaseEstimator,
-    test_method: Literal["simple_split", "kfold_cv", "skfold_cv", "loo_cv", "none"],
+    validation_method: Literal["split", "kfold_cv", "skfold_cv", "loo_cv", "none"],
     metrics: Sequence[Literal["mse", "rmse", "mae", "r2", "accuracy", "precision", "recall", "f1"]],
-    simple_split_size: float = 0.2,
+    split_size: float = 0.2,
     cv_folds: int = 5,
     random_state: Optional[int] = 42,
 ) -> Tuple[BaseEstimator, dict]:
     """
-    Train and evaluate Sklearn model.
+    Train and validate Sklearn model.

     Serves as a common private/inner function for Random Forest, Logistic Regression and Gradient Boosting
     public functions.
@@ -74,38 +74,38 @@ def _train_and_evaluate_sklearn_model(
     x_size = X.index.size if isinstance(X, pd.DataFrame) else X.shape[0]
     if x_size != y.size:
         raise exceptions.NonMatchingParameterLengthsException(f"X and y must have the length {x_size} != {y.size}.")
-    if len(metrics) == 0 and test_method != NO_VALIDATION:
+    if len(metrics) == 0 and validation_method != NO_VALIDATION:
         raise exceptions.InvalidParameterValueException(
             "Metrics must have at least one chosen metric to validate model."
         )
     if cv_folds < 2:
         raise exceptions.InvalidParameterValueException("Number of cross-validation folds must be at least 2.")
-    if not (0 < simple_split_size < 1):
-        raise exceptions.InvalidParameterValueException("Test split must be more than 0 and less than 1.")
+    if not (0 < split_size < 1):
+        raise exceptions.InvalidParameterValueException("Split size must be more than 0 and less than 1.")

     # Approach 1: No validation
-    if test_method == NO_VALIDATION:
+    if validation_method == NO_VALIDATION:
         model.fit(X, y)
         metrics = {}

         return model, metrics

-    # Approach 2: Simple split
-    elif test_method == SIMPLE_SPLIT:
-        X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=simple_split_size, random_state=random_state, shuffle=True
+    # Approach 2: Validation with splitting data once
+    elif validation_method == SPLIT:
+        X_train, X_valid, y_train, y_valid = train_test_split(
+            X, y, test_size=split_size, random_state=random_state, shuffle=True
         )
         model.fit(X_train, y_train)
-        y_pred = model.predict(X_test)
+        y_pred = model.predict(X_valid)

         out_metrics = {}
         for metric in metrics:
-            score = _score_model(model, y_test, y_pred, metric)
+            score = _score_model(model, y_valid, y_pred, metric)
             out_metrics[metric] = score

     # Approach 3: Cross-validation
-    elif test_method in [KFOLD_CV, SKFOLD_CV, LOO_CV]:
-        cv = _get_cross_validator(test_method, cv_folds, random_state)
+    elif validation_method in [KFOLD_CV, SKFOLD_CV, LOO_CV]:
+        cv = _get_cross_validator(validation_method, cv_folds, random_state)

         # Initialize output metrics dictionary
         out_metrics = {}
@@ -114,12 +114,12 @@ def _train_and_evaluate_sklearn_model(
             out_metrics[metric][f"{metric}_all"] = []

         # Loop over cross-validation folds and save metric scores
-        for train_index, test_index in cv.split(X, y):
+        for train_index, valid_index in cv.split(X, y):
             model.fit(X[train_index], y[train_index])
-            y_pred = model.predict(X[test_index])
+            y_pred = model.predict(X[valid_index])

             for metric in metrics:
-                score = _score_model(model, y[test_index], y_pred, metric)
+                score = _score_model(model, y[valid_index], y_pred, metric)
                 all_scores = out_metrics[metric][f"{metric}_all"]
                 all_scores.append(score)

@@ -137,7 +137,7 @@ def _train_and_evaluate_sklearn_model(
             out_metrics = out_metrics[metrics[0]]

     else:
-        raise Exception(f"Unrecognized test method: {test_method}")
+        raise Exception(f"Unrecognized validation method: {validation_method}")

     return model, out_metrics
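
As a reading aid, here is a minimal standalone sketch of what the renamed "split" branch does, written with plain scikit-learn rather than eis_toolkit internals. The toy data and the LogisticRegression estimator are illustrative, and since _score_model itself is not shown in this diff, accuracy_score stands in for it:

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    X = np.random.rand(100, 3)        # toy features, illustrative only
    y = np.random.randint(0, 2, 100)  # toy binary labels, illustrative only

    split_size = 0.2  # fraction held out for validation, as in the renamed parameter
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=split_size, random_state=42, shuffle=True
    )
    model = LogisticRegression().fit(X_train, y_train)       # any sklearn estimator works here
    y_pred = model.predict(X_valid)                          # predict on the held-out validation part
    out_metrics = {"accuracy": accuracy_score(y_valid, y_pred)}  # stand-in for _score_model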
