
Commit 9becd2b

Clarified naming in modeling

- Renamed test -> validation where validation was meant
- Renamed simple_split -> split
- Renamed simple_split_size -> split_size
1 parent c7512df commit 9becd2b

5 files changed (+123 additions, -115 deletions)
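
For callers, the commit is a pure rename: every `test_method` keyword becomes `validation_method`, the `"simple_split"` option becomes `"split"`, and `simple_split_size` becomes `split_size`. A minimal before/after sketch (the toy data and the 0.25 value are illustrative, not from the commit; the import path is assumed from the file paths below):

    import numpy as np
    from eis_toolkit.prediction.gradient_boosting import gradient_boosting_classifier_train

    X = np.random.rand(100, 3)        # toy features, illustrative only
    y = np.random.randint(0, 2, 100)  # toy binary labels, illustrative only

    # Old call (before this commit):
    #   gradient_boosting_classifier_train(X, y, test_method="simple_split", simple_split_size=0.25)
    # New call (after this commit), same behaviour:
    model, metrics = gradient_boosting_classifier_train(
        X, y,
        validation_method="split",  # was: test_method="simple_split"
        split_size=0.25,            # was: simple_split_size=0.25
    )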

eis_toolkit/prediction/gradient_boosting.py

Lines changed: 29 additions & 29 deletions

@@ -7,16 +7,16 @@
 from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

 from eis_toolkit import exceptions
-from eis_toolkit.prediction.model_utils import _train_and_evaluate_sklearn_model
+from eis_toolkit.prediction.model_utils import _train_and_validate_sklearn_model


 @beartype
 def gradient_boosting_classifier_train(
     X: Union[np.ndarray, pd.DataFrame],
     y: Union[np.ndarray, pd.Series],
-    test_method: Literal["simple_split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "simple_split",
+    validation_method: Literal["split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "split",
     metrics: Sequence[Literal["accuracy", "precision", "recall", "f1", "auc"]] = ["accuracy"],
-    simple_split_size: float = 0.2,
+    split_size: float = 0.2,
     cv_folds: int = 5,
     loss: Literal["log_loss", "exponential"] = "log_loss",
     learning_rate: Number = 0.1,
@@ -31,22 +31,22 @@ def gradient_boosting_classifier_train(
     Train and optionally validate a Gradient Boosting classifier model using Sklearn.

     Various options and configurations for model performance evaluation are available. No validation,
-    simple train-test and cross-validation can be chosen. If validation is performed, metric(s) to
-    calculate can be defined and validation process configured (cross-validation method, number of folds,
-    size of the simple train-test split). Depending on the details of the validation process, the output
-    metrics dictionary can be empty, one-dimensional or nested.
+    split to train and validation parts, and cross-validation can be chosen. If validation is performed,
+    metric(s) to calculate can be defined and validation process configured (cross-validation method,
+    number of folds, size of the split). Depending on the details of the validation process,
+    the output metrics dictionary can be empty, one-dimensional or nested.

     Args:
         X: Training data.
         y: Target labels.
-        test_method: Test / validation method to use. "simple_split" divides data into two parts, "kfold_cv"
+        validation_method: Validation method to use. "split" divides data into two parts, "kfold_cv"
             performs k-fold cross-validation, "skfold_cv" performs stratified k-fold cross-validation,
-            "loo_cv" performs leave-one-out cross-validation and "none" will not test / validate model at all
+            "loo_cv" performs leave-one-out cross-validation and "none" will not validate model at all
            (in this case, all X and y will be used solely for training).
         metrics: Metrics to use for scoring the model. Defaults to "accuracy".
-        simple_split_size: Fraction of the dataset to be used as test data (rest is used for training).
-            Used only when test_method is "simple_split". Defaults to 0.2.
-        cv_folds: Number of folds used in cross-validation. Used only when test_method is "kfold_cv"
+        split_size: Fraction of the dataset to be used as validation data (rest is used for training).
+            Used only when validation_method is "split". Defaults to 0.2.
+        cv_folds: Number of folds used in cross-validation. Used only when validation_method is "kfold_cv"
             or "skfold_cv". Defaults to 5.
         loss: The loss function to be optimized. Defaults to "log_loss" (same as in logistic regression).
         learning_rate: Shrinks the contribution of each tree. Values must be >= 0. Defaults to 0.1.
@@ -92,13 +92,13 @@ def gradient_boosting_classifier_train(
         **kwargs,
     )

-    model, metrics = _train_and_evaluate_sklearn_model(
+    model, metrics = _train_and_validate_sklearn_model(
         X=X,
         y=y,
         model=model,
-        test_method=test_method,
+        validation_method=validation_method,
         metrics=metrics,
-        simple_split_size=simple_split_size,
+        split_size=split_size,
         cv_folds=cv_folds,
         random_state=random_state,
     )
@@ -110,9 +110,9 @@ def gradient_boosting_classifier_train(
 def gradient_boosting_regressor_train(
     X: Union[np.ndarray, pd.DataFrame],
     y: Union[np.ndarray, pd.Series],
-    test_method: Literal["simple_split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "simple_split",
+    validation_method: Literal["split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "split",
     metrics: Sequence[Literal["mse", "rmse", "mae", "r2"]] = ["mse"],
-    simple_split_size: float = 0.2,
+    split_size: float = 0.2,
     cv_folds: int = 5,
     loss: Literal["squared_error", "absolute_error", "huber", "quantile"] = "squared_error",
     learning_rate: Number = 0.1,
@@ -127,22 +127,22 @@ def gradient_boosting_regressor_train(
     Train and optionally validate a Gradient Boosting regressor model using Sklearn.

     Various options and configurations for model performance evaluation are available. No validation,
-    simple train-test and cross-validation can be chosen. If validation is performed, metric(s) to
-    calculate can be defined and validation process configured (cross-validation method, number of folds,
-    size of the simple train-test split). Depending on the details of the validation process, the output
-    metrics dictionary can be empty, one-dimensional or nested.
+    split to train and validation parts, and cross-validation can be chosen. If validation is performed,
+    metric(s) to calculate can be defined and validation process configured (cross-validation method,
+    number of folds, size of the split). Depending on the details of the validation process,
+    the output metrics dictionary can be empty, one-dimensional or nested.

     Args:
         X: Training data.
         y: Target labels.
-        test_method: Test / validation method to use. "simple_split" divides data into two parts, "kfold_cv"
+        validation_method: Validation method to use. "split" divides data into two parts, "kfold_cv"
             performs k-fold cross-validation, "skfold_cv" performs stratified k-fold cross-validation,
-            "loo_cv" performs leave-one-out cross-validation and "none" will not test / validate model at all
+            "loo_cv" performs leave-one-out cross-validation and "none" will not validate model at all
            (in this case, all X and y will be used solely for training).
         metrics: Metrics to use for scoring the model. Defaults to "mse".
-        simple_split_size: Fraction of the dataset to be used as test data (rest is used for training).
-            Used only when test_method is "simple_split". Defaults to 0.2.
-        cv_folds: Number of folds used in cross-validation. Used only when test_method is "kfold_cv"
+        split_size: Fraction of the dataset to be used as validation data (rest is used for training).
+            Used only when validation_method is "split". Defaults to 0.2.
+        cv_folds: Number of folds used in cross-validation. Used only when validation_method is "kfold_cv"
             or "skfold_cv". Defaults to 5.
         loss: The loss function to be optimized. Defaults to "squared_error".
         learning_rate: Shrinks the contribution of each tree. Values must be > 0. Defaults to 0.1.
@@ -188,13 +188,13 @@ def gradient_boosting_regressor_train(
         **kwargs,
     )

-    model, metrics = _train_and_evaluate_sklearn_model(
+    model, metrics = _train_and_validate_sklearn_model(
         X=X,
         y=y,
         model=model,
-        test_method=test_method,
+        validation_method=validation_method,
         metrics=metrics,
-        simple_split_size=simple_split_size,
+        split_size=split_size,
         cv_folds=cv_folds,
         random_state=random_state,
     )
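
Cross-validation is selected the same way as before, only through the renamed keyword. A hedged usage sketch with stratified k-fold validation and two metrics (toy data and values are illustrative, not from the commit); per the docstring above, with several metrics and cross-validation the returned metrics dictionary is nested:

    import numpy as np
    from eis_toolkit.prediction.gradient_boosting import gradient_boosting_classifier_train

    X = np.random.rand(200, 4)        # toy features, illustrative only
    y = np.random.randint(0, 2, 200)  # toy binary labels, illustrative only

    model, metrics = gradient_boosting_classifier_train(
        X, y,
        validation_method="skfold_cv",  # was: test_method="skfold_cv"
        metrics=["accuracy", "f1"],
        cv_folds=5,
    )
    # Nested dict: per-fold scores are collected under keys such as "accuracy_all"
    # (see the model_utils.py diff further down).
    print(metrics)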

eis_toolkit/prediction/logistic_regression.py

Lines changed: 15 additions & 15 deletions

@@ -5,16 +5,16 @@
 from sklearn.linear_model import LogisticRegression

 from eis_toolkit import exceptions
-from eis_toolkit.prediction.model_utils import _train_and_evaluate_sklearn_model
+from eis_toolkit.prediction.model_utils import _train_and_validate_sklearn_model


 @beartype
 def logistic_regression_train(
     X: Union[np.ndarray, pd.DataFrame],
     y: Union[np.ndarray, pd.Series],
-    test_method: Literal["simple_split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "simple_split",
+    validation_method: Literal["split", "kfold_cv", "skfold_cv", "loo_cv", "none"] = "split",
     metrics: Sequence[Literal["accuracy", "precision", "recall", "f1", "auc"]] = ["accuracy"],
-    simple_split_size: float = 0.2,
+    split_size: float = 0.2,
     cv_folds: int = 5,
     penalty: Literal["l1", "l2", "elasicnet", None] = "l2",
     max_iter: int = 100,
@@ -27,10 +27,10 @@ def logistic_regression_train(
     Train and optionally validate a Logistic Regression classifier model using Sklearn.

     Various options and configurations for model performance evaluation are available. No validation,
-    simple train-test and cross-validation can be chosen. If validation is performed, metric(s) to
-    calculate can be defined and validation process configured (cross-validation method, number of folds,
-    size of the simple train-test split). Depending on the details of the validation process, the output
-    metrics dictionary can be empty, one-dimensional or nested.
+    split to train and validation parts, and cross-validation can be chosen. If validation is performed,
+    metric(s) to calculate can be defined and validation process configured (cross-validation method,
+    number of folds, size of the split). Depending on the details of the validation process,
+    the output metrics dictionary can be empty, one-dimensional or nested.

     The choice of the algorithm depends on the penalty chosen. Supported penalties by solver:
     'lbfgs' - ['l2', None]
@@ -43,14 +43,14 @@ def logistic_regression_train(
     Args:
         X: Training data.
         y: Target labels.
-        test_method: Test / validation method to use. "simple_split" divides data into two parts, "kfold_cv"
+        validation_method: Validation method to use. "split" divides data into two parts, "kfold_cv"
             performs k-fold cross-validation, "skfold_cv" performs stratified k-fold cross-validation,
-            "loo_cv" performs leave-one-out cross-validation and "none" will not test / validate model at all
+            "loo_cv" performs leave-one-out cross-validation and "none" will not validate model at all
            (in this case, all X and y will be used solely for training).
         metrics: Metrics to use for scoring the model. Defaults to "accuracy".
-        simple_split_size: Fraction of the dataset to be used as test data (rest is used for training).
-            Used only when test_method is "simple_split". Defaults to 0.2.
-        cv_folds: Number of folds used in cross-validation. Used only when test_method is "kfold_cv"
+        split_size: Fraction of the dataset to be used as validation data (rest is used for training).
+            Used only when validation_method is "split". Defaults to 0.2.
+        cv_folds: Number of folds used in cross-validation. Used only when validation_method is "kfold_cv"
             or "skfold_cv". Defaults to 5.
         penalty: Specifies the norm of the penalty. Defaults to 'l2'.
         max_iter: Maximum number of iterations taken for the solvers to converge. Defaults to 100.
@@ -75,13 +75,13 @@ def logistic_regression_train(
         penalty=penalty, max_iter=max_iter, random_state=random_state, solver=solver, verbose=verbose, **kwargs
     )

-    model, metrics = _train_and_evaluate_sklearn_model(
+    model, metrics = _train_and_validate_sklearn_model(
         X=X,
         y=y,
         model=model,
-        test_method=test_method,
+        validation_method=validation_method,
         metrics=metrics,
-        simple_split_size=simple_split_size,
+        split_size=split_size,
         cv_folds=cv_folds,
         random_state=random_state,
     )
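
The rename applies identically here. A hedged sketch of the "no validation" path, where all of X and y are used for training and the returned metrics dictionary is empty (toy data, illustrative only; import path assumed from the file path above):

    import numpy as np
    from eis_toolkit.prediction.logistic_regression import logistic_regression_train

    X = np.random.rand(150, 3)        # toy features, illustrative only
    y = np.random.randint(0, 2, 150)  # toy binary labels, illustrative only

    model, metrics = logistic_regression_train(
        X, y,
        validation_method="none",  # was: test_method="none"; trains on all data, no validation
    )
    assert metrics == {}  # no validation performed, so no scores are returned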

eis_toolkit/prediction/model_utils.py

Lines changed: 21 additions & 21 deletions

@@ -19,7 +19,7 @@

 from eis_toolkit import exceptions

-SIMPLE_SPLIT = "simple_split"
+SPLIT = "split"
 KFOLD_CV = "kfold_cv"
 SKFOLD_CV = "skfold_cv"
 LOO_CV = "loo_cv"
@@ -53,18 +53,18 @@ def load_model(path: Path) -> BaseEstimator:


 @beartype
-def _train_and_evaluate_sklearn_model(
+def _train_and_validate_sklearn_model(
     X: Union[np.ndarray, pd.DataFrame],
     y: Union[np.ndarray, pd.Series],
     model: BaseEstimator,
-    test_method: Literal["simple_split", "kfold_cv", "skfold_cv", "loo_cv", "none"],
+    validation_method: Literal["split", "kfold_cv", "skfold_cv", "loo_cv", "none"],
     metrics: Sequence[Literal["mse", "rmse", "mae", "r2", "accuracy", "precision", "recall", "f1"]],
-    simple_split_size: float = 0.2,
+    split_size: float = 0.2,
     cv_folds: int = 5,
     random_state: Optional[int] = 42,
 ) -> Tuple[BaseEstimator, dict]:
     """
-    Train and evaluate Sklearn model.
+    Train and validate Sklearn model.

     Serves as a common private/inner function for Random Forest, Logistic Regression and Gradient Boosting
     public functions.
@@ -74,38 +74,38 @@ def _train_and_evaluate_sklearn_model(
     x_size = X.index.size if isinstance(X, pd.DataFrame) else X.shape[0]
     if x_size != y.size:
         raise exceptions.NonMatchingParameterLengthsException(f"X and y must have the length {x_size} != {y.size}.")
-    if len(metrics) == 0 and test_method != NO_VALIDATION:
+    if len(metrics) == 0 and validation_method != NO_VALIDATION:
         raise exceptions.InvalidParameterValueException(
             "Metrics must have at least one chosen metric to validate model."
         )
     if cv_folds < 2:
         raise exceptions.InvalidParameterValueException("Number of cross-validation folds must be at least 2.")
-    if not (0 < simple_split_size < 1):
-        raise exceptions.InvalidParameterValueException("Test split must be more than 0 and less than 1.")
+    if not (0 < split_size < 1):
+        raise exceptions.InvalidParameterValueException("Split size must be more than 0 and less than 1.")

     # Approach 1: No validation
-    if test_method == NO_VALIDATION:
+    if validation_method == NO_VALIDATION:
         model.fit(X, y)
         metrics = {}

         return model, metrics

-    # Approach 2: Simple split
-    elif test_method == SIMPLE_SPLIT:
-        X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=simple_split_size, random_state=random_state, shuffle=True
+    # Approach 2: Validation with splitting data once
+    elif validation_method == SPLIT:
+        X_train, X_valid, y_train, y_valid = train_test_split(
+            X, y, test_size=split_size, random_state=random_state, shuffle=True
         )
         model.fit(X_train, y_train)
-        y_pred = model.predict(X_test)
+        y_pred = model.predict(X_valid)

         out_metrics = {}
         for metric in metrics:
-            score = _score_model(model, y_test, y_pred, metric)
+            score = _score_model(model, y_valid, y_pred, metric)
             out_metrics[metric] = score

     # Approach 3: Cross-validation
-    elif test_method in [KFOLD_CV, SKFOLD_CV, LOO_CV]:
-        cv = _get_cross_validator(test_method, cv_folds, random_state)
+    elif validation_method in [KFOLD_CV, SKFOLD_CV, LOO_CV]:
+        cv = _get_cross_validator(validation_method, cv_folds, random_state)

         # Initialize output metrics dictionary
         out_metrics = {}
@@ -114,12 +114,12 @@ def _train_and_evaluate_sklearn_model(
             out_metrics[metric][f"{metric}_all"] = []

         # Loop over cross-validation folds and save metric scores
-        for train_index, test_index in cv.split(X, y):
+        for train_index, valid_index in cv.split(X, y):
             model.fit(X[train_index], y[train_index])
-            y_pred = model.predict(X[test_index])
+            y_pred = model.predict(X[valid_index])

             for metric in metrics:
-                score = _score_model(model, y[test_index], y_pred, metric)
+                score = _score_model(model, y[valid_index], y_pred, metric)
                 all_scores = out_metrics[metric][f"{metric}_all"]
                 all_scores.append(score)

@@ -137,7 +137,7 @@ def _train_and_evaluate_sklearn_model(
             out_metrics = out_metrics[metrics[0]]

     else:
-        raise Exception(f"Unrecognized test method: {test_method}")
+        raise Exception(f"Unrecognized validation method: {validation_method}")

     return model, out_metrics
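
As a reading aid, here is a minimal standalone sketch of what the renamed "split" branch does, written with plain scikit-learn rather than eis_toolkit internals. The toy data and the LogisticRegression estimator are illustrative, and since _score_model itself is not shown in this diff, accuracy_score stands in for it:

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    X = np.random.rand(100, 3)        # toy features, illustrative only
    y = np.random.randint(0, 2, 100)  # toy binary labels, illustrative only

    split_size = 0.2  # fraction held out for validation, as in the renamed parameter
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=split_size, random_state=42, shuffle=True
    )
    model = LogisticRegression().fit(X_train, y_train)       # any sklearn estimator works here
    y_pred = model.predict(X_valid)                          # predict on the held-out validation part
    out_metrics = {"accuracy": accuracy_score(y_valid, y_pred)}  # stand-in for _score_model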
