Commit ef247d3

Added wrappers for splitting data and predicting (tests included)
1 parent 9becd2b commit ef247d3

2 files changed: +86 -6 lines changed

eis_toolkit/prediction/model_utils.py (+58 -5)
@@ -1,10 +1,12 @@
+from numbers import Number
 from pathlib import Path

 import joblib
 import numpy as np
 import pandas as pd
 from beartype import beartype
-from beartype.typing import Literal, Optional, Sequence, Tuple, Union
+from beartype.typing import List, Literal, Optional, Sequence, Tuple, Union
+from scipy import sparse
 from sklearn.base import BaseEstimator, is_classifier, is_regressor
 from sklearn.metrics import (
     accuracy_score,
@@ -16,6 +18,7 @@
     recall_score,
 )
 from sklearn.model_selection import KFold, LeaveOneOut, StratifiedKFold, train_test_split
+from tensorflow import keras

 from eis_toolkit import exceptions

@@ -52,6 +55,58 @@ def load_model(path: Path) -> BaseEstimator:
     return joblib.load(path)


+@beartype
+def split_data(
+    *data: Union[np.ndarray, pd.DataFrame, sparse._csr.csr_matrix, List[Number]],
+    split_size: float = 0.2,
+    random_state: Optional[int] = 42,
+    shuffle: bool = True,
+) -> List[Union[np.ndarray, pd.DataFrame, sparse._csr.csr_matrix, List[Number]]]:
+    """
+    Split data into two parts.
+
+    For more guidance, read the documentation of sklearn.model_selection.train_test_split:
+    (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).
+
+    Args:
+        *data: Data to be split. Multiple datasets can be given as input (for example X and y),
+            but they need to have the same length. Each dataset is split into two parts and all
+            parts are returned (for example X_train, X_test, y_train, y_test).
+        split_size: The proportion of the second part of the split. Typically this is the size of the
+            test/validation part; the first part gets the complementary proportion. For example, if
+            split_size = 0.2, the first part will have 80% of the data and the second part 20%.
+            Defaults to 0.2.
+        random_state: Seed for random number generation. Defaults to 42.
+        shuffle: If data is shuffled before splitting. Defaults to True.
+
+    Returns:
+        List containing splits of inputs (two outputs per input).
+    """
+
+    if not (0 < split_size < 1):
+        raise exceptions.InvalidParameterValueException("Split size must be more than 0 and less than 1.")
+
+    split_data = train_test_split(*data, test_size=split_size, random_state=random_state, shuffle=shuffle)
+
+    return split_data
+
+
+@beartype
+def predict(model: Union[BaseEstimator, keras.Model], data: np.ndarray) -> np.ndarray:
+    """
+    Predict with a trained model.
+
+    Args:
+        model: Trained classifier or regressor. Can be any machine learning model trained with
+            EIS Toolkit (Sklearn and Keras models).
+        data: Data used to make predictions.
+
+    Returns:
+        Predictions.
+    """
+    result = model.predict(data)
+    return result
+
+
 @beartype
 def _train_and_validate_sklearn_model(
     X: Union[np.ndarray, pd.DataFrame],
@@ -80,8 +135,6 @@ def _train_and_validate_sklearn_model(
     )
     if cv_folds < 2:
         raise exceptions.InvalidParameterValueException("Number of cross-validation folds must be at least 2.")
-    if not (0 < split_size < 1):
-        raise exceptions.InvalidParameterValueException("Split size must be more than 0 and less than 1.")

     # Approach 1: No validation
     if validation_method == NO_VALIDATION:
@@ -92,8 +145,8 @@

     # Approach 2: Validation with splitting data once
     elif validation_method == SPLIT:
-        X_train, X_valid, y_train, y_valid = train_test_split(
-            X, y, test_size=split_size, random_state=random_state, shuffle=True
+        X_train, X_valid, y_train, y_valid = split_data(
+            X, y, split_size=split_size, random_state=random_state, shuffle=True
         )
         model.fit(X_train, y_train)
         y_pred = model.predict(X_valid)
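
A minimal usage sketch of the two new wrappers together (the RandomForestClassifier and the X / y arrays below are illustrative stand-ins, not part of this commit):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    from eis_toolkit.prediction.model_utils import predict, split_data

    # Illustrative data: 100 samples with 4 features and binary labels.
    X = np.random.rand(100, 4)
    y = np.random.randint(0, 2, size=100)

    # One call splits every positional input into two parts (80/20 here).
    X_train, X_test, y_train, y_test = split_data(X, y, split_size=0.2, random_state=42)

    model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

    # predict() forwards to model.predict, so Sklearn and Keras models share one entry point.
    labels = predict(model, X_test)
    assert len(labels) == len(y_test)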

tests/prediction/model_utils_test.py (+28 -1)
@@ -7,7 +7,13 @@
 from sklearn.ensemble import RandomForestClassifier

 from eis_toolkit import exceptions
-from eis_toolkit.prediction.model_utils import _train_and_validate_sklearn_model, load_model, save_model
+from eis_toolkit.prediction.model_utils import (
+    _train_and_validate_sklearn_model,
+    load_model,
+    predict,
+    save_model,
+    split_data,
+)

 TEST_DIR = Path(__file__).parent.parent

@@ -94,6 +100,27 @@ def test_binary_classification():
     assert len(out_metrics) == 4


+def test_splitting():
+    """Test that split data works as expected."""
+    X_train, X_test, y_train, y_test = split_data(X_IRIS, Y_IRIS, split_size=0.2)
+    np.testing.assert_equal(len(X_train), len(X_IRIS) * 0.8)
+    np.testing.assert_equal(len(y_train), len(Y_IRIS) * 0.8)
+    np.testing.assert_equal(len(X_test), len(X_IRIS) * 0.2)
+    np.testing.assert_equal(len(y_test), len(Y_IRIS) * 0.2)
+
+
+def test_predict_sklearn():
+    """Test that predict works as expected with a Sklearn model."""
+    X_train, X_test, y_train, y_test = split_data(X_IRIS, Y_IRIS, split_size=0.2)
+
+    model, _ = _train_and_validate_sklearn_model(
+        X_train, y_train, model=RF_MODEL, validation_method="none", metrics=CLF_METRICS, random_state=42
+    )
+
+    predicted_labels = predict(model, X_test)
+    assert len(predicted_labels) == len(y_test)
+
+
 def test_save_and_load_model():
     """Test that saving and loading a model works as expected."""
     model_save_path = TEST_DIR.joinpath("data/local/results/saved_rf_model.joblib")
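
The predict wrapper's type hint also admits keras.Model, though only the Sklearn path is covered by the tests above. A hedged sketch of how a Keras model could go through the same wrapper (the network architecture and random data below are illustrative assumptions, not toolkit code):

    import numpy as np
    from tensorflow import keras

    from eis_toolkit.prediction.model_utils import predict, split_data

    # Illustrative data: 100 samples with 4 features and binary labels.
    X = np.random.rand(100, 4)
    y = np.random.randint(0, 2, size=100)

    X_train, X_test, y_train, y_test = split_data(X, y, split_size=0.2)

    # Minimal binary classifier; the architecture is arbitrary for this sketch.
    model = keras.Sequential(
        [
            keras.Input(shape=(4,)),
            keras.layers.Dense(8, activation="relu"),
            keras.layers.Dense(1, activation="sigmoid"),
        ]
    )
    model.compile(optimizer="adam", loss="binary_crossentropy")
    model.fit(X_train, y_train, epochs=5, verbose=0)

    # For a Keras model, predict() returns raw sigmoid outputs rather than class labels.
    probabilities = predict(model, X_test)
    assert len(probabilities) == len(y_test)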
