1
+ from numbers import Number
1
2
from pathlib import Path
2
3
3
4
import joblib
4
5
import numpy as np
5
6
import pandas as pd
6
7
from beartype import beartype
7
- from beartype .typing import Literal , Optional , Sequence , Tuple , Union
8
+ from beartype .typing import List , Literal , Optional , Sequence , Tuple , Union
9
+ from scipy import sparse
8
10
from sklearn .base import BaseEstimator , is_classifier , is_regressor
9
11
from sklearn .metrics import (
10
12
accuracy_score ,
16
18
recall_score ,
17
19
)
18
20
from sklearn .model_selection import KFold , LeaveOneOut , StratifiedKFold , train_test_split
21
+ from tensorflow import keras
19
22
20
23
from eis_toolkit import exceptions
21
24
@@ -52,6 +55,58 @@ def load_model(path: Path) -> BaseEstimator:
52
55
return joblib .load (path )
53
56
54
57
58
@beartype
def split_data(
    *data: Union[np.ndarray, pd.DataFrame, sparse.csr_matrix, List[Number]],
    split_size: float = 0.2,
    random_state: Optional[int] = 42,
    shuffle: bool = True,
) -> List[Union[np.ndarray, pd.DataFrame, sparse.csr_matrix, List[Number]]]:
    """
    Split data into two parts.

    For more guidance, read documentation of sklearn.model_selection.train_test_split:
    (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).

    Args:
        *data: Data to be split. Multiple datasets can be given as input (for example X and y),
            but they need to have the same length. All datasets are split into two and the parts returned
            (for example X_train, X_test, y_train, y_test).
        split_size: The proportion of the second part of the split. Typically this is the size of test/validation
            part. The first part will be the complementary proportion. For example, if split_size = 0.2, the
            first part will have 80% of the data and the second part 20% of the data. Defaults to 0.2.
        random_state: Seed for random number generation. Defaults to 42.
        shuffle: If data is shuffled before splitting. Defaults to True.

    Returns:
        List containing splits of inputs (two outputs per input).

    Raises:
        InvalidParameterValueException: Split size is not strictly between 0 and 1.
    """
    # The annotations use the public `sparse.csr_matrix` alias rather than the private
    # `sparse._csr` module path, which is an implementation detail of SciPy.

    if not (0 < split_size < 1):
        raise exceptions.InvalidParameterValueException("Split size must be more than 0 and less than 1.")

    # Named `splits` (not `split_data`) so the local does not shadow this function's own name.
    splits = train_test_split(*data, test_size=split_size, random_state=random_state, shuffle=shuffle)

    return splits
91
+
92
+
93
@beartype
def predict(model: Union[BaseEstimator, keras.Model], data: np.ndarray) -> np.ndarray:
    """
    Run a trained model on the given data and return its predictions.

    Args:
        model: Trained classifier or regressor. Can be any machine learning model trained with
            EIS Toolkit (Sklearn and Keras models).
        data: Input data passed unchanged to the model's own `predict` method.

    Returns:
        The output of `model.predict(data)`.
    """
    # Thin wrapper: all of the work is delegated to the model's `predict` method.
    return model.predict(data)
108
+
109
+
55
110
@beartype
56
111
def _train_and_validate_sklearn_model (
57
112
X : Union [np .ndarray , pd .DataFrame ],
@@ -80,8 +135,6 @@ def _train_and_validate_sklearn_model(
80
135
)
81
136
if cv_folds < 2 :
82
137
raise exceptions .InvalidParameterValueException ("Number of cross-validation folds must be at least 2." )
83
- if not (0 < split_size < 1 ):
84
- raise exceptions .InvalidParameterValueException ("Split size must be more than 0 and less than 1." )
85
138
86
139
# Approach 1: No validation
87
140
if validation_method == NO_VALIDATION :
@@ -92,8 +145,8 @@ def _train_and_validate_sklearn_model(
92
145
93
146
# Approach 2: Validation with splitting data once
94
147
elif validation_method == SPLIT :
95
- X_train , X_valid , y_train , y_valid = train_test_split (
96
- X , y , test_size = split_size , random_state = random_state , shuffle = True
148
+ X_train , X_valid , y_train , y_valid = split_data (
149
+ X , y , split_size = split_size , random_state = random_state , shuffle = True
97
150
)
98
151
model .fit (X_train , y_train )
99
152
y_pred = model .predict (X_valid )
0 commit comments