Skip to content

Commit a9d3c6e

Browse files
committed
Change default random seed to None, add shuffle to parameters
1 parent 8c75aa3 commit a9d3c6e

File tree

5 files changed

+22
-23
lines changed

5 files changed

+22
-23
lines changed

eis_toolkit/prediction/gradient_boosting.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def gradient_boosting_classifier_train(
2424
max_depth: Optional[int] = 3,
2525
subsample: Number = 1.0,
2626
verbose: int = 0,
27-
random_state: Optional[int] = 42,
27+
random_state: Optional[int] = None,
2828
**kwargs,
2929
) -> Tuple[GradientBoostingClassifier, dict]:
3030
"""
@@ -61,7 +61,7 @@ def gradient_boosting_classifier_train(
6161
Values must be in the range 0.0 < x <= 1.0. Defaults to 1.0.
6262
verbose: Specifies if modeling progress and performance should be printed. 0 doesn't print,
6363
1 prints once in a while depending on the number of trees, 2 or above will print for every tree.
64-
random_state: Seed for random number generation. Defaults to 42.
64+
random_state: Seed for random number generation. Defaults to None.
6565
**kwargs: Additional parameters for Sklearn's GradientBoostingClassifier.
6666
6767
Returns:
@@ -120,7 +120,7 @@ def gradient_boosting_regressor_train(
120120
max_depth: Optional[int] = 3,
121121
subsample: Number = 1.0,
122122
verbose: int = 0,
123-
random_state: Optional[int] = 42,
123+
random_state: Optional[int] = None,
124124
**kwargs,
125125
) -> Tuple[GradientBoostingRegressor, dict]:
126126
"""
@@ -157,7 +157,7 @@ def gradient_boosting_regressor_train(
157157
Values must be in the range 0.0 < x <= 1.0. Defaults to 1.0.
158158
verbose: Specifies if modeling progress and performance should be printed. 0 doesn't print,
159159
1 prints once in a while depending on the number of trees, 2 or above will print for every tree.
160-
random_state: Seed for random number generation. Defaults to 42.
160+
random_state: Seed for random number generation. Defaults to None.
161161
**kwargs: Additional parameters for Sklearn's GradientBoostingRegressor.
162162
163163
Returns:

eis_toolkit/prediction/logistic_regression.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def logistic_regression_train(
2020
max_iter: int = 100,
2121
solver: Literal["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"] = "lbfgs",
2222
verbose: int = 0,
23-
random_state: Optional[int] = 42,
23+
random_state: Optional[int] = None,
2424
**kwargs
2525
) -> Tuple[LogisticRegression, dict]:
2626
"""
@@ -57,7 +57,7 @@ def logistic_regression_train(
5757
solver: Algorithm to use in the optimization problem. Defaults to 'lbfgs'.
5858
verbose: Specifies if modeling progress and performance should be printed. 0 doesn't print,
5959
values 1 or above will produce prints.
60-
random_state: Seed for random number generation. Defaults to 42.
60+
random_state: Seed for random number generation. Defaults to None.
6161
**kwargs: Additional parameters for Sklearn's LogisticRegression.
6262
6363
Returns:

eis_toolkit/prediction/model_utils.py

+9-10
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def load_model(path: Path) -> BaseEstimator:
5959
def split_data(
6060
*data: Union[np.ndarray, pd.DataFrame, sparse._csr.csr_matrix, List[Number]],
6161
split_size: float = 0.2,
62-
random_state: Optional[int] = 42,
62+
random_state: Optional[int] = None,
6363
shuffle: bool = True,
6464
) -> List[Union[np.ndarray, pd.DataFrame, sparse._csr.csr_matrix, List[Number]]]:
6565
"""
@@ -75,7 +75,7 @@ def split_data(
7575
split_size: The proportion of the second part of the split. Typically this is the size of test/validation
7676
part. The first part will be the complementary proportion. For example, if split_size = 0.2, the first part
7777
will have 80% of the data and the second part 20% of the data. Defaults to 0.2.
78-
random_state: Seed for random number generation. Defaults to 42.
78+
random_state: Seed for random number generation. Defaults to None.
7979
shuffle: Whether to shuffle the data before splitting. Defaults to True.
8080
8181
Returns:
@@ -100,8 +100,6 @@ def test_model(
100100
"""
101101
Test and score a trained model.
102102
103-
TODO: Implement for Keras models.
104-
105103
Args:
106104
X_test: Test data.
107105
y_test: Target labels for test data.
@@ -157,7 +155,8 @@ def _train_and_validate_sklearn_model(
157155
metrics: Sequence[Literal["mse", "rmse", "mae", "r2", "accuracy", "precision", "recall", "f1"]],
158156
split_size: float = 0.2,
159157
cv_folds: int = 5,
160-
random_state: Optional[int] = 42,
158+
shuffle: bool = True,
159+
random_state: Optional[int] = None,
161160
) -> Tuple[BaseEstimator, dict]:
162161
"""
163162
Train and validate Sklearn model.
@@ -187,7 +186,7 @@ def _train_and_validate_sklearn_model(
187186
# Approach 2: Validation with splitting data once
188187
elif validation_method == SPLIT:
189188
X_train, X_valid, y_train, y_valid = split_data(
190-
X, y, split_size=split_size, random_state=random_state, shuffle=True
189+
X, y, split_size=split_size, random_state=random_state, shuffle=shuffle
191190
)
192191
model.fit(X_train, y_train)
193192
y_pred = model.predict(X_valid)
@@ -199,7 +198,7 @@ def _train_and_validate_sklearn_model(
199198

200199
# Approach 3: Cross-validation
201200
elif validation_method in [KFOLD_CV, SKFOLD_CV, LOO_CV]:
202-
cv = _get_cross_validator(validation_method, cv_folds, random_state)
201+
cv = _get_cross_validator(validation_method, cv_folds, shuffle, random_state)
203202

204203
# Initialize output metrics dictionary
205204
out_metrics = {}
@@ -284,13 +283,13 @@ def _score_model(
284283

285284
@beartype
286285
def _get_cross_validator(
287-
cv: str, folds: int, random_state: Optional[int]
286+
cv: str, folds: int, shuffle: bool, random_state: Optional[int]
288287
) -> Union[KFold, StratifiedKFold, LeaveOneOut]:
289288
"""Create and return a Sklearn cross-validator based on given parameter values."""
290289
if cv == KFOLD_CV:
291-
cross_validator = KFold(n_splits=folds, shuffle=True, random_state=random_state)
290+
cross_validator = KFold(n_splits=folds, shuffle=shuffle, random_state=random_state)
292291
elif cv == SKFOLD_CV:
293-
cross_validator = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)
292+
cross_validator = StratifiedKFold(n_splits=folds, shuffle=shuffle, random_state=random_state)
294293
elif cv == LOO_CV:
295294
cross_validator = LeaveOneOut()
296295
else:

eis_toolkit/prediction/random_forests.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def random_forest_classifier_train(
1919
n_estimators: int = 100,
2020
max_depth: Optional[int] = None,
2121
verbose: int = 0,
22-
random_state: Optional[int] = 42,
22+
random_state: Optional[int] = None,
2323
**kwargs,
2424
) -> Tuple[RandomForestClassifier, dict]:
2525
"""
@@ -49,7 +49,7 @@ def random_forest_classifier_train(
4949
Defaults to None.
5050
verbose: Specifies if modeling progress and performance should be printed. 0 doesn't print,
5151
values 1 or above will produce prints.
52-
random_state: Seed for random number generation. Defaults to 42.
52+
random_state: Seed for random number generation. Defaults to None.
5353
**kwargs: Additional parameters for Sklearn's RandomForestClassifier.
5454
5555
Returns:
@@ -94,7 +94,7 @@ def random_forest_regressor_train(
9494
n_estimators: int = 100,
9595
max_depth: Optional[int] = None,
9696
verbose: int = 0,
97-
random_state: Optional[int] = 42,
97+
random_state: Optional[int] = None,
9898
**kwargs,
9999
) -> Tuple[RandomForestRegressor, dict]:
100100
"""
@@ -124,7 +124,7 @@ def random_forest_regressor_train(
124124
Defaults to None.
125125
verbose: Specifies if modeling progress and performance should be printed. 0 doesn't print,
126126
values 1 or above will produce prints.
127-
random_state: Seed for random number generation. Defaults to 42.
127+
random_state: Seed for random number generation. Defaults to None.
128128
**kwargs: Additional parameters for Sklearn's RandomForestRegressor.
129129
130130
Returns:

tests/prediction/model_utils_test.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def test_binary_classification():
103103

104104
def test_splitting():
105105
"""Test that split data works as expected."""
106-
X_train, X_test, y_train, y_test = split_data(X_IRIS, Y_IRIS, split_size=0.2)
106+
X_train, X_test, y_train, y_test = split_data(X_IRIS, Y_IRIS, split_size=0.2, random_state=42)
107107
np.testing.assert_equal(len(X_train), len(X_IRIS) * 0.8)
108108
np.testing.assert_equal(len(y_train), len(Y_IRIS) * 0.8)
109109
np.testing.assert_equal(len(X_test), len(X_IRIS) * 0.2)
@@ -112,7 +112,7 @@ def test_splitting():
112112

113113
def test_test_model_sklearn():
114114
"""Test that test model works as expected with a Sklearn model."""
115-
X_train, X_test, y_train, y_test = split_data(X_IRIS, Y_IRIS, split_size=0.2)
115+
X_train, X_test, y_train, y_test = split_data(X_IRIS, Y_IRIS, split_size=0.2, random_state=42)
116116

117117
model, _ = _train_and_validate_sklearn_model(
118118
X_train, y_train, model=RF_MODEL, validation_method="none", metrics=CLF_METRICS, random_state=42
@@ -124,7 +124,7 @@ def test_test_model_sklearn():
124124

125125
def test_predict_sklearn():
126126
"""Test that predict works as expected with a Sklearn model."""
127-
X_train, X_test, y_train, y_test = split_data(X_IRIS, Y_IRIS, split_size=0.2)
127+
X_train, X_test, y_train, y_test = split_data(X_IRIS, Y_IRIS, split_size=0.2, random_state=42)
128128

129129
model, _ = _train_and_validate_sklearn_model(
130130
X_train, y_train, model=RF_MODEL, validation_method="none", metrics=CLF_METRICS, random_state=42

0 commit comments

Comments
 (0)