Merge pull request #390 from GispoCoding/389-separate-predicting-and-testing-cli-functions-for-classifier-and-regressor-models

nmaarnio · web-flow · commit e7354cac73a1 · 2024-05-21T08:51:10.000+03:00
389 refactor ML model predicting and testing
diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py
@@ -2313,97 +2313,162 @@ def gradient_boosting_regressor_train_cli(
     typer.echo("Gradient boosting regressor training completed")
 
 
-# EVALUATE ML MODEL
+# TEST CLASSIFIER ML MODEL
 @app.command()
-def evaluate_trained_model_cli(
+def classifier_test_cli(
     input_rasters: INPUT_FILES_ARGUMENT,
     target_labels: INPUT_FILE_OPTION,
     model_file: INPUT_FILE_OPTION,
-    output_raster: OUTPUT_FILE_OPTION,
-    validation_metrics: Annotated[List[str], typer.Option()],
+    output_raster_probability: OUTPUT_FILE_OPTION,
+    output_raster_classified: OUTPUT_FILE_OPTION,
+    classification_threshold: float = 0.5,
+    test_metrics: Annotated[List[ClassifierMetrics], typer.Option(case_sensitive=False)] = [ClassifierMetrics.accuracy],
 ):
-    """Predict and evaluate a trained machine learning model by predicting and scoring."""
-    from sklearn.base import is_classifier
-
+    """Test trained machine learning classifier model by predicting and scoring."""
     from eis_toolkit.evaluation.scoring import score_predictions
     from eis_toolkit.prediction.machine_learning_general import load_model, prepare_data_for_ml, reshape_predictions
-    from eis_toolkit.prediction.machine_learning_predict import predict_classifier, predict_regressor
+    from eis_toolkit.prediction.machine_learning_predict import predict_classifier
 
     X, y, reference_profile, nodata_mask = prepare_data_for_ml(input_rasters, target_labels)
     typer.echo("Progress: 30%")
 
     model = load_model(model_file)
-    if is_classifier(model):
-        predictions, probabilities = predict_classifier(X, model, True)
-        probabilities = probabilities[:, 1]
-        probabilities = probabilities.astype(np.float32)
-        probabilities_reshaped = reshape_predictions(
-            probabilities, reference_profile["height"], reference_profile["width"], nodata_mask
+    predictions, probabilities = predict_classifier(X, model, classification_threshold, True)
+    probabilities_reshaped = reshape_predictions(
+        probabilities, reference_profile["height"], reference_profile["width"], nodata_mask
+    )
+    predictions_reshaped = reshape_predictions(
+        predictions, reference_profile["height"], reference_profile["width"], nodata_mask
+    )
+
+    metrics_dict = score_predictions(y, predictions, get_enum_values(test_metrics))
+    typer.echo("Progress: 80%")
+
+    out_profile = reference_profile.copy()
+    out_profile.update({"count": 1, "dtype": np.float32})
+
+    with rasterio.open(output_raster_probability, "w", **out_profile) as dst:
+        dst.write(probabilities_reshaped, 1)
+    with rasterio.open(output_raster_classified, "w", **out_profile) as dst:
+        dst.write(predictions_reshaped, 1)
+
+    typer.echo("\n")
+    for key, value in metrics_dict.items():
+        typer.echo(f"{key}: {value}")
+    typer.echo("\n")
+
+    typer.echo("Progress: 100%")
+    typer.echo(
+        (
+            "Testing classifier model completed, writing rasters to "
+            f"{output_raster_probability} and {output_raster_classified}."
         )
-    else:
-        predictions = predict_regressor(X, model)
+    )
 
-    metrics_dict = score_predictions(y, predictions, validation_metrics)
 
+# TEST REGRESSOR ML MODEL
+@app.command()
+def regressor_test_cli(
+    input_rasters: INPUT_FILES_ARGUMENT,
+    target_labels: INPUT_FILE_OPTION,
+    model_file: INPUT_FILE_OPTION,
+    output_raster: OUTPUT_FILE_OPTION,
+    test_metrics: Annotated[List[RegressorMetrics], typer.Option(case_sensitive=False)] = [RegressorMetrics.mse],
+):
+    """Test trained machine learning regressor model by predicting and scoring."""
+    from eis_toolkit.evaluation.scoring import score_predictions
+    from eis_toolkit.prediction.machine_learning_general import load_model, prepare_data_for_ml, reshape_predictions
+    from eis_toolkit.prediction.machine_learning_predict import predict_regressor
+
+    X, y, reference_profile, nodata_mask = prepare_data_for_ml(input_rasters, target_labels)
+    typer.echo("Progress: 30%")
+
+    model = load_model(model_file)
+    predictions = predict_regressor(X, model)
     predictions_reshaped = reshape_predictions(
         predictions, reference_profile["height"], reference_profile["width"], nodata_mask
     )
 
+    metrics_dict = score_predictions(y, predictions, get_enum_values(test_metrics))
     typer.echo("Progress: 80%")
 
-    json_str = json.dumps(metrics_dict)
-
     out_profile = reference_profile.copy()
     out_profile.update({"count": 1, "dtype": np.float32})
 
-    if is_classifier(model):
-        directory = os.path.split(output_raster)[0]
-        name = os.path.splitext(os.path.basename(output_raster))[0]
-        labels_output = os.path.join(directory, name + "_labels" + ".tif")
-        probabilities_output = os.path.join(directory, name + "_probabilities" + ".tif")
-        for output_path, output_data in zip(
-            [labels_output, probabilities_output], [predictions_reshaped, probabilities_reshaped]
-        ):
-            with rasterio.open(output_path, "w", **out_profile) as dst:
-                dst.write(output_data, 1)
-    else:
-        with rasterio.open(output_raster, "w", **out_profile) as dst:
-            dst.write(predictions_reshaped, 1)
+    with rasterio.open(output_raster, "w", **out_profile) as dst:
+        dst.write(predictions_reshaped, 1)
 
-    typer.echo("Progress: 100%")
-    typer.echo(f"Results: {json_str}")
+    typer.echo("\n")
+    for key, value in metrics_dict.items():
+        typer.echo(f"{key}: {value}")
+    typer.echo("\n")
 
-    typer.echo("Evaluating trained model completed")
+    typer.echo("Progress: 100%\n")
+
+    typer.echo(f"Testing regressor model completed, writing raster to {output_raster}.")
 
 
 # PREDICT WITH TRAINED ML MODEL
 @app.command()
-def predict_with_trained_model_cli(
+def classifier_predict_cli(
     input_rasters: INPUT_FILES_ARGUMENT,
     model_file: INPUT_FILE_OPTION,
-    output_raster: OUTPUT_FILE_OPTION,
+    output_raster_probability: OUTPUT_FILE_OPTION,
+    output_raster_classified: OUTPUT_FILE_OPTION,
+    classification_threshold: float = 0.5,
 ):
-    """Predict with a trained machine learning model."""
-    from sklearn.base import is_classifier
-
+    """Predict with a trained machine learning classifier model."""
     from eis_toolkit.prediction.machine_learning_general import load_model, prepare_data_for_ml, reshape_predictions
-    from eis_toolkit.prediction.machine_learning_predict import predict_classifier, predict_regressor
+    from eis_toolkit.prediction.machine_learning_predict import predict_classifier
 
     X, _, reference_profile, nodata_mask = prepare_data_for_ml(input_rasters)
 
     typer.echo("Progress: 30%")
 
     model = load_model(model_file)
-    if is_classifier(model):
-        predictions, probabilities = predict_classifier(X, model, True)
-        probabilities = probabilities[:, 1]
-        probabilities = probabilities.astype(np.float32)
-        probabilities_reshaped = reshape_predictions(
-            probabilities, reference_profile["height"], reference_profile["width"], nodata_mask
+    predictions, probabilities = predict_classifier(X, model, classification_threshold, True)
+    probabilities_reshaped = reshape_predictions(
+        probabilities, reference_profile["height"], reference_profile["width"], nodata_mask
+    )
+    predictions_reshaped = reshape_predictions(
+        predictions, reference_profile["height"], reference_profile["width"], nodata_mask
+    )
+    typer.echo("Progress: 80%")
+
+    out_profile = reference_profile.copy()
+    out_profile.update({"count": 1, "dtype": np.float32})
+
+    with rasterio.open(output_raster_probability, "w", **out_profile) as dst:
+        dst.write(probabilities_reshaped, 1)
+    with rasterio.open(output_raster_classified, "w", **out_profile) as dst:
+        dst.write(predictions_reshaped, 1)
+
+    typer.echo("Progress: 100%")
+    typer.echo(
+        (
+            "Predicting with classifier model completed, writing rasters to "
+            f"{output_raster_probability} and {output_raster_classified}."
         )
-    else:
-        predictions = predict_regressor(X, model)
+    )
+
 
+# PREDICT WITH TRAINED REGRESSOR ML MODEL
+@app.command()
+def regressor_predict_cli(
+    input_rasters: INPUT_FILES_ARGUMENT,
+    model_file: INPUT_FILE_OPTION,
+    output_raster: OUTPUT_FILE_OPTION,
+):
+    """Predict with a trained machine learning regressor model."""
+    from eis_toolkit.prediction.machine_learning_general import load_model, prepare_data_for_ml, reshape_predictions
+    from eis_toolkit.prediction.machine_learning_predict import predict_regressor
+
+    X, _, reference_profile, nodata_mask = prepare_data_for_ml(input_rasters)
+
+    typer.echo("Progress: 30%")
+
+    model = load_model(model_file)
+    predictions = predict_regressor(X, model)
     predictions_reshaped = reshape_predictions(
         predictions, reference_profile["height"], reference_profile["width"], nodata_mask
     )
@@ -2413,22 +2478,11 @@ def predict_with_trained_model_cli(
     out_profile = reference_profile.copy()
     out_profile.update({"count": 1, "dtype": np.float32})
 
-    if is_classifier(model):
-        directory = os.path.split(output_raster)[0]
-        name = os.path.splitext(os.path.basename(output_raster))[0]
-        labels_output = os.path.join(directory, name + "_labels" + ".tif")
-        probabilities_output = os.path.join(directory, name + "_probabilities" + ".tif")
-        for output_path, output_data in zip(
-            [labels_output, probabilities_output], [predictions_reshaped, probabilities_reshaped]
-        ):
-            with rasterio.open(output_path, "w", **out_profile) as dst:
-                dst.write(output_data, 1)
-    else:
-        with rasterio.open(output_raster, "w", **out_profile) as dst:
-            dst.write(predictions_reshaped, 1)
+    with rasterio.open(output_raster, "w", **out_profile) as dst:
+        dst.write(predictions_reshaped, 1)
 
     typer.echo("Progress: 100%")
-    typer.echo("Predicting completed")
+    typer.echo(f"Predicting with regressor model completed, writing raster to {output_raster}.")
 
 
 # FUZZY OVERLAYS
diff --git a/eis_toolkit/exceptions.py b/eis_toolkit/exceptions.py
@@ -34,6 +34,10 @@ class InvalidDataShapeException(Exception):
     """Exception error for datasets with invalid shapes."""
 
 
+class InvalidModelTypeException(Exception):
+    """Exception error for invalid model type."""
+
+
 class InvalidParameterValueException(Exception):
     """Exception error class for invalid parameter values."""
 
diff --git a/eis_toolkit/prediction/machine_learning_predict.py b/eis_toolkit/prediction/machine_learning_predict.py
@@ -2,40 +2,62 @@
 import pandas as pd
 from beartype import beartype
 from beartype.typing import Tuple, Union
-from sklearn.base import BaseEstimator
+from sklearn.base import BaseEstimator, is_classifier
 from tensorflow import keras
 
+from eis_toolkit.exceptions import InvalidModelTypeException
+
 
 @beartype
 def predict_classifier(
-    data: Union[np.ndarray, pd.DataFrame], model: Union[BaseEstimator, keras.Model], include_probabilities: bool = True
+    data: Union[np.ndarray, pd.DataFrame],
+    model: Union[BaseEstimator, keras.Model],
+    classification_threshold: float = 0.5,
+    include_probabilities: bool = True,
 ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
     """
-    Predict with a trained model.
+    Predict with a trained classifier model.
 
     Args:
         data: Data used to make predictions.
         model: Trained classifier or regressor. Can be any machine learning model trained with
             EIS Toolkit (Sklearn and Keras models).
+        classification_threshold: Threshold for classifying based on probabilities. Only used for
+            binary classification. Defaults to 0.5.
         include_probabilities: If the probability array should be returned too. Defaults to True.
 
     Returns:
-        Predicted labels and optionally predicted probabilities by a classifier model.
+        Predicted labels as a 1-D array and, optionally, probabilities (1-D for binary, 2-D for multiclass models).
+
+    Raises:
+        InvalidModelTypeException: Input model is not a classifier model.
     """
     if isinstance(model, keras.Model):
-        probabilities = model.predict(data)
-        labels = probabilities.argmax(axis=-1)
+        probabilities = model.predict(data).astype(np.float32)
+        if probabilities.shape[1] == 1:  # Binary classification
+            probabilities = probabilities.squeeze(axis=1)  # axis=1: a single-sample batch must stay 1-D
+            labels = (probabilities >= classification_threshold).astype(np.float32)
+        else:  # Multiclass classification
+            labels = probabilities.argmax(axis=-1).astype(np.float32)
         if include_probabilities:
             return labels, probabilities
         else:
             return labels
     elif isinstance(model, BaseEstimator):
-        labels = model.predict(data)
+        if not is_classifier(model):
+            raise InvalidModelTypeException(f"Expected a classifier model: {type(model)}.")
+        probabilities = model.predict_proba(data).astype(np.float32)
+        if probabilities.shape[1] == 2:  # Binary classification
+            probabilities = probabilities[:, 1]
+            labels = (probabilities >= classification_threshold).astype(np.float32)
+        else:  # Multiclass classification
+            labels = probabilities.argmax(axis=-1).astype(np.float32)
         if include_probabilities:
-            probabilities = model.predict_proba(data)
             return labels, probabilities
         else:
             return labels
+    else:
+        raise InvalidModelTypeException(f"Model type not recognized: {type(model)}.")
 
 
 @beartype
@@ -44,7 +66,7 @@ def predict_regressor(
     model: Union[BaseEstimator, keras.Model],
 ) -> np.ndarray:
     """
-    Predict with a trained model.
+    Predict with a trained regressor model.
 
     Args:
         data: Data used to make predictions.
@@ -53,6 +75,11 @@ def predict_regressor(
 
     Returns:
         Regression model prediction array.
+
+    Raises:
+        InvalidModelTypeException: Input model is not a regressor model.
     """
+    if is_classifier(model):
+        raise InvalidModelTypeException(f"Expected a regressor model: {type(model)}.")
     result = model.predict(data)
     return result
diff --git a/tests/prediction/machine_learning_general_test.py b/tests/prediction/machine_learning_general_test.py
@@ -118,7 +118,7 @@ def test_evaluate_model_sklearn():
         X_train, y_train, model=RF_MODEL, validation_method="none", metrics=CLF_METRICS, random_state=42
     )
 
-    predictions = predict_classifier(X_test, model, include_probabilities=False)
+    predictions = predict_classifier(X_test, model, classification_threshold=0.5, include_probabilities=False)
     accuracy = score_predictions(y_test, predictions, "accuracy")
     np.testing.assert_equal(accuracy, 1.0)
 
@@ -131,7 +131,7 @@ def test_predict_classifier_sklearn():
         X_train, y_train, model=RF_MODEL, validation_method="none", metrics=CLF_METRICS, random_state=42
     )
 
-    predicted_labels, predicted_probabilities = predict_classifier(X_test, model, True)
+    predicted_labels, predicted_probabilities = predict_classifier(X_test, model, include_probabilities=True)
     np.testing.assert_equal(len(predicted_labels), len(y_test))
     np.testing.assert_equal(len(predicted_probabilities), len(y_test))
 

Original file line number	Diff line number	Diff line change
`@@ -118,7 +118,7 @@ def test_evaluate_model_sklearn():`
`118`	`118`	`X_train, y_train, model=RF_MODEL, validation_method="none", metrics=CLF_METRICS, random_state=42`
`119`	`119`	`)`
`120`	`120`
`121`		`- predictions = predict_classifier(X_test, model, include_probabilities=False)`
	`121`	`+ predictions = predict_classifier(X_test, model, classification_threshold=0.5, include_probabilities=False)`
`122`	`122`	`accuracy = score_predictions(y_test, predictions, "accuracy")`
`123`	`123`	`np.testing.assert_equal(accuracy, 1.0)`
`124`	`124`
`@@ -131,7 +131,7 @@ def test_predict_classifier_sklearn():`
`131`	`131`	`X_train, y_train, model=RF_MODEL, validation_method="none", metrics=CLF_METRICS, random_state=42`
`132`	`132`	`)`
`133`	`133`
`134`		`- predicted_labels, predicted_probabilities = predict_classifier(X_test, model, True)`
	`134`	`+ predicted_labels, predicted_probabilities = predict_classifier(X_test, model, include_probabilities=True)`
`135`	`135`	`np.testing.assert_equal(len(predicted_labels), len(y_test))`
`136`	`136`	`np.testing.assert_equal(len(predicted_probabilities), len(y_test))`
`137`	`137`