Skip to content

Commit 5e332cf

Browse files
author
Alessandro Lucantonio
committed
Working on making the regressor compliant with sklearn specs.
1 parent 3c60996 commit 5e332cf

File tree

2 files changed

+103
-11
lines changed

2 files changed

+103
-11
lines changed

src/alpine/gp/regressor.py

+20-11
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def __init__(
104104
output_path: str | None = None,
105105
batch_size=1,
106106
):
107-
107+
super().__init__()
108108
self.pset = pset
109109

110110
self.fitness = fitness
@@ -122,16 +122,14 @@ def __init__(
122122
self.num_best_inds_str = num_best_inds_str
123123
self.plot_freq = plot_freq
124124
self.preprocess_func = preprocess_func
125-
self.callback_fun = callback_func
125+
self.callback_func = callback_func
126126
self.is_plot_best_individual_tree = plot_best_individual_tree
127127
self.is_save_best_individual = save_best_individual
128128
self.is_save_train_fit_history = save_train_fit_history
129129
self.output_path = output_path
130130
self.batch_size = batch_size
131131

132-
if common_data is not None:
133-
# FIXME: does everything work when the functions do not have common args?
134-
self.__store_fit_error_common_args(common_data)
132+
self.common_data = common_data
135133

136134
self.NINDIVIDUALS = NINDIVIDUALS
137135
self.NGEN = NGEN
@@ -157,8 +155,14 @@ def __init__(
157155
self.overlapping_generation = overlapping_generation
158156
self.validate = validate
159157

158+
self.frac_elitist = frac_elitist
159+
160160
# Elitism settings
161-
self.n_elitist = int(frac_elitist * self.NINDIVIDUALS)
161+
self.n_elitist = int(self.frac_elitist * self.NINDIVIDUALS)
162+
163+
if self.common_data is not None:
164+
# FIXME: does everything work when the functions do not have common args?
165+
self.__store_fit_error_common_args(self.common_data)
162166

163167
# config individual creator and toolbox
164168
self.__creator_toolbox_config()
@@ -196,6 +200,9 @@ def __init__(
196200
self.plot_initialized = False
197201
self.fig_id = 0
198202

203+
def get_params(self, deep=True):
    """Return this estimator's constructor parameters (scikit-learn API).

    scikit-learn requires ``get_params`` to return *only* the keyword
    arguments accepted by ``__init__`` (each stored verbatim on ``self``),
    so that ``clone()``/``GridSearchCV`` can rebuild an equivalent,
    unfitted estimator.  Returning ``self.__dict__`` would also expose
    internal/fitted state (toolbox, data store, plotting flags, ...),
    which breaks cloning.

    Parameters
    ----------
    deep : bool, default=True
        Accepted for API compatibility.  This estimator has no nested
        sub-estimators, so there are no ``<component>__<param>`` entries
        to expand and the flag has no effect.

    Returns
    -------
    dict
        Mapping of constructor-parameter name to its current value.
    """
    import inspect

    # Introspect __init__ rather than hard-coding the (long) parameter
    # list, so the method stays correct if the signature evolves.
    init_sig = inspect.signature(type(self).__init__)
    return {
        name: getattr(self, name)
        for name in init_sig.parameters
        if name != "self" and hasattr(self, name)
    }
205+
199206
def __creator_toolbox_config(self):
200207
"""Initialize toolbox and individual creator based on config file."""
201208
self.toolbox = base.Toolbox()
@@ -276,7 +283,8 @@ def __store_datasets(self, datasets: Dict[str, Dataset]):
276283
def __store_shared_objects(self, label: str, data: Dict):
277284
for key, value in data.items():
278285
# replace each item of the dataset with its obj ref
279-
data[key] = ray.put(value)
286+
if not isinstance(value, ray.ObjectRef):
287+
data[key] = ray.put(value)
280288
self.data_store[label] = data
281289

282290
def __init_logbook(self):
@@ -425,6 +433,7 @@ def fit(self, X_train, y_train=None, X_val=None, y_val=None):
425433
if self.validate and self.error_metric is not None:
426434
self.__register_val_funcs()
427435
self.__run()
436+
return self
428437

429438
def predict(self, X_test):
430439
test_data = {"X": X_test}
@@ -567,8 +576,8 @@ def __evolve_islands(self, cgen: int):
567576
fitnesses = self.__unflatten_list(fitnesses, [len(i) for i in invalid_inds])
568577

569578
for i in range(self.num_islands):
570-
if self.callback_fun is not None:
571-
self.callback_fun(invalid_inds[i], fitnesses[i])
579+
if self.callback_func is not None:
580+
self.callback_func(invalid_inds[i], fitnesses[i])
572581
else:
573582
for ind, fit in zip(invalid_inds[i], fitnesses[i]):
574583
ind.fitness.values = fit
@@ -626,8 +635,8 @@ def __run(self):
626635
for i in range(self.num_islands):
627636
fitnesses = self.toolbox.map(self.toolbox.evaluate_train, self.pop[i])
628637

629-
if self.callback_fun is not None:
630-
self.callback_fun(self.pop[i], fitnesses)
638+
if self.callback_func is not None:
639+
self.callback_func(self.pop[i], fitnesses)
631640
else:
632641
for ind, fit in zip(self.pop[i], fitnesses):
633642
ind.fitness.values = fit

tests/test_regressor.py

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
from sklearn.utils.estimator_checks import check_estimator
2+
from alpine.gp.regressor import GPSymbolicRegressor
3+
from alpine.gp import util
4+
from deap import gp
5+
from sklearn.datasets import make_regression
6+
from sklearn.model_selection import train_test_split, GridSearchCV
7+
8+
9+
def test_regressor():
    """Smoke-test scikit-learn API compliance of GPSymbolicRegressor.

    Builds a minimal float-typed primitive set, constructs the regressor
    with a representative configuration, prints its parameters, and runs
    ``check_estimator`` against it.
    """
    primitive_set = gp.PrimitiveSetTyped("MAIN", [float], float)
    primitive_set.renameArguments(ARG0="x")

    # Same eight numpy primitives as before, built via a comprehension
    # instead of eight literal dicts.
    primitive_cfg = {
        "imports": {"alpine.gp.numpy_primitives": ["numpy_primitives"]},
        "used": [
            {"name": op, "dimension": None, "rank": None}
            for op in ("add", "sub", "mul", "div", "sin", "cos", "exp", "log")
        ],
    }
    primitive_set = util.add_primitives_to_pset_from_dict(
        primitive_set, primitive_cfg
    )

    regressor = GPSymbolicRegressor(
        pset=primitive_set,
        fitness=None,
        error_metric=None,
        predict_func=None,
        common_data={"penalty": {"reg_param": 0.0}},
        NINDIVIDUALS=100,
        num_islands=10,
        NGEN=200,
        MUTPB=0.1,
        min_height=2,
        max_height=6,
        crossover_prob=0.9,
        overlapping_generation=True,
        print_log=True,
        batch_size=100,
    )

    print(regressor.get_params())
    check_estimator(regressor)

    # --- Disabled grid-search example (kept from original, not yet run) ---
    # X, y = make_regression(n_samples=100, n_features=10, random_state=42)
    # X_train, X_test, y_train, y_test = train_test_split(
    #     X, y, test_size=0.2, random_state=42
    # )
    #
    # param_grid = {"NGEN": [10, 20]}
    #
    # grid_search = GridSearchCV(
    #     estimator=regressor,
    #     param_grid=param_grid,
    #     cv=3,
    #     scoring="r2",
    #     verbose=1,
    #     n_jobs=1,
    # )
    #
    # grid_search.fit(X_train, y_train)


if __name__ == "__main__":
    test_regressor()

0 commit comments

Comments
 (0)