From c3498ce45c8bbf84532b929d99c59f27871cd93a Mon Sep 17 00:00:00 2001
From: "G. Bomarito"
Date: Mon, 30 Sep 2024 17:42:30 -0400
Subject: [PATCH] updating regressor interface for SRBench

---
 .../symbolic_regression/srbench_interface.py  | 174 +++++++++++++++
 .../symbolic_regression/symbolic_regressor.py | 204 +++++++++++++++---
 examples/sklearn_wrapper.ipynb                |  53 ++++-
 3 files changed, 400 insertions(+), 31 deletions(-)
 create mode 100644 bingo/symbolic_regression/srbench_interface.py

diff --git a/bingo/symbolic_regression/srbench_interface.py b/bingo/symbolic_regression/srbench_interface.py
new file mode 100644
index 00000000..2d84db9a
--- /dev/null
+++ b/bingo/symbolic_regression/srbench_interface.py
@@ -0,0 +1,174 @@
+from .symbolic_regressor import SymbolicRegressor  # , CrossValRegressor
+
+# from sklearn.model_selection import KFold
+
+hyper_params = [
+    # narrowed to 3 candidates based on the most commonly chosen
+    # settings in a previous SRBench run
+    {"population_size": [100], "stack_size": [24]},
+    {"population_size": [500], "stack_size": [24]},
+    {"population_size": [2500], "stack_size": [32]},
+]
+
+"""
+est: a sklearn-compatible regressor.
+"""
+est = SymbolicRegressor(
+    population_size=500,
+    stack_size=24,
+    operators=["+", "-", "*", "/", "sin", "cos", "exp", "log", "sqrt"],
+    use_simplification=True,
+    crossover_prob=0.3,
+    mutation_prob=0.45,
+    metric="mse",
+    parallel=False,
+    clo_alg="lm",
+    max_time=350,
+    max_evals=int(1e19),
+    evolutionary_algorithm="AgeFitnessEA",
+    clo_threshold=1.0e-5,
+)
+
+
+# N_FOLDS = 3
+
+# # use `N_FOLDS`-fold grid-search cross-validation to select hyperparameters
+# cv = KFold(n_splits=N_FOLDS, shuffle=True)
+
+# est = CrossValRegressor(
+#     non_tuned_est,
+#     cv=cv,
+#     param_grid=hyper_params,
+#     verbose=3,
+#     n_jobs=1,
+#     scoring="r2",
+#     error_score="raise",
+# )
+
+
+def model(est, X=None):
+    """
+    Return a sympy-compatible string of the final model.
+
+    Parameters
+    ----------
+    est: sklearn regressor
+        The fitted model.
+    X: pd.DataFrame, default=None
+        The training data. This argument can be dropped if desired.
+
+    Returns
+    -------
+    A sympy-compatible string of the final model.
+
+    Notes
+    -----
+    Ensure that the variable names appearing in the model are identical to
+    those in the training data, `X`, which is a `pd.DataFrame`.
+    If your method names variables some other way, e.g. `[x_0 ... x_m]`,
+    you can specify a mapping in the `model` function such as:
+
+    ```
+    def model(est, X):
+        mapping = {'x_' + str(i): k for i, k in enumerate(X.columns)}
+        new_model = est.model_
+        for k, v in mapping.items():
+            new_model = new_model.replace(k, v)
+        return new_model
+    ```
+    """
+    model_str = str(est.get_best_individual())
+
+    try:
+        # replace X_# with the data's variable names; iterate in reverse so
+        # that X_1 is not replaced inside X_10, X_11, etc.
+        for i, column in reversed(list(enumerate(X.columns))):
+            model_str = model_str.replace("X_" + str(i), column)
+    except AttributeError:  # if X is not a pd.DataFrame
+        pass
+
+    # replace operators for sympy compatibility
+    model_str = model_str.replace(")(", ")*(").replace("^", "**")
+    return model_str
+
+
+def get_population(est):
+    """
+    Return the final population of the model. This final population should
+    be a list with at most 100 individuals. Each of the individuals must
+    be compatible with scikit-learn, so they should have a predict method.
+
+    Also, it is expected that the `model()` function can operate with them,
+    so they should have a way of getting a sympy string representation.
+
+    Returns
+    -------
+    A list of scikit-learn compatible estimators
+    """
+
+    return est.get_best_population()
+
+
+def get_best_solution(est):
+    """
+    Return the best solution from the final model.
+
+    Returns
+    -------
+    A scikit-learn compatible estimator
+    """
+
+    return est.get_best_individual()
+
+
+# # for calculating time per cross-validation run using
+# # grid search, not halving grid search!
+# def get_cv_time(total_time):
+#     return total_time / (
+#         len(hyper_params) * N_FOLDS + 1
+#     )  # have to train each hyperparam set on `N_FOLDS` folds
+#     # and then also retrain the final model
+
+
+# def pre_train_fn(est, X, y):
+#     """Set max_time in seconds based on the length of X."""
+#     if len(X) <= 1000:
+#         max_time = get_cv_time(60 * 60 - 200)  # 1 hour with 200 seconds of slack
+#     else:
+#         max_time = get_cv_time(
+#             10 * 60 * 60 - 1000
+#         )  # 10 hours with 1000 seconds of slack
+#     est.set_max_time(new_max_time=max_time)
+
+
+eval_kwargs = {
+    # "pre_train": pre_train_fn,
+    "test_params": {"test": True}
+}
+
+"""
+eval_kwargs: a dictionary of variables passed to the evaluate_model()
+    function. Allows one to configure aspects of the training process.
+
+Options
+-------
+test_params: dict, default = None
+    Used primarily to shorten run-times during testing. Applied as
+    est = est.set_params(**test_params).
+max_train_samples: int, default = 0
+    If the training set is larger than this, subsample it for fit.
+    If 0, use all training samples.
+scale_x: bool, default = True
+    Normalize the input data prior to fit.
+scale_y: bool, default = True
+    Normalize the target labels prior to fit.
+pre_train: function, default = None
+    Adjust settings based on the training data. Called prior to est.fit
+    with the signature (est, X, y), where est is the sklearn regressor to
+    be fitted, X is the training data (a pd.DataFrame), and y is the
+    training labels.
+"""
diff --git a/bingo/symbolic_regression/symbolic_regressor.py b/bingo/symbolic_regression/symbolic_regressor.py
index 7be06cd3..a3182a9a 100644
--- a/bingo/symbolic_regression/symbolic_regressor.py
+++ b/bingo/symbolic_regression/symbolic_regressor.py
@@ -42,6 +42,7 @@
     "GeneralizedCrowdingEA": GeneralizedCrowdingEA,
 }
 INF_REPLACEMENT = 1e100
+BEST_POP_MAX = 100
 
 
 # pylint: disable=too-many-instance-attributes, too-many-locals
@@ -244,19 +245,6 @@
 
         return island
 
-    def _refit_best_individual(self, X, y, tol):
-        fit_func = self._get_local_opt(X, y, tol)
-        best_fitness = fit_func(self.best_ind)
-        best_constants = tuple(self.best_ind.constants)
-        for _ in range(5):
-            self.best_ind._needs_opt = True
-            fitness = fit_func(self.best_ind)
-            if fitness < best_fitness:
-                best_fitness = fitness
-                best_constants = tuple(self.best_ind.constants)
-        self.best_ind.fitness = best_fitness
-        self.best_ind.set_local_optimization_params(best_constants)
-
     def fit(self, X, y, sample_weight=None):
         """Fit this model to the given data.
 
@@ -269,6 +257,7 @@ def fit(self, X, y, sample_weight=None):
             Target/output values. M is the number of data points.
         sample_weight: Mx1 numpy array of numeric, optional
             Weights per sample/data point. M is the number of data points.
+            Not currently supported.
 
         Returns
         -------
         self
 
@@ -300,28 +289,100 @@ def fit(self, X, y, sample_weight=None):
             convergence_check_frequency=10,
         )
 
-        # most likely found sol in 0 gens
+        self.best_pop = self._find_best_population(X, y)
+        self.best_ind = min(self.best_pop, key=lambda x: x.fitness)
+
+        return self
+
+    def _find_best_population(self, X, y, max_pop=BEST_POP_MAX, pareto_only=False):
         if len(self.archipelago.hall_of_fame) == 0:
-            self.best_ind = self.archipelago.get_best_individual()
+            self.archipelago.update_hall_of_fame()
+
+        best_equs = []
+        if len(self.archipelago.hall_of_fame) <= max_pop:
+            for equ in self.archipelago.hall_of_fame:
+                best_equs.append(equ)
+
+            # TODO: this could be improved by only taking unique equations
+            for equ in np.random.choice(
+                self.archipelago.population,  # TODO: this will have to change if/when the archipelago changes from an island/fpi
+                max_pop - len(best_equs),
+                replace=False,
+            ):
+                best_equs.append(equ)
         else:
-            self.best_ind = self.archipelago.hall_of_fame[0]
+            # keep both endpoints of the hall of fame (best fitness and
+            # lowest complexity), then sample the interior to fill max_pop
+            best_equs.append(self.archipelago.hall_of_fame[0])
+            best_equs.append(self.archipelago.hall_of_fame[-1])
+            for equ in np.random.choice(
+                self.archipelago.hall_of_fame[1:-1], max_pop - 2, replace=False
+            ):
+                best_equs.append(equ)
 
-        self._refit_best_individual(X, y, tol=1e-6)
+        best_regressors = []
+        for equ in best_equs:
+            reg = EquationRegressor(equ, metric=self.metric, algo=self.clo_alg)
+            reg.fit(X, y)
+            best_regressors.append(reg)
 
-        return self
+        return best_regressors
 
     def get_best_individual(self):
         """Gets the best model found from fit().
 
         Returns
         -------
-        best_ind: `AGraph`
+        best_individual: `RegressorMixin`
             Model with the best fitness from fit().
+
+        Raises
+        ------
+        ValueError
+            If fit() has not been called yet.
         """
         if self.best_ind is None:
-            raise ValueError("Best individual not set")
+            raise ValueError("Best individual not set. Make sure fit() was called.")
         return self.best_ind
 
+    def get_best_population(self):
+        """Gets the best group of models found by fit().
+
+        Returns
+        -------
+        list of `RegressorMixin`
+            Models from the Pareto front and the final population from fit().
+
+        Raises
+        ------
+        ValueError
+            If fit() has not been called yet.
+        """
+        if self.best_pop is None:
+            raise ValueError("Best population not set. Make sure fit() was called.")
+        return self.best_pop
+
+    def get_pareto_front(self):
+        """Gets the Pareto front of models found by fit().
+
+        Returns
+        -------
+        list of `RegressorMixin`
+            Models with the best fitnesses and complexities from fit().
+
+        Raises
+        ------
+        ValueError
+            If fit() has not been called yet.
+        """
+        if self.best_pop is None:
+            raise ValueError("Pareto front not set. Make sure fit() was called.")
+        hof = ParetoFront(
+            secondary_key=lambda equ: equ.complexity,
+            similarity_function=lambda x, y: x.fitness == y.fitness
+            and x.complexity == y.complexity,
+        )
+        hof.update(self.best_pop)
+        return list(hof)
+
     def predict(self, X):
         """Use the best individual to predict the outputs of `X`.
 
@@ -337,10 +398,7 @@ def predict(self, X):
             Predicted target/output values. M is the number of data points.
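+
+        Examples
+        --------
+        A minimal, illustrative sketch (mirrors the notebook example in
+        this patch; exact results depend on the random seed):
+
+        >>> import numpy as np
+        >>> X = np.linspace(-10, 10, num=30).reshape((-1, 1))
+        >>> y = 5.0 * X ** 2 + 3.5 * X
+        >>> reg = SymbolicRegressor(population_size=100, stack_size=16,
+        ...                         use_simplification=True)
+        >>> reg = reg.fit(X, y)
+        >>> pred_y = reg.predict(X)  # same shape as y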
""" best_ind = self.get_best_individual() - output = best_ind.evaluate_equation_at(X) - - # convert nan to 0, inf to large number, and -inf to small number - return np.nan_to_num(output, posinf=INF_REPLACEMENT, neginf=-INF_REPLACEMENT) + return best_ind.predict(X) def agraph_similarity(ag_1, ag_2): @@ -348,3 +406,101 @@ def agraph_similarity(ag_1, ag_2): return ( ag_1.fitness == ag_2.fitness and ag_1.get_complexity() == ag_2.get_complexity() ) + + +class EquationRegressor(RegressorMixin, BaseEstimator): + """A thin scikit learn wrapper around bingo equations + + Parameters + ---------- + equation : `Equation` + equation that wiull be wrapped + metric : str, optional + metric used for local optimization on parameters during fit, by default "mse" + algo : str, optional + algorithm used for local optimization on parameters during fit, by default "lm" + tol : _type_, optional + tolerance used for local optimization on parameters during fit, by default 1e-6 + fit_retries : int, optional + number of times to attempt to fit parameters. This is a hedge against the + variability of selecting a random starting point for the local optimization, + by default 5 + """ + + def __init__(self, equation, metric="mse", algo="lm", tol=1e-6, fit_retries=5): + self.equation = equation + self.tol = tol + self.fit_retries = fit_retries + self.metric = metric + self.algo = algo + self.is_fitted_ = True + + @property + def fitness(self): + """Fitness of equation""" + return self.equation.fitness + + @property + def complexity(self): + """Complexity of equation""" + return self.equation.get_complexity() + + def fit(self, X, y, sample_weight=None): + """Fit constants in equation to the given data. + + Parameters + ---------- + X: MxD numpy array of numeric + Input values. D is the number of dimensions and + M is the number of data points. + y: Mx1 numpy array of numeric + Target/output values. M is the number of data points. + sample_weight: Mx1 numpy array of numeric, optional + Weights per sample/data point. M is the number of data points. + Not currently supported + """ + if sample_weight is not None: + print("sample weight not None, TODO") + raise NotImplementedError + + if self.equation.get_number_local_optimization_params() == 0: + return + + fit_func = self._get_local_opt(X, y) + best_fitness = fit_func(self.equation) + best_constants = tuple(self.equation.constants) + for _ in range(self.fit_retries): + self.equation._needs_opt = True + fitness = fit_func(self.equation) + if fitness < best_fitness: + best_fitness = fitness + best_constants = tuple(self.equation.constants) + self.equation.fitness = best_fitness + self.equation.set_local_optimization_params(best_constants) + + def _get_local_opt(self, X, y): + training_data = ExplicitTrainingData(X, y) + fitness = ExplicitRegression(training_data=training_data, metric=self.metric) + optimizer = ScipyOptimizer(fitness, method=self.algo, tol=self.tol) + local_opt_fitness = LocalOptFitnessFunction(fitness, optimizer) + return local_opt_fitness + + def predict(self, X): + """Evaluate the equation to predict the outputs of `X`. + + Parameters + ---------- + X: MxD numpy array of numeric + Input values. D is the number of dimensions and + M is the number of data points. + + Returns + ------- + pred_y: Mx1 numpy array of numeric + Predicted target/output values. M is the number of data points. 
+ """ + output = self.equation.evaluate_equation_at(X) + return np.nan_to_num(output, posinf=INF_REPLACEMENT, neginf=-INF_REPLACEMENT) + + def __str__(self): + return str(self.equation) diff --git a/examples/sklearn_wrapper.ipynb b/examples/sklearn_wrapper.ipynb index 47c7f938..0cd6e43d 100644 --- a/examples/sklearn_wrapper.ipynb +++ b/examples/sklearn_wrapper.ipynb @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "d9639ff1", "metadata": { "scrolled": true @@ -35,7 +35,8 @@ "source": [ "from bingo.symbolic_regression.symbolic_regressor import SymbolicRegressor\n", "regressor = SymbolicRegressor(population_size=100, stack_size=16,\n", - " use_simplification=True)" + " use_simplification=True,\n", + " max_time=20)" ] }, { @@ -50,12 +51,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "id": "02f73359", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", + "np.random.seed(0)\n", "X_0 = np.linspace(-10, 10, num=30).reshape((-1, 1))\n", "X = np.array(X_0)\n", "y = 5.0 * X_0 ** 2 + 3.5 * X_0" @@ -123,19 +125,19 @@ "## Predicting Data with the Best Individual\n", "\n", "You can use the regressor's `.predict(X)` or\n", - "the best_individual's `.evaluate_equation_at(X)` to get\n", + "the best_individual's `.predict(X)` to get\n", "its predictions for `X`." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "id": "9a9c9105", "metadata": {}, "outputs": [], "source": [ "pred_y = regressor.predict(X)\n", - "pred_y = best_individual.evaluate_equation_at(X)" + "pred_y = best_individual.predict(X)" ] }, { @@ -152,6 +154,43 @@ "plt.legend([\"Actual\", \"Predicted\"])\n", "plt.show()" ] + }, + { + "cell_type": "markdown", + "id": "21beab0e", + "metadata": {}, + "source": [ + "# Checking out the Pareto front\n", + "The regressor has a `get_pareto_front()` function that can be used to investigate the tradeoff of fitness and complexity." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86fc88c2", + "metadata": {}, + "outputs": [], + "source": [ + "pareto_front = regressor.get_pareto_front()\n", + "plt.step([i.complexity for i in pareto_front], \n", + " [max(i.fitness, 1e-20) for i in pareto_front], \n", + " 'o-')\n", + "for equ in pareto_front:\n", + " plt.text(equ.complexity, \n", + " (max(equ.fitness, 1e-20))*3, \n", + " str(equ))\n", + "plt.yscale(\"log\")\n", + "plt.xlabel(\"Complexity\")\n", + "plt.ylabel(\"Fitness (MSE)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2c66546", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -170,7 +209,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.5" + "version": "3.12.2" } }, "nbformat": 4,