From c3498ce45c8bbf84532b929d99c59f27871cd93a Mon Sep 17 00:00:00 2001
From: "G. Bomarito"
Date: Mon, 30 Sep 2024 17:42:30 -0400
Subject: [PATCH] updating regressor interface for SRBench

---
 .../symbolic_regression/srbench_interface.py  | 174 +++++++++++++++
 .../symbolic_regression/symbolic_regressor.py | 204 +++++++++++++++---
 examples/sklearn_wrapper.ipynb                |  53 ++++-
 3 files changed, 400 insertions(+), 31 deletions(-)
 create mode 100644 bingo/symbolic_regression/srbench_interface.py

diff --git a/bingo/symbolic_regression/srbench_interface.py b/bingo/symbolic_regression/srbench_interface.py
new file mode 100644
index 00000000..2d84db9a
--- /dev/null
+++ b/bingo/symbolic_regression/srbench_interface.py
@@ -0,0 +1,174 @@
+from .symbolic_regressor import SymbolicRegressor  # , CrossValRegressor
+
+# from sklearn.model_selection import KFold
+
+hyper_params = [
+    # narrowed to 3 candidates based on the most commonly chosen
+    # settings in a previous SRBench run
+    {"population_size": [100], "stack_size": [24]},
+    {"population_size": [500], "stack_size": [24]},
+    {"population_size": [2500], "stack_size": [32]},
+]
+
+"""
+est: a sklearn-compatible regressor.
+"""
+est = SymbolicRegressor(
+    population_size=500,
+    stack_size=24,
+    operators=["+", "-", "*", "/", "sin", "cos", "exp", "log", "sqrt"],
+    use_simplification=True,
+    crossover_prob=0.3,
+    mutation_prob=0.45,
+    metric="mse",
+    parallel=False,
+    clo_alg="lm",
+    max_time=350,
+    max_evals=int(1e19),
+    evolutionary_algorithm="AgeFitnessEA",
+    clo_threshold=1.0e-5,
+)
+
+
+# N_FOLDS = 3
+
+# # use `N_FOLDS`-fold grid-search cross-validation to select hyperparameters
+# cv = KFold(n_splits=N_FOLDS, shuffle=True)
+
+# est = CrossValRegressor(
+#     non_tuned_est,
+#     cv=cv,
+#     param_grid=hyper_params,
+#     verbose=3,
+#     n_jobs=1,
+#     scoring="r2",
+#     error_score="raise",
+# )
+
+
+def model(est, X=None):
+    """
+    Return a sympy-compatible string of the final model.
+
+    Parameters
+    ----------
+    est: sklearn regressor
+        The fitted model.
+    X: pd.DataFrame, default=None
+        The training data. This argument can be dropped if desired.
+
+    Returns
+    -------
+    A sympy-compatible string of the final model.
+
+    Notes
+    -----
+    Ensure that the variable names appearing in the model are identical to
+    those in the training data, `X`, which is a `pd.DataFrame`.
+    If your method names variables some other way, e.g. `[x_0 ... x_m]`,
+    you can specify a mapping in the `model` function such as:
+
+    ```
+    def model(est, X):
+        mapping = {'x_' + str(i): k for i, k in enumerate(X.columns)}
+        new_model = est.model_
+        for k, v in mapping.items():
+            new_model = new_model.replace(k, v)
+        return new_model
+    ```
+    """
+    model_str = str(est.get_best_individual())
+
+    try:
+        # replace X_# with the data's variable names; iterate in reverse so
+        # that X_1 is not replaced inside X_10, X_11, etc.
+        for i, column in reversed(list(enumerate(X.columns))):
+            model_str = model_str.replace("X_" + str(i), column)
+    except AttributeError:  # if X is not a pd.DataFrame
+        pass
+
+    # replace operators for sympy compatibility
+    model_str = model_str.replace(")(", ")*(").replace("^", "**")
+    return model_str
+
+
+def get_population(est):
+    """
+    Return the final population of the model. This final population should
+    be a list with at most 100 individuals. Each of the individuals must
+    be compatible with scikit-learn, so they should have a predict method.
+
+    Also, it is expected that the `model()` function can operate with them,
+    so they should have a way of getting a sympy string representation.
+
+    Returns
+    -------
+    A list of scikit-learn compatible estimators
+    """
+
+    return est.get_best_population()
+
+
+def get_best_solution(est):
+    """
+    Return the best solution from the final model.
+
+    Returns
+    -------
+    A scikit-learn compatible estimator
+    """
+
+    return est.get_best_individual()
+
+
+# # for calculating time per cross-validation run using
+# # grid search, not halving grid search!
+# def get_cv_time(total_time):
+#     return total_time / (
+#         len(hyper_params) * N_FOLDS + 1
+#     )  # have to train each hyperparam set on `N_FOLDS` folds
+#     # and then also retrain the final model
+
+
+# def pre_train_fn(est, X, y):
+#     """Set max_time in seconds based on the length of X."""
+#     if len(X) <= 1000:
+#         max_time = get_cv_time(60 * 60 - 200)  # 1 hour with 200 seconds of slack
+#     else:
+#         max_time = get_cv_time(
+#             10 * 60 * 60 - 1000
+#         )  # 10 hours with 1000 seconds of slack
+#     est.set_max_time(new_max_time=max_time)
+
+
+eval_kwargs = {
+    # "pre_train": pre_train_fn,
+    "test_params": {"test": True}
+}
+
+"""
+eval_kwargs: a dictionary of variables passed to the evaluate_model()
+    function. Allows one to configure aspects of the training process.
+
+Options
+-------
+test_params: dict, default = None
+    Used primarily to shorten run-times during testing. Applied as
+    est = est.set_params(**test_params).
+max_train_samples: int, default = 0
+    If the training set is larger than this, subsample it for fit.
+    If 0, use all training samples.
+scale_x: bool, default = True
+    Normalize the input data prior to fit.
+scale_y: bool, default = True
+    Normalize the target labels prior to fit.
+pre_train: function, default = None
+    Adjust settings based on the training data. Called prior to est.fit
+    with the signature (est, X, y), where est is the sklearn regressor to
+    be fitted, X is the training data (a pd.DataFrame), and y is the
+    training labels.
+"""
diff --git a/bingo/symbolic_regression/symbolic_regressor.py b/bingo/symbolic_regression/symbolic_regressor.py
index 7be06cd3..a3182a9a 100644
--- a/bingo/symbolic_regression/symbolic_regressor.py
+++ b/bingo/symbolic_regression/symbolic_regressor.py
@@ -42,6 +42,7 @@
     "GeneralizedCrowdingEA": GeneralizedCrowdingEA,
 }
 INF_REPLACEMENT = 1e100
+BEST_POP_MAX = 100
 
 
 # pylint: disable=too-many-instance-attributes, too-many-locals
@@ -244,19 +245,6 @@
 
         return island
 
-    def _refit_best_individual(self, X, y, tol):
-        fit_func = self._get_local_opt(X, y, tol)
-        best_fitness = fit_func(self.best_ind)
-        best_constants = tuple(self.best_ind.constants)
-        for _ in range(5):
-            self.best_ind._needs_opt = True
-            fitness = fit_func(self.best_ind)
-            if fitness < best_fitness:
-                best_fitness = fitness
-                best_constants = tuple(self.best_ind.constants)
-        self.best_ind.fitness = best_fitness
-        self.best_ind.set_local_optimization_params(best_constants)
-
     def fit(self, X, y, sample_weight=None):
         """Fit this model to the given data.
 
@@ -269,6 +257,7 @@ def fit(self, X, y, sample_weight=None):
             Target/output values. M is the number of data points.
         sample_weight: Mx1 numpy array of numeric, optional
             Weights per sample/data point. M is the number of data points.
+            Not currently supported.
 
         Returns
         -------
         self
 
@@ -300,28 +289,100 @@ def fit(self, X, y, sample_weight=None):
             convergence_check_frequency=10,
         )
 
-        # most likely found sol in 0 gens
+        self.best_pop = self._find_best_population(X, y)
+        self.best_ind = min(self.best_pop, key=lambda x: x.fitness)
+
+        return self
+
+    def _find_best_population(self, X, y, max_pop=BEST_POP_MAX, pareto_only=False):
         if len(self.archipelago.hall_of_fame) == 0:
-            self.best_ind = self.archipelago.get_best_individual()
+            self.archipelago.update_hall_of_fame()
+
+        best_equs = []
+        if len(self.archipelago.hall_of_fame) <= max_pop:
+            for equ in self.archipelago.hall_of_fame:
+                best_equs.append(equ)
+
+            # TODO: this could be improved by only taking unique equations
+            for equ in np.random.choice(
+                self.archipelago.population,  # TODO: this will have to change if/when the archipelago changes from an island/fpi
+                max_pop - len(best_equs),
+                replace=False,
+            ):
+                best_equs.append(equ)
         else:
-            self.best_ind = self.archipelago.hall_of_fame[0]
+            # keep both endpoints of the hall of fame (best fitness and
+            # lowest complexity), then sample the interior to fill max_pop
+            best_equs.append(self.archipelago.hall_of_fame[0])
+            best_equs.append(self.archipelago.hall_of_fame[-1])
+            for equ in np.random.choice(
+                self.archipelago.hall_of_fame[1:-1], max_pop - 2, replace=False
+            ):
+                best_equs.append(equ)
 
-        self._refit_best_individual(X, y, tol=1e-6)
+        best_regressors = []
+        for equ in best_equs:
+            reg = EquationRegressor(equ, metric=self.metric, algo=self.clo_alg)
+            reg.fit(X, y)
+            best_regressors.append(reg)
 
-        return self
+        return best_regressors
 
     def get_best_individual(self):
         """Gets the best model found from fit().
 
         Returns
         -------
-        best_ind: `AGraph`
+        best_individual: `RegressorMixin`
             Model with the best fitness from fit().
+
+        Raises
+        ------
+        ValueError
+            If fit() has not been called yet.
         """
         if self.best_ind is None:
-            raise ValueError("Best individual not set")
+            raise ValueError("Best individual not set. Make sure fit() was called.")
         return self.best_ind
 
+    def get_best_population(self):
+        """Gets the best group of models found by fit().
+
+        Returns
+        -------
+        list of `RegressorMixin`
+            Models from the Pareto front and the final population from fit().
+
+        Raises
+        ------
+        ValueError
+            If fit() has not been called yet.
+        """
+        if self.best_pop is None:
+            raise ValueError("Best population not set. Make sure fit() was called.")
+        return self.best_pop
+
+    def get_pareto_front(self):
+        """Gets the Pareto front of models found by fit().
+
+        Returns
+        -------
+        list of `RegressorMixin`
+            Models with the best fitnesses and complexities from fit().
+
+        Raises
+        ------
+        ValueError
+            If fit() has not been called yet.
+        """
+        if self.best_pop is None:
+            raise ValueError("Pareto front not set. Make sure fit() was called.")
+        hof = ParetoFront(
+            secondary_key=lambda equ: equ.complexity,
+            similarity_function=lambda x, y: x.fitness == y.fitness
+            and x.complexity == y.complexity,
+        )
+        hof.update(self.best_pop)
+        return list(hof)
+
     def predict(self, X):
         """Use the best individual to predict the outputs of `X`.
 
@@ -337,10 +398,7 @@ def predict(self, X):
             Predicted target/output values. M is the number of data points.
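+
+        Examples
+        --------
+        A minimal, illustrative sketch (mirrors the notebook example in
+        this patch; exact results depend on the random seed):
+
+        >>> import numpy as np
+        >>> X = np.linspace(-10, 10, num=30).reshape((-1, 1))
+        >>> y = 5.0 * X ** 2 + 3.5 * X
+        >>> reg = SymbolicRegressor(population_size=100, stack_size=16,
+        ...                         use_simplification=True)
+        >>> reg = reg.fit(X, y)
+        >>> pred_y = reg.predict(X)  # same shape as y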
""" best_ind = self.get_best_individual() - output = best_ind.evaluate_equation_at(X) - - # convert nan to 0, inf to large number, and -inf to small number - return np.nan_to_num(output, posinf=INF_REPLACEMENT, neginf=-INF_REPLACEMENT) + return best_ind.predict(X) def agraph_similarity(ag_1, ag_2): @@ -348,3 +406,101 @@ def agraph_similarity(ag_1, ag_2): return ( ag_1.fitness == ag_2.fitness and ag_1.get_complexity() == ag_2.get_complexity() ) + + +class EquationRegressor(RegressorMixin, BaseEstimator): + """A thin scikit learn wrapper around bingo equations + + Parameters + ---------- + equation : `Equation` + equation that wiull be wrapped + metric : str, optional + metric used for local optimization on parameters during fit, by default "mse" + algo : str, optional + algorithm used for local optimization on parameters during fit, by default "lm" + tol : _type_, optional + tolerance used for local optimization on parameters during fit, by default 1e-6 + fit_retries : int, optional + number of times to attempt to fit parameters. This is a hedge against the + variability of selecting a random starting point for the local optimization, + by default 5 + """ + + def __init__(self, equation, metric="mse", algo="lm", tol=1e-6, fit_retries=5): + self.equation = equation + self.tol = tol + self.fit_retries = fit_retries + self.metric = metric + self.algo = algo + self.is_fitted_ = True + + @property + def fitness(self): + """Fitness of equation""" + return self.equation.fitness + + @property + def complexity(self): + """Complexity of equation""" + return self.equation.get_complexity() + + def fit(self, X, y, sample_weight=None): + """Fit constants in equation to the given data. + + Parameters + ---------- + X: MxD numpy array of numeric + Input values. D is the number of dimensions and + M is the number of data points. + y: Mx1 numpy array of numeric + Target/output values. M is the number of data points. + sample_weight: Mx1 numpy array of numeric, optional + Weights per sample/data point. M is the number of data points. + Not currently supported + """ + if sample_weight is not None: + print("sample weight not None, TODO") + raise NotImplementedError + + if self.equation.get_number_local_optimization_params() == 0: + return + + fit_func = self._get_local_opt(X, y) + best_fitness = fit_func(self.equation) + best_constants = tuple(self.equation.constants) + for _ in range(self.fit_retries): + self.equation._needs_opt = True + fitness = fit_func(self.equation) + if fitness < best_fitness: + best_fitness = fitness + best_constants = tuple(self.equation.constants) + self.equation.fitness = best_fitness + self.equation.set_local_optimization_params(best_constants) + + def _get_local_opt(self, X, y): + training_data = ExplicitTrainingData(X, y) + fitness = ExplicitRegression(training_data=training_data, metric=self.metric) + optimizer = ScipyOptimizer(fitness, method=self.algo, tol=self.tol) + local_opt_fitness = LocalOptFitnessFunction(fitness, optimizer) + return local_opt_fitness + + def predict(self, X): + """Evaluate the equation to predict the outputs of `X`. + + Parameters + ---------- + X: MxD numpy array of numeric + Input values. D is the number of dimensions and + M is the number of data points. + + Returns + ------- + pred_y: Mx1 numpy array of numeric + Predicted target/output values. M is the number of data points. 
+ """ + output = self.equation.evaluate_equation_at(X) + return np.nan_to_num(output, posinf=INF_REPLACEMENT, neginf=-INF_REPLACEMENT) + + def __str__(self): + return str(self.equation) diff --git a/examples/sklearn_wrapper.ipynb b/examples/sklearn_wrapper.ipynb index 47c7f938..0cd6e43d 100644 --- a/examples/sklearn_wrapper.ipynb +++ b/examples/sklearn_wrapper.ipynb @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "d9639ff1", "metadata": { "scrolled": true @@ -35,7 +35,8 @@ "source": [ "from bingo.symbolic_regression.symbolic_regressor import SymbolicRegressor\n", "regressor = SymbolicRegressor(population_size=100, stack_size=16,\n", - " use_simplification=True)" + " use_simplification=True,\n", + " max_time=20)" ] }, { @@ -50,12 +51,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "id": "02f73359", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", + "np.random.seed(0)\n", "X_0 = np.linspace(-10, 10, num=30).reshape((-1, 1))\n", "X = np.array(X_0)\n", "y = 5.0 * X_0 ** 2 + 3.5 * X_0" @@ -123,19 +125,19 @@ "## Predicting Data with the Best Individual\n", "\n", "You can use the regressor's `.predict(X)` or\n", - "the best_individual's `.evaluate_equation_at(X)` to get\n", + "the best_individual's `.predict(X)` to get\n", "its predictions for `X`." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "id": "9a9c9105", "metadata": {}, "outputs": [], "source": [ "pred_y = regressor.predict(X)\n", - "pred_y = best_individual.evaluate_equation_at(X)" + "pred_y = best_individual.predict(X)" ] }, { @@ -152,6 +154,43 @@ "plt.legend([\"Actual\", \"Predicted\"])\n", "plt.show()" ] + }, + { + "cell_type": "markdown", + "id": "21beab0e", + "metadata": {}, + "source": [ + "# Checking out the Pareto front\n", + "The regressor has a `get_pareto_front()` function that can be used to investigate the tradeoff of fitness and complexity." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86fc88c2", + "metadata": {}, + "outputs": [], + "source": [ + "pareto_front = regressor.get_pareto_front()\n", + "plt.step([i.complexity for i in pareto_front], \n", + " [max(i.fitness, 1e-20) for i in pareto_front], \n", + " 'o-')\n", + "for equ in pareto_front:\n", + " plt.text(equ.complexity, \n", + " (max(equ.fitness, 1e-20))*3, \n", + " str(equ))\n", + "plt.yscale(\"log\")\n", + "plt.xlabel(\"Complexity\")\n", + "plt.ylabel(\"Fitness (MSE)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2c66546", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -170,7 +209,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.5" + "version": "3.12.2" } }, "nbformat": 4,