Skip to content

Commit

Permalink
updating regressor interface for SRBENCH
Browse files Browse the repository at this point in the history
  • Loading branch information
gbomarito committed Sep 30, 2024
1 parent d62ffa9 commit c3498ce
Show file tree
Hide file tree
Showing 3 changed files with 400 additions and 31 deletions.
174 changes: 174 additions & 0 deletions bingo/symbolic_regression/srbench_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
from .symbolic_regressor import SymbolicRegressor # , CrossValRegressor

# from sklearn.model_selection import KFold

hyper_params = [
# narrowed to 3 by looking at most commonly chosen
# among previous SRBench run
{"population_size": [100], "stack_size": [24]},
{"population_size": [500], "stack_size": [24]},
{"population_size": [2500], "stack_size": [32]},
]

"""
est: a sklearn-compatible regressor.
"""
est = SymbolicRegressor(
population_size=500,
stack_size=24,
operators=["+", "-", "*", "/", "sin", "cos", "exp", "log", "sqrt"],
use_simplification=True,
crossover_prob=0.3,
mutation_prob=0.45,
metric="mse",
parallel=False,
clo_alg="lm",
max_time=350,
max_evals=int(1e19),
evolutionary_algorithm="AgeFitnessEA",
clo_threshold=1.0e-5,
)


# N_FOLDS = 3

# # use `N_FOLDS` grid search cross-validation to select hyper parameters
# cv = KFold(n_splits=N_FOLDS, shuffle=True)

# est = CrossValRegressor(
# non_tuned_est,
# cv=cv,
# param_grid=hyper_params,
# verbose=3,
# n_jobs=1,
# scoring="r2",
# error_score="raise",
# )


def model(est, X=None):
"""
Return a sympy-compatible string of the final model.
Parameters
----------
est: sklearn regressor
The fitted model.
X: pd.DataFrame, default=None
The training data. This argument can be dropped if desired.
Returns
-------
A sympy-compatible string of the final model.
Notes
-----
Ensure that the variable names appearing in the model are identical to
those in the training data, `X`, which is a `pd.Dataframe`.
If your method names variables some other way, e.g. `[x_0 ... x_m]`,
you can specify a mapping in the `model` function such as:
```
def model(est, X):
mapping = {'x_'+str(i):k for i,k in enumerate(X.columns)}
new_model = est.model_
for k,v in mapping.items():
new_model = new_model.replace(k,v)
```
"""
model_str = str(est.get_best_individual())

try:
# replace X_# with data variables names
# have to iterate in reversed order to prevent X_1 in X_10 from being replaced first
for i, column in reversed(list(enumerate(X.columns))):
model_str = model_str.replace("X_" + str(i), column)
except AttributeError: # if X is not a pd.Dataframe
pass

model_str = model_str.replace(")(", ")*(").replace(
"^", "**"
) # replace operators for sympy
return model_str


def get_population(est):
"""
Return the final population of the model. This final population should
be a list with at most 100 individuals. Each of the individuals must
be compatible with scikit-learn, so they should have a predict method.
Also, it is expected that the `model()` function can operate with them,
so they should have a way of getting a simpy string representation.
Returns
-------
A list of scikit-learn compatible estimators
"""

return est.get_population()


def get_best_solution(est):
"""
Return the best solution from the final model.
Returns
-------
A scikit-learn compatible estimator
"""

return est.get_best_individual()


# # for calculating time per cross-validation run using
# # grid search, not halving grid search!
# def get_cv_time(total_time):
# return total_time / (
# len(hyper_params) * N_FOLDS + 1
# ) # have to train each hyperparam
# # set on `N_FOLDS` then also need to retrain final model


# def pre_train_fn(est, X, y):
# """set max_time in seconds based on length of X."""
# if len(X) <= 1000:
# max_time = get_cv_time(60 * 60 - 200) # 1 hour with 200 seconds of slack
# else:
# max_time = get_cv_time(
# 10 * 60 * 60 - 1000
# ) # 10 hours with 1000 seconds of slack
# est.set_max_time(new_max_time=max_time)


eval_kwargs = {
# "pre_train": pre_train_fn,
"test_params": {"test": True}
}

"""
eval_kwargs: a dictionary of variables passed to the evaluate_model()
function.
Allows one to configure aspects of the training process.
Options
-------
test_params: dict, default = None
Used primarily to shorten run-times during testing.
for running the tests. called as
est = est.set_params(**test_params)
max_train_samples:int, default = 0
if training size is larger than this, sample it.
if 0, use all training samples for fit.
scale_x: bool, default = True
Normalize the input data prior to fit.
scale_y: bool, default = True
Normalize the input label prior to fit.
pre_train: function, default = None
Adjust settings based on training data. Called prior to est.fit.
The function signature should be (est, X, y).
est: sklearn regressor; the fitted model.
X: pd.DataFrame; the training data.
y: training labels.
"""
Loading

0 comments on commit c3498ce

Please sign in to comment.