
Commit
cleaning a lot of comments and TODOs
gAldeia committed Jun 5, 2024
1 parent 4381134 commit cd9436c
Showing 39 changed files with 358 additions and 1,317 deletions.
207 changes: 17 additions & 190 deletions pybrush/BrushEstimator.py
@@ -1,111 +1,27 @@
"""
sklearn-compatible wrapper for GP analyses.
TODO: update this docstring
See brushgp.cpp for Python (via pybind11) modules that give more fine-grained
See engine.cpp for Python (via pybind11) modules that give more fine-grained
control of the underlying GP objects.
"""
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin
from sklearn.utils.validation import check_is_fitted
# from sklearn.metrics import mean_squared_error

import numpy as np
import pandas as pd

from _brush.individual import * # RegressorIndividual, ClassifierIndividual, MultiClassifierIndividual
from _brush.engine import * # Regressor, Classifier, and MultiClassifier engines
from pybrush import Parameters, Dataset, SearchSpace
from pybrush import brush_rng
from sklearn.base import BaseEstimator, ClassifierMixin, \
RegressorMixin, TransformerMixin

from sklearn.utils.validation import check_is_fitted

from pybrush import Parameters, Dataset, SearchSpace, brush_rng
from pybrush.EstimatorInterface import EstimatorInterface
from pybrush import RegressorEngine, ClassifierEngine, MultiClassifierEngine

class BrushEstimator(BaseEstimator):
class BrushEstimator(EstimatorInterface, BaseEstimator):
"""
This is the base class for Deap-based Brush estimators.
This class shouldn't be called directly; instead, call a child class like
:py:class:`DeapRegressor <brush.estimator.DeapRegressor>` or :py:class:`DeapClassifier <brush.estimator.DeapClassifier>`.
All of the shared parameters are documented here.
Parameters
----------
mode : str, default 'classification'
The mode of the estimator, used by subclasses.
pop_size : int, default 100
Population size.
gens : int, default 100
Maximum iterations of the algorithm.
max_time: int, optional (default: -1)
Maximum time termination criterion in seconds. If -1, not used.
max_stall: int, optional (default: 0)
How many generations to continue after the validation loss has
stalled. If 0, not used.
verbosity : int, default 0
Controls level of printouts.
max_depth : int, default 3
Maximum depth of GP trees in the GP program. Use 0 for no limit.
max_size : int, default 20
Maximum number of nodes in a tree. Use 0 for no limit.
num_islands : int, default 1
Number of independent islands to use in evolutionary framework.
Ignored if `algorithm!="nsga2island"`.
mig_prob : float, default 0.05
Probability of a migration occurring between two random islands at the
end of a generation; must be between 0 and 1.
cx_prob : float, default 1/7
Probability of applying the crossover variation when generating the offspring;
must be between 0 and 1.
Given that there are `n` mutations, and either crossover or mutation is
used to generate each individual in the offspring (but not both at the
same time), we want by default a uniform probability between
crossover and every possible mutation. By setting `cx_prob=1/(n+1)` and
`1/n` for each mutation, we achieve a uniform distribution.
mutation_probs : dict, default {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6}
A dictionary with keys naming the types of mutation and floating point
values specifying the fraction of total mutations to do with that method.
The probability of having a mutation is `(1-cx_prob)` and, in case the mutation
is applied, then each mutation option is sampled based on the probabilities
defined in `mutation_probs`. The set of probabilities should add up to 1.0.
functions: dict[str,float] or list[str], default {}
A dictionary with keys naming the function set and values giving the probability
of sampling them, or a list of functions which will be weighted uniformly.
If empty, all available functions are included in the search space.
initialization : {"uniform", "max_size"}, default "uniform"
Distribution of sizes on the initial population. If `max_size`, then every
expression is created with `max_size` nodes. If `uniform`, size will be
uniformly distributed between 1 and `max_size`.
objectives : list[str], default ["error", "size"]
List with one or more objectives to use. Options are `"error"`, `"size"`, and `"complexity"`.
If `"error"` is used, then it will be the mean squared error for regression,
and accuracy for classification.
algorithm : {"nsga2island", "nsga2", "gaisland", "ga"}, default "nsga2"
Which Evolutionary Algorithm framework to use to evolve the population.
weights_init : bool, default True
Whether the search space should initialize the sampling weights of terminal nodes
based on the correlation with the output y. If `False`, then all terminal nodes
will have the same probability of 1.0.
validation_size : float, default 0.0
Fraction of samples to use as a hold-out partition. These samples are used
to calculate statistics during evolution, but not used to train the models.
The `best_estimator_` will be selected using this partition. If zero, then
the same data used for training is used for validation.
val_from_arch: boolean, optional (default: True)
Whether to validate the final model using the archive rather than the
whole population.
use_arch: boolean, optional (default: False)
Determines whether to save the Pareto front of the entire evolution
(when set to True) or just the final population (False).
batch_size : float, default 1.0
Fraction of training data to sample every generation. If `1.0`, then
all data is used. Very small values can improve execution time, but
can also lead to underfitting.
logfile: str, optional (default: "")
If specified, statistics are written to the logfile. "" means don't log.
random_state: int or None, default None
If int, then the value is used to seed the c++ random generator; if None,
then a seed will be generated using a non-deterministic generator. It is
important to note that, even if the random state is fixed, running brush
with multiple threads is unlikely to produce the same results. This happens
because the Operating System's scheduler is responsible for choosing which
thread runs at any given time, so reproducibility is not guaranteed.
This is the base class for Brush estimators using the c++ engine.
Parameters are defined and documented in pybrush.EstimatorInterface.
Attributes
----------
@@ -125,59 +41,8 @@ class BrushEstimator(BaseEstimator):
The toolbox used by DEAP for EA algorithm.
"""

def __init__(
self,
mode='classification',
pop_size=100,
gens=100,
max_time=-1,
max_stall=0,
verbosity=0,
max_depth=3,
max_size=20,
num_islands=1,
n_jobs=1,
mig_prob=0.05,
cx_prob= 1/7,
mutation_probs = {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6,
"toggle_weight_on":1/6, "toggle_weight_off":1/6},
functions: list[str]|dict[str,float] = {},
initialization="uniform",
algorithm="nsga2",
objectives=["error", "size"],
random_state=None,
logfile="",
weights_init=True,
val_from_arch=True,
use_arch=False,
validation_size: float = 0.0,
batch_size: float = 1.0
):

self.pop_size=pop_size
self.gens=gens
self.max_stall=max_stall
self.max_time=max_time
self.verbosity=verbosity
self.algorithm=algorithm
self.mode=mode
self.max_depth=max_depth
self.max_size=max_size
self.num_islands=num_islands
self.mig_prob=mig_prob
self.n_jobs=n_jobs
self.cx_prob=cx_prob
self.logfile=logfile
self.mutation_probs=mutation_probs
self.val_from_arch=val_from_arch # TODO: val from arch implementation (in cpp side)
self.use_arch=use_arch
self.functions=functions
self.objectives=objectives
self.initialization=initialization
self.random_state=random_state
self.batch_size=batch_size
self.weights_init=weights_init
self.validation_size=validation_size
def __init__(self, **kwargs):
EstimatorInterface.__init__(self, **kwargs)
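
The long explicit constructor above is collapsed into a single delegation to `EstimatorInterface`, which now owns the shared hyperparameters. A sketch of the pattern follows, with an illustrative (not actual) interface signature; the real one lives in pybrush/EstimatorInterface.py.

class EstimatorInterface:
    def __init__(self, mode='classification', pop_size=100, gens=100,
                 cx_prob=1/7, **kwargs):
        # Store every shared hyperparameter on self once, so subclasses
        # no longer repeat the assignments (illustrative subset of names).
        self.mode = mode
        self.pop_size = pop_size
        self.gens = gens
        self.cx_prob = cx_prob

class BrushEstimator(EstimatorInterface):
    def __init__(self, **kwargs):
        # Forward everything; defaults and documentation live in one place.
        EstimatorInterface.__init__(self, **kwargs)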

def fit(self, X, y):
"""
@@ -217,45 +82,7 @@ def fit(self, X, y):

self.search_space_ = SearchSpace(self.data_, self.functions_, self.weights_init)

self.parameters_ = Parameters()
self.parameters_.classification = self.mode == "classification"
self.parameters_.n_classes = self.n_classes_
self.parameters_.verbosity = self.verbosity
self.parameters_.n_jobs = self.n_jobs
self.parameters_.pop_size = self.pop_size
self.parameters_.gens = self.gens
self.parameters_.logfile = self.logfile
self.parameters_.max_stall = self.max_stall
self.parameters_.max_time = self.max_time
self.parameters_.num_islands = self.num_islands
self.parameters_.max_depth = self.max_depth
self.parameters_.max_size = self.max_size
self.parameters_.objectives = self.objectives
self.parameters_.cx_prob = self.cx_prob
self.parameters_.use_arch = self.use_arch
self.parameters_.val_from_arch = self.val_from_arch
self.parameters_.mig_prob = self.mig_prob
self.parameters_.functions = self.functions
self.parameters_.mutation_probs = self.mutation_probs
self.parameters_.validation_size = self.validation_size
self.parameters_.batch_size = self.batch_size
self.parameters_.feature_names = self.feature_names_

self.parameters_.scorer_ = "mse"
if self.mode == "classification":
self.parameters_.scorer_ = "log" if self.n_classes_ == 2 else "multi_log"

if self.random_state is not None:
seed = 0
if isinstance(self.random_state, np.random.Generator):
seed = self.random_state.integers(10000)
elif isinstance(self.random_state, int):
seed = self.random_state
else:
raise ValueError("random_state must be either a numpy random generator or an integer")

self.parameters_.random_state = seed

self.parameters_ = self._wrap_parameters()
self.engine_ = None
if self.mode == 'classification':
self.engine_ = ( ClassifierEngine
@@ -344,7 +171,7 @@ def predict_archive(self, X):
return preds

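
The block removed from `fit` above wired each hyperparameter into a `Parameters` object by hand; after this commit that wiring is presumably consolidated in `_wrap_parameters`. The seed handling is the subtle part, so here is a standalone sketch of the removed logic (the helper name is hypothetical).

import numpy as np

def seed_from_random_state(random_state):
    # Standalone sketch of the seed logic removed from fit() above;
    # callers skip this entirely when random_state is None.
    if isinstance(random_state, np.random.Generator):
        # Draw a seed in [0, 10000) from the caller's generator.
        return int(random_state.integers(10000))
    if isinstance(random_state, int):
        return random_state
    raise ValueError("random_state must be either a numpy random generator or an integer")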

class BrushClassifier(BrushEstimator,ClassifierMixin):
class BrushClassifier(BrushEstimator, ClassifierMixin):
"""Deap-based Brush for classification.
For options, see :py:class:`DeapEstimator <brush.estimator.DeapEstimator>`.
