Skip to content

Commit

Permalink
Archive implementation. Individual ids. New TODOs to solve
Browse files Browse the repository at this point in the history
  • Loading branch information
gAldeia committed May 3, 2024
1 parent fe84d8d commit 12a8772
Show file tree
Hide file tree
Showing 15 changed files with 301 additions and 57 deletions.
58 changes: 57 additions & 1 deletion pybrush/BrushEstimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ class BrushEstimator(BaseEstimator):
val_from_arch: boolean, optional (default: True)
Validates the final model using the archive rather than the whole
population.
use_arch: boolean, optional (default: False)
Determines if we should save pareto front of the entire evolution
(when set to True) or just the final population (False).
batch_size : float, default 1.0
Percentage of training data to sample every generation. If `1.0`, then
all data is used. Very small values can improve execution time, but
Expand Down Expand Up @@ -146,6 +149,7 @@ def __init__(
logfile="",
weights_init=True,
val_from_arch=True,
use_arch=False,
validation_size: float = 0.0,
batch_size: float = 1.0
):
Expand All @@ -165,7 +169,8 @@ def __init__(
self.cx_prob=cx_prob
self.logfile=logfile
self.mutation_probs=mutation_probs
self.val_from_arch=val_from_arch # TODO: val from arch
self.val_from_arch=val_from_arch # TODO: val from arch implementation (in cpp side)
self.use_arch=use_arch
self.functions=functions
self.objectives=objectives
self.initialization=initialization
Expand Down Expand Up @@ -227,6 +232,8 @@ def fit(self, X, y):
self.parameters_.max_size = self.max_size
self.parameters_.objectives = self.objectives
self.parameters_.cx_prob = self.cx_prob
self.parameters_.use_arch = self.use_arch
self.parameters_.val_from_arch = self.val_from_arch
self.parameters_.mig_prob = self.mig_prob
self.parameters_.functions = self.functions
self.parameters_.mutation_probs = self.mutation_probs
Expand Down Expand Up @@ -312,6 +319,30 @@ def get_params(self, deep=True):
out[key] = value
return out

def predict_archive(self, X):
    """Predict with every individual returned by the engine's archive.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Samples to predict on. A DataFrame is converted through `.values`;
        anything that is not an ndarray afterwards fails the assertion.

    Returns
    -------
    list of dict
        One dictionary per archive entry with the keys `'id'` (the entry's
        id) and `'y_pred'` (what `engine_.predict_archive` returns for it).
    """
    check_is_fitted(self)

    features = X.values if isinstance(X, pd.DataFrame) else X
    assert isinstance(features, np.ndarray)

    # Build the dataset against the reference data stored on the estimator.
    data = Dataset(X=features, ref_dataset=self.data_,
                   c=self.mode == "classification",
                   feature_names=self.feature_names_)

    return [
        {
            'id': member['id'],
            'y_pred': self.engine_.predict_archive(member['id'], data),
        }
        for member in self.engine_.get_archive()
    ]


class BrushClassifier(BrushEstimator,ClassifierMixin):
"""Deap-based Brush for classification.
Expand Down Expand Up @@ -368,6 +399,31 @@ def predict_proba(self, X):
prob[:, 0] -= prob[:, 1]

return prob


def predict_archive(self, X):
    """Predict probabilities with every individual in the engine's archive.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Samples to predict on. A DataFrame is converted through `.values`;
        anything that is not an ndarray afterwards fails the assertion.

    Returns
    -------
    list of dict
        One dictionary per archive entry with the keys `'id'` (the entry's
        id) and `'y_pred'` (what `engine_.predict_proba_archive` returns
        for it).
    """
    check_is_fitted(self)

    features = X.values if isinstance(X, pd.DataFrame) else X
    assert isinstance(features, np.ndarray)

    # The dataset is always built in classification mode here.
    data = Dataset(X=features, ref_dataset=self.data_, c=True,
                   feature_names=self.feature_names_)

    return [
        {
            'id': member['id'],
            'y_pred': self.engine_.predict_proba_archive(member['id'], data),
        }
        for member in self.engine_.get_archive()
    ]


class BrushRegressor(BrushEstimator, RegressorMixin):
Expand Down
14 changes: 14 additions & 0 deletions src/bindings/bind_engines.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,13 @@ void bind_engine(py::module& m, string name)
.def("predict",
static_cast<RetType (T::*)(const Ref<const ArrayXXf> &X)>(&T::predict),
"predict from X data")
.def("predict_archive",
static_cast<RetType (T::*)(int id, const Dataset &d)>(&T::predict_archive),
"predict from individual in archive")
.def("predict_archive",
static_cast<RetType (T::*)(int id, const Ref<const ArrayXXf> &X)>(&T::predict_archive),
"predict from individual in archive")
.def("get_archive", &T::get_archive, py::arg("front") = false)
.def(py::pickle(
[](const T &p) { // __getstate__
/* Return a tuple that fully encodes the state of the object */
Expand All @@ -82,6 +89,13 @@ void bind_engine(py::module& m, string name)
.def("predict_proba",
static_cast<ArrayXf (T::*)(const Ref<const ArrayXXf> &X)>(&T::predict_proba),
"predict from X data")
.def("predict_proba_archive",
static_cast<ArrayXf (T::*)(int id, const Dataset &d)>(&T::predict_proba_archive),
"predict from individual in archive")
.def("predict_proba_archive",
static_cast<ArrayXf (T::*)(int id, const Ref<const ArrayXXf> &X)>(&T::predict_proba_archive),
"predict from individual in archive")

;
}
}
1 change: 1 addition & 0 deletions src/bindings/bind_params.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ void bind_params(py::module& m)
.def_property("save_population", &Brush::Parameters::get_save_population, &Brush::Parameters::set_save_population)
.def_property("logfile", &Brush::Parameters::get_logfile, &Brush::Parameters::set_logfile)
.def_property("num_islands", &Brush::Parameters::get_num_islands, &Brush::Parameters::set_num_islands)
.def_property("use_arch", &Brush::Parameters::get_use_arch, &Brush::Parameters::set_use_arch)
.def_property("n_classes", &Brush::Parameters::get_n_classes, &Brush::Parameters::set_n_classes)
.def_property("n_jobs", &Brush::Parameters::get_n_jobs, &Brush::Parameters::set_n_classes)
.def_property("classification", &Brush::Parameters::get_classification, &Brush::Parameters::set_classification)
Expand Down
2 changes: 1 addition & 1 deletion src/bindings/bind_variation.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ void bind_variation(py::module& m, string name)
// including offspring indexes (the vary method will store the offspring in the second half of the index vector)
pop.add_offspring_indexes(island);

self.vary(pop, island, parents);
self.vary(pop, island, parents, params);

// making copies of the second half of the island individuals
vector<size_t> idxs = pop.get_island_indexes(island);
Expand Down
119 changes: 114 additions & 5 deletions src/engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,97 @@ void Engine<T>::print_stats(std::ofstream& log, float fraction)
<<"\n\n";
}

/// Serialize every individual stored in the archive.
///
/// @param front  currently unused (see TODO below).
/// @return one json entry per archive individual, in archive order; empty
///         when the archive holds no individuals.
template <ProgramType T>
vector<json> Engine<T>::get_archive(bool front)
{
    // TODO: use this front argument (or remove it). I think I can remove
    vector<json> serialized;
    serialized.reserve(archive.individuals.size());

    for (const auto& ind : archive.individuals) {
        // Each individual gets its own json slot. Writing all of them into a
        // single json object would overwrite the previous one every time.
        serialized.emplace_back();
        to_json(serialized.back(), ind);
    }
    return serialized;
}

// TODO: private function called find_individual that searches for it based on id. Then,
// use this function in predict_archive and predict_proba_archive.
/// Predict on `data` with the individual identified by `id`.
///
/// Lookup order: the best individual, then the archive, then every island of
/// the population. The first match is used.
///
/// @param id    id of the individual to predict with.
/// @param data  dataset to predict on.
/// @return the result of the matching individual's `predict(data)`.
/// @throws std::runtime_error when no individual with this id exists in the
///         archive or the population.
template <ProgramType T>
auto Engine<T>::predict_archive(int id, const Dataset& data)
{
    if (id == best_ind.id)
        return best_ind.predict(data);

    for (auto& ind : this->archive.individuals)
    {
        if (id == ind.id)
            return ind.predict(data);
    }

    for (int island = 0; island < pop.num_islands; ++island)
    {
        auto idxs = pop.get_island_indexes(island);

        for (const auto idx : idxs)
        {
            const auto& ind = pop.individuals.at(idx);

            if (id == ind->id)
                return ind->predict(data);
        }
    }

    // The exception must actually be thrown: merely constructing it is a no-op
    // and would silently answer with a different individual's predictions.
    throw std::runtime_error("Could not find id = "
        + to_string(id) + " in archive or population.");
}

/// Convenience overload: wraps the raw feature matrix `X` in a Dataset and
/// forwards to the Dataset-based predict_archive.
///
/// @param id  id of the individual to predict with.
/// @param X   feature matrix to predict on.
/// @return whatever the Dataset-based overload returns for `id`.
template <ProgramType T>
auto Engine<T>::predict_archive(int id, const Ref<const ArrayXXf>& X)
{
    Dataset wrapped(X);
    const Dataset& data = wrapped;
    return this->predict_archive(id, data);
}

/// Predict probabilities on `data` with the individual identified by `id`.
///
/// Only available for classifier program types (see the requires clause).
/// Lookup order: the best individual, then the archive, then every island of
/// the population. The first match is used.
///
/// @param id    id of the individual to predict with.
/// @param data  dataset to predict on.
/// @return the result of the matching individual's `predict_proba(data)`.
/// @throws std::runtime_error when no individual with this id exists in the
///         archive or the population.
template <ProgramType T>
template <ProgramType P>
    requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier))
auto Engine<T>::predict_proba_archive(int id, const Dataset& data)
{
    if (id == best_ind.id)
        return best_ind.predict_proba(data);

    for (auto& ind : this->archive.individuals)
    {
        if (id == ind.id)
            return ind.predict_proba(data);
    }

    for (int island = 0; island < pop.num_islands; ++island)
    {
        auto idxs = pop.get_island_indexes(island);

        for (const auto idx : idxs)
        {
            const auto& ind = pop.individuals.at(idx);

            if (id == ind->id)
                return ind->predict_proba(data);
        }
    }

    // The exception must actually be thrown: merely constructing it is a no-op
    // and would silently answer with a different individual's probabilities.
    throw std::runtime_error("Could not find id = "
        + to_string(id) + " in archive or population.");
}

/// Convenience overload: wraps the raw feature matrix `X` in a Dataset and
/// forwards to the Dataset-based predict_proba_archive.
///
/// @param id  id of the individual to predict with.
/// @param X   feature matrix to predict on.
/// @return whatever the Dataset-based overload returns for `id`.
template <ProgramType T>
template <ProgramType P>
    requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier))
auto Engine<T>::predict_proba_archive(int id, const Ref<const ArrayXXf>& X)
{
    Dataset wrapped(X);
    const Dataset& data = wrapped;
    return this->predict_proba_archive(id, data);
}

template <ProgramType T> // TODO: use the dataset, or ignore it
bool Engine<T>::update_best(const Dataset& data, bool val)
Expand Down Expand Up @@ -313,6 +404,7 @@ void Engine<T>::run(Dataset &data)
unsigned generation = 0;
unsigned stall_count = 0;
float fraction = 0;
bool use_arch;

auto stop = [&]() {
return ( (generation == params.gens)
Expand Down Expand Up @@ -402,7 +494,7 @@ void Engine<T>::run(Dataset &data)

//std::cout << "before vary" << std::endl;
// // variation to produce offspring
variator.vary(this->pop, island, island_parents.at(island));
variator.vary(this->pop, island, island_parents.at(island), params);
//std::cout << "before update fitness" << std::endl;

evaluator.update_fitness(this->pop, island, data, params, true);
Expand Down Expand Up @@ -442,16 +534,16 @@ void Engine<T>::run(Dataset &data)
auto finish_gen = subflow.emplace([&]() {
bool updated_best = this->update_best(data);

// TODO: use_arch
if ( params.verbosity>1 || !params.logfile.empty()) {
if ( (params.verbosity>1 || !params.logfile.empty() )
|| params.use_arch ) {
calculate_stats();
}

// TODO: logger working
// logger.log("calculate stats...",2);

// if (use_arch) // TODO: archive
// archive.update(pop,params);
if (params.use_arch)
archive.update(pop, params);

fraction = params.max_time == -1 ? ((generation+1)*1.0)/params.gens :
timer.Elapsed().count()/params.max_time;
Expand Down Expand Up @@ -498,6 +590,23 @@ void Engine<T>::run(Dataset &data)
// TODO: open, write, close? (to avoid breaking the file and allow some debugging if things dont work well)
if (log.is_open())
log.close();

// if we're not using an archive, let's store the final population in the
// archive
if (!params.use_arch)
{
archive.individuals.resize(0);
for (int island =0; island< pop.num_islands; ++island) {
// cout << "island" << island << endl;
vector<size_t> idxs = pop.get_island_indexes(island);

for (unsigned i = 0; i<idxs.size(); ++i)
{
archive.individuals.push_back( *pop.individuals.at(idxs.at(i)) );
// cout << "index" << i << endl;
}
}
}

} // work done, report last gen and stop
); // evolutionary loop
Expand Down
33 changes: 21 additions & 12 deletions src/engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ class Engine{

// TODO: starting pop (just like feat)

// TODO: make thesqe work
// TODO: make these work
// /// predict on unseen data.
// VectorXf predict(MatrixXf& X, LongData& Z);
// VectorXf predict(MatrixXf& X);
Expand All @@ -105,24 +105,34 @@ class Engine{

// archive stuff

// TODO: make these work
///return archive size
int get_archive_size(){ return this->archive.individuals.size(); };

/// return the individuals stored in the archive, each serialized as json
vector<json> get_archive(bool front);

// /// predict on unseen data from the whole archive
// VectorXf predict_archive(int id, MatrixXf& X);
// VectorXf predict_archive(int id, MatrixXf& X, LongData& Z);
// ArrayXXf predict_proba_archive(int id, MatrixXf& X, LongData& Z);
// ArrayXXf predict_proba_archive(int id, MatrixXf& X);
/// predict on unseen data from the archive
auto predict_archive(int id, const Dataset& data);
auto predict_archive(int id, const Ref<const ArrayXXf>& X);

template <ProgramType P = T>
requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier))
auto predict_proba_archive(int id, const Dataset& data);
template <ProgramType P = T>
requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier))
auto predict_proba_archive(int id, const Ref<const ArrayXXf>& X);

// TODO: make these work
// VectorXf predict_archive(int id, const Ref<const ArrayXXf>& X, LongData& Z);
// ArrayXXf predict_proba_archive(int id, const Ref<const ArrayXXf>& X, LongData& Z);

/// train the model
void run(Dataset &d);

Parameters params; ///< hyperparameters of brush, which the user can interact
Individual<T> best_ind;

Archive<T> archive; ///< pareto front archive
private:
SearchSpace ss;

Expand All @@ -135,7 +145,6 @@ class Engine{
Log_Stats stats; ///< runtime stats

Timer timer; ///< start time of training
Archive<T> archive; ///< pareto front archive

bool is_fitted; ///< keeps track of whether fit was called.

Expand All @@ -146,10 +155,10 @@ class Engine{
};

// Only stuff to make new predictions or call fit again
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine<PT::Regressor>, params, best_ind);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine<PT::BinaryClassifier>,params, best_ind);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine<PT::MulticlassClassifier>,params, best_ind);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine<PT::Representer>,params, best_ind);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine<PT::Regressor>, params, best_ind, archive);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine<PT::BinaryClassifier>,params, best_ind, archive);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine<PT::MulticlassClassifier>,params, best_ind, archive);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine<PT::Representer>,params, best_ind, archive);

} // Brush

Expand Down
Loading

0 comments on commit 12a8772

Please sign in to comment.