diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp
index 1fad9fdf..ffeec6c1 100644
--- a/src/eval/evaluation.cpp
+++ b/src/eval/evaluation.cpp
@@ -1,241 +1,130 @@
-// /* FEAT
-// copyright 2017 William La Cava
-// license: GNU/GPL v3
-// */
-
-// #include "evaluation.h"
-
-// // code to evaluate GP programs.
-// namespace FT{
-
-//     using namespace Opt;
-
-//     namespace Eval{
-
-//         Evaluation::Evaluation(string scorer): S(scorer)
-//         {
-//             this->S.set_scorer(scorer);
-//         }
-
-//         Evaluation::~Evaluation(){}
-
-//         void Evaluation::validation(vector& individuals,
-//                                     const Data& d,
-//                                     const Parameters& params,
-//                                     bool offspring
-//                                     )
-//         {
-//             unsigned start =0;
-//             if (offspring)
-//                 start = individuals.size()/2;
-
-//             // loop through individuals
-//             /* #pragma omp parallel for */
-//             for (unsigned i = start; i yhat = ind.predict(d);
-//                 // assign aggregate fitness
-//                 logger.log("Assigning fitness to ind " + to_string(i)
-//                         + ", eqn: " + ind.get_eqn(), 3);
-
-//                 if (!pass)
-//                 {
-
-//                     ind.fitness_v = MAX_FLT;
-//                 }
-//                 else
-//                 {
-//                     // assign fitness to individual
-//                     VectorXf loss;
-//                     ind.fitness_v = this->S.score(d.y, yhat, loss,
-//                                                   params.class_weights);
-//                 }
-//             }
-//         }
-//         // fitness of population
-//         void Evaluation::fitness(vector& individuals,
-//                                  const Data& d,
-//                                  const Parameters& params,
-//                                  bool offspring)
-//         {
-//             /*!
-//              * @param individuals: population
-//              * @param d: Data structure
-//              * @param params: algorithm parameters
-//              * @param offspring: if true, only evaluate last half of population
-
-//              * Output
-
-//              * individuals.fitness, yhat, error is modified
-//              */
-
-//             unsigned start =0;
-//             if (offspring) start = individuals.size()/2;
-
-//             /* for (unsigned i = start; i yhat = ind.fit(d,params,pass);
-//                 // assign F and aggregate fitness
-//                 logger.log("Assigning fitness to ind " + to_string(i)
-//                         + ", eqn: " + ind.get_eqn(), 3);
-
-//                 if (!pass)
-//                 {
-
-//                     ind.fitness = MAX_FLT;
-//                     ind.error = MAX_FLT*VectorXf::Ones(d.y.size());
-//                 }
-//                 else
-//                 {
-//                     // assign weights to individual
-//                     assign_fit(ind,yhat,d,params,false);
-
-
-//                     if (params.hillclimb)
-//                     {
-//                         HillClimb hc(params.scorer_, params.hc.iters,
-//                                      params.hc.step);
-//                         bool updated = false;
-//                         shared_ptr yhat2 = hc.run(ind, d, params,
-//                                                   updated);
-//                         // update the fitness of this individual
-//                         if (updated)
-//                         {
-//                             assign_fit(ind, yhat2, d, params);
-//                         }
-
-//                     }
-//                 }
-//             }
-//         }
+#include "evaluation.h"
+
+namespace Brush{
+namespace Eval{
+
+
+template<ProgramType T>
+void Evaluation<T>::validation(Population<T>& pop,
+                               tuple<size_t, size_t> island_range,
+                               const Dataset& data,
+                               const Parameters& params,
+                               bool offspring
+                               )
+{
+    // if offspring is false --> if the island has offspring, do it on the first half; else, do it on the entire island
+    // if offspring is true  --> assert that there are offspring and do it on the second half of the island
+
+    auto [idx_start, idx_end] = island_range;
+    size_t delta = idx_end - idx_start;
+    if (offspring)
+    {
+        assert(pop.offspring_ready
+               && ("Population does not have offspring to calculate validation fitness"));
+
+        idx_start = idx_start + (delta/2);
+    }
+    else if (pop.offspring_ready) // offspring is false. We need to see where we should stop
+    {
+        idx_end = idx_end - (delta/2);
+    }
+
+    for (unsigned i = idx_start; i < idx_end; i++)
+    {
+        Individual<T>& ind = pop[i];
-//         // assign fitness to program
-//         void Evaluation::assign_fit(Individual& ind,
-//                                     const shared_ptr& yhat, const Data& d,
-//                                     const Parameters& params, bool val)
-//         {
-//             /*!
-//              * assign raw errors and aggregate fitnesses to individuals.
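
Note on the island_range / offspring convention used by validation() and fitness() above: once a population has offspring ready, each island's index range holds parents in its first half and offspring in its second half, so the evaluator trims the range before looping. A minimal standalone sketch of that index arithmetic (the function and parameter names here are illustrative only, not part of the patch):

    #include <cstddef>
    #include <tuple>
    #include <utility>

    // Hedged sketch: returns the sub-range of an island to evaluate, assuming the
    // convention above (parents in the first half, offspring in the second half
    // once pop.offspring_ready is true).
    std::pair<std::size_t, std::size_t> eval_range(
        std::tuple<std::size_t, std::size_t> island_range,
        bool offspring_ready, bool offspring)
    {
        auto [idx_start, idx_end] = island_range;
        std::size_t delta = idx_end - idx_start;
        if (offspring)                 // evaluate only the offspring half
            idx_start += delta / 2;
        else if (offspring_ready)      // evaluate only the parent half
            idx_end -= delta / 2;
        return {idx_start, idx_end};   // otherwise: the whole island
    }

For example, an island spanning [100, 120) with offspring ready yields [100, 110) for the parents and [110, 120) for the offspring.
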
-//              *
-//              * Input:
-//              *
-//              * ind: individual
-//              * yhat: predicted output of ind
-//              * d: data
-//              * params: feat parameters
-//              *
-//              * Output:
-//              *
-//              * modifies individual metrics
-//              */
-//             VectorXf loss;
-//             float f = S.score(d.y, yhat, loss, params.class_weights);
-//             //TODO: add if condition for this
-//             float fairness = marginal_fairness(loss, d, f);
+        // if there is no validation data,
+        // set fitness_v to fitness and continue (this assumes that fitness on train was calculated previously)
+        if (!data.use_validation)
+        {
+            ind.fitness_v = ind.fitness;
+            continue;
+        }
+
+        bool pass = true;
+
+        if (!pass)
+        {
+            // TODO: stop doing this hardcoded?
+            ind.fitness_v = MAX_FLT;
+        }
+        else
+        {
+            // TODO: implement the class weights and use it here (and on fitness)
+            auto y_pred = ind.predict(data.get_validation_data);
+            assign_fit(ind, y_pred, data, params, true);
+        }
+    }
+}
+
+// fitness of population
+template<ProgramType T>
+void Evaluation<T>::fitness(Population<T>& pop,
+                            tuple<size_t, size_t> island_range,
+                            const Dataset& data,
+                            const Parameters& params,
+                            bool offspring
+                            )
+{
+    // if offspring is false --> if the island has offspring, do it on the first half; else, do it on the entire island
+    // if offspring is true  --> assert that there are offspring and do it on the second half of the island
+
+    auto [idx_start, idx_end] = island_range;
+    size_t delta = idx_end - idx_start;
+    if (offspring)
+    {
+        assert(pop.offspring_ready
+               && ("Population does not have offspring to calculate validation fitness"));
+
+        idx_start = idx_start + (delta/2);
+    }
+    else if (pop.offspring_ready) // offspring is false. We need to see where we should stop
+    {
+        idx_end = idx_end - (delta/2);
+    }
+
+    for (unsigned i = idx_start; i < idx_end; i++)
+    {
+        Individual<T>& ind = pop[i];
+
+        bool pass = true;
+
+        if (!pass)
+        {
+            ind.fitness = MAX_FLT;
+            ind.error = MAX_FLT*VectorXf::Ones(data.y.size());
+        }
+        else
+        {
+            // assign weights to individual
+            ind.fit(data);
-//             if (fairness <0 )
-//             {
-//                 cout << "fairness is " << fairness << "...\n";
-//             }
-//             if (val)
-//             {
-//                 ind.fitness_v = f;
-//                 ind.fairness_v = fairness;
-//             }
-//             else
-//             {
-//                 ind.fitness = f;
-//                 ind.fairness = fairness;
-//                 ind.error = loss;
-//             }
-
-//             logger.log("ind " + std::to_string(ind.id) + " fitness: "
-//                        + std::to_string(ind.fitness),3);
-//         }
-
-//         float Evaluation::marginal_fairness(VectorXf& loss, const Data& d,
-//                                             float base_score, bool use_alpha)
-//         {
-//             // averages the deviation of the loss function from average loss
-//             // over k
-//             float avg_score = 0;
-//             float count = 0;
-//             float alpha = 1;
-
-//             ArrayXb x_idx;
-
-//             for (const auto& pl : d.protect_levels)
-//             {
-//                 for (const auto& lvl : pl.second)
-//                 {
-//                     x_idx = (d.X.row(pl.first).array() == lvl);
-//                     float len_g = x_idx.count();
-//                     if (use_alpha)
-//                         alpha = len_g/d.X.cols();
-//                     /* cout << "alpha = " << len_g << "/"
-//                      * << d.X.cols() << endl; */
-//                     float Beta = fabs(base_score -
-//                                       x_idx.select(loss,0).sum()/len_g);
-//                     /* cout << "Beta = |" << base_score << " - " */
-//                     /*     << x_idx.select(loss,0).sum() << "/" */
-//                     /*     << len_g << "|" << endl; */
-//                     avg_score += alpha * Beta;
-//                     ++count;
-//                 }
-
-//             }
-//             avg_score /= count;
-//             if (std::isinf(avg_score)
-//                 || std::isnan(avg_score)
-//                 || avg_score < 0)
-//                 return MAX_FLT;
-
-//             return avg_score;
-
-//         }
-//     }
-// }
+            auto y_pred = ind.predict(data.get_training_data);
+            assign_fit(ind, y_pred, data, params, false);
+        }
+    }
+}
+
+// assign fitness to program
+template<ProgramType T>
+void Evaluation<T>::assign_fit(Individual<T>& ind,
+                               VectorXf& y_pred, const Dataset& data,
+                               const Parameters& params, bool val)
+{
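
Every metric shares the funcPointer signature used by the Scorer below: fill a per-sample loss vector by reference and return the aggregate score, with an optional per-sample weight vector. As a hedged illustration of how another metric could slot in next to mse (this is not part of the change; the name weighted_mse and the weighting scheme are hypothetical):

    #include <Eigen/Dense>
    #include <vector>

    using Eigen::VectorXf;
    using std::vector;

    // Hypothetical sample-weighted MSE with the same signature as mse() above:
    // writes each sample's squared error into `loss` and returns the (weighted) mean.
    float weighted_mse(const VectorXf& y, const VectorXf& yhat, VectorXf& loss,
                       const vector<float>& weights = vector<float>())
    {
        loss = (yhat - y).array().pow(2);
        if (weights.empty())
            return loss.mean();

        // weights give more or less importance to individual samples
        VectorXf w = Eigen::Map<const VectorXf>(weights.data(), weights.size());
        return loss.dot(w) / w.sum();
    }

Registering such a function would just be one more entry in the Scorer's score_hash (see scorer.h below).
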
+    VectorXf loss;
+
+    float f = S.score(data.y, y_pred, loss, params.class_weights);
+
+    if (val)
+    {
+        ind.fitness_v = f;
+    }
+    else
+    {
+        ind.fitness = f;
+        ind.error = loss;
+    }
+}
+
+} // Eval
+} // Brush
\ No newline at end of file
diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h
index b9dc1e1a..02885ba8 100644
--- a/src/eval/evaluation.h
+++ b/src/eval/evaluation.h
@@ -8,6 +8,8 @@
 #include "../individual.h"
 #include "../program/program.h"
 #include "../data/data.h"
+#include "scorer.h"
+#include "../population.h"
 
 using std::string;
 
@@ -20,34 +22,36 @@
 namespace Eval {
 
 template <ProgramType T>
 class Evaluation {
 public:
-    Evaluation(string scorer="");
-    ~Evaluation();
+    Scorer S;
+
+    Evaluation(string scorer="mse"): S(scorer) { this->S.set_scorer(scorer); };
+    ~Evaluation(){}; // TODO: IMPLEMENT THIS
 
     /// validation of population.
-    void validation(vector<Individual<T>>& individuals,
+    void validation(Population<T>& pop,
+                    tuple<size_t, size_t> island_range,
                     const Dataset& data,
                     const Parameters& params,
                     bool offspring = false
                     );
-
-    // TODO: EVALUATOR CALCULATE ERROR BASED ON TEMPLATING
+    // TODO: EVALUATOR CALCULATE ERROR BASED ON TEMPLATING? (caps)
 
     /// fitness of population.
-    void fitness(vector<Individual<T>>& individuals,
-                 const Dataset& data,
-                 const Parameters& params,
-                 bool offspring = false
-                 );
+    void fitness(Population<T>& pop,
+                 tuple<size_t, size_t> island_range,
+                 const Dataset& data,
+                 const Parameters& params,
+                 bool offspring = false
+                 );
 
     // TODO: implement other eval methods
+
     /// assign fitness to an individual.
-    // void assign_fit(Individual<T>& ind,
-    //                 const Dataset& data,
-    //                 const Parameters& params,bool val=false);
+    void assign_fit(Individual<T>& ind, VectorXf& y_pred,
+                    const Dataset& data, const Parameters& params, bool val=false);
 
-    // Scorer S;
 };
 
 } //selection
diff --git a/src/eval/metrics.cpp b/src/eval/metrics.cpp
new file mode 100644
index 00000000..8375dc26
--- /dev/null
+++ b/src/eval/metrics.cpp
@@ -0,0 +1,20 @@
+#include "metrics.h"
+
+namespace Brush {
+namespace Eval {
+
+/* Scoring functions */
+
+/// mean squared error
+float mse(const VectorXf& y, const VectorXf& yhat, VectorXf& loss,
+          const vector<float>& weights)
+{
+    loss = (yhat - y).array().pow(2);
+    return loss.mean();
+}
+
+
+// TODO: implement other metrics. Right now I have just the MSE
+
+} // metrics
+} // Brush
\ No newline at end of file
diff --git a/src/eval/metrics.h b/src/eval/metrics.h
index e69de29b..e640d19c 100644
--- a/src/eval/metrics.h
+++ b/src/eval/metrics.h
@@ -0,0 +1,20 @@
+#ifndef METRICS_H
+#define METRICS_H
+
+#include "../data/data.h"
+
+namespace Brush {
+namespace Eval {
+
+/* Scoring functions */
+
+/// mean squared error
+float mse(const VectorXf& y, const VectorXf& yhat, VectorXf& loss,
+          const vector<float>& weights=vector<float>() );
+
+// TODO: implement other metrics. Right now I have just the MSE
+
+} // metrics
+} // Brush
+
+#endif
\ No newline at end of file
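
Because the Scorer above resolves the metric by name through score_hash, callers only pass a string and the two score() overloads take care of the per-sample loss. A small usage sketch (the main() wrapper and the numbers are only for illustration; in the tree the Eigen vector types come in through data.h):

    #include <iostream>
    #include <Eigen/Dense>
    #include "scorer.h"   // the header added above

    int main()
    {
        Brush::Eval::Scorer S("mse");

        Eigen::VectorXf y_true(3), y_pred(3);
        y_true << 1.0f, 2.0f, 3.0f;
        y_pred << 1.5f, 2.0f, 2.0f;

        Eigen::VectorXf loss;
        float f = S.score(y_true, y_pred, loss, {});  // fills per-sample loss, returns the mean
        std::cout << f << "\n";                       // (0.25 + 0 + 1) / 3 = 0.4167 here

        // an unrecognized name is reported through HANDLE_ERROR_THROW at score() time:
        // Brush::Eval::Scorer bad("r2"); bad.score(y_true, y_pred, loss, {});
        return 0;
    }
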
diff --git a/src/eval/scorer.h b/src/eval/scorer.h
index e69de29b..eb3f2298 100644
--- a/src/eval/scorer.h
+++ b/src/eval/scorer.h
@@ -0,0 +1,65 @@
+#ifndef SCORER_H
+#define SCORER_H
+
+#include "metrics.h"
+#include "../util/error.h"
+
+// code to evaluate GP programs.
+namespace Brush{
+namespace Eval{
+
+typedef float (*funcPointer)(const VectorXf&,
+                             const VectorXf&,
+                             VectorXf&,
+                             const vector<float>&);
+
+class Scorer
+{
+public:
+    // map the string into a function to be called when calculating the score
+    std::map<string, funcPointer> score_hash;
+    string scorer;
+
+    // TODO: add more scores, include them here, add to score_hash
+    Scorer(string scorer="mse") {
+        score_hash["mse"] = &mse;
+
+        this->set_scorer(scorer);
+    };
+
+    void set_scorer(string scorer){ this->scorer = scorer; };
+
+    /* void set_scorer(string scorer); */
+    float score(const VectorXf& y_true, VectorXf& y_pred,
+                VectorXf& loss, const vector<float>& w)
+    {
+        // loss is an array passed by reference to store the loss of each prediction (used in lexicase)
+        // weights are used to give more or less importance to a given sample.
+        // Every scorer must have the same function signature, but isn't required to use all the info
+
+        if ( score_hash.find(this->scorer) == score_hash.end() )
+        {
+            // not found
+            HANDLE_ERROR_THROW("Scoring function '" + this->scorer
+                               + "' not defined");
+            return 0.0;
+        }
+        else
+        {
+            // found
+            return score_hash.at(this->scorer)(y_true, y_pred, loss, w);
+        }
+    };
+
+    // overloaded score with no loss
+    float score(const VectorXf& y_true, VectorXf& y_pred,
+                vector<float> w=vector<float>())
+    {
+        VectorXf dummy;
+        return this->score(y_true, y_pred, dummy, w);
+    };
+};
+
+}
+}
+#endif
diff --git a/src/individual.h b/src/individual.h
index e13bf1ec..9a72e5d8 100644
--- a/src/individual.h
+++ b/src/individual.h
@@ -27,11 +27,12 @@ class Individual{
 public:
 
     Individual()
-    {  // TODO: calculate this stuff
+    {
        fitness = -1;
        fitness_v = -1;
 
        complexity=-1;
+       dcounter=-1;
        rank=-1;
        crowd_dist = -1;
 
diff --git a/src/population.cpp b/src/population.cpp
index f77e118d..7f60a617 100644
--- a/src/population.cpp
+++ b/src/population.cpp
@@ -90,6 +90,11 @@ void Population<T>::prep_offspring_slots()
     this->individuals = &expanded_pop;
 
     offspring_ready = true;
+
+    // I'm keeping the offspring and parents in the same population object, because we
+    // have operations that require them together (archive, hall of fame).
+    // The downside is having to be aware that islands will create offspring
+    // interleaved with those of other islands
 }
 
 template <ProgramType T>
diff --git a/src/variation.cpp b/src/variation.cpp
index ff6b69ba..fc9ac2d1 100644
--- a/src/variation.cpp
+++ b/src/variation.cpp
@@ -578,14 +578,19 @@ void Variation<T>::vary(Population<T>& pop, tuple<size_t, size_t> island_range,
     auto [idx_start, idx_end] = island_range;
     size_t delta = idx_end - idx_start;
 
-    size_t vary_start = delta/2;
+
+    idx_start = idx_start + (delta/2);
 
     // TODO: fix pragma omp usage
     //#pragma omp parallel for
-    for (unsigned i = vary_start; i<idx_end; ++i)
+    for (unsigned i = idx_start; i<idx_end; ++i)
     {
-        std::optional<Individual<T>> opt=std::nullopt; // new individual
+        std::optional<Individual<T>> opt=std::nullopt; // new individual
+        // TODO: do it a certain number of times. After that, assume that variation can't
+        // change the individual and add it to the island failures
+        // TODO: use island failures every time that I'm iterating on the offspring of an
+        // island (with island range)
         while (!opt)
         {
             Individual<T>& mom = pop.individuals.at(