From 9375c8844ade7066437fc16620b4d3cbeefa0002 Mon Sep 17 00:00:00 2001 From: gAldeia Date: Wed, 5 Jun 2024 15:09:59 -0300 Subject: [PATCH] more cleaning. Implemented string representation for fitness --- src/bindings/bind_fitness.cpp | 32 +++++++------- src/bindings/bind_individuals.h | 5 +++ src/bindings/bind_selection.cpp | 6 --- src/bindings/bind_selection.h | 16 ++++--- src/bindings/bind_variation.cpp | 7 +--- src/bindings/bind_variation.h | 31 ++++++++------ src/data/data.h | 2 + src/eval/metrics.h | 2 - src/ind/fitness.cpp | 74 ++++++++++++++++++++++++++++++++- src/ind/fitness.h | 33 +++++++-------- src/ind/individual.cpp | 69 ------------------------------ src/ind/individual.h | 6 ++- src/pop/population.h | 35 +--------------- src/program/node.h | 1 - src/util/utils.h | 48 +++++++++++++++++++++ src/vary/search_space.cpp | 36 ---------------- 16 files changed, 193 insertions(+), 210 deletions(-) diff --git a/src/bindings/bind_fitness.cpp b/src/bindings/bind_fitness.cpp index 8b031b10..c483acfc 100644 --- a/src/bindings/bind_fitness.cpp +++ b/src/bindings/bind_fitness.cpp @@ -30,21 +30,21 @@ void bind_fitness(py::module& m) .def("__gt__", &br::Fitness::operator>, py::is_operator()) .def("__le__", &br::Fitness::operator<=, py::is_operator()) .def("__ge__", &br::Fitness::operator>=, py::is_operator()) - // .def("__str__", &br::Fitness::toString, "String representation of the Fitness object") - // .def("__repr__", &br::Fitness::repr, "Representation for debugging the Fitness object") - .def(py::pickle( - [](const br::Fitness &f) { // __getstate__ - /* Return a tuple that fully encodes the state of the object */ - // return py::make_tuple(p.value(), p.extra()); - nl::json j = f; - return j; - }, - [](nl::json j) { // __setstate__ - br::Fitness f = j; - return f; - } - ) - ) - ; + .def("__str__", &br::Fitness::toString, "String representation of the Fitness object") + .def("__repr__", &br::Fitness::repr, "Representation for debugging the Fitness object") + .def(py::pickle( + [](const br::Fitness &f) { // __getstate__ + /* Return a tuple that fully encodes the state of the object */ + // return py::make_tuple(p.value(), p.extra()); + nl::json j = f; + return j; + }, + [](nl::json j) { // __setstate__ + br::Fitness f = j; + return f; + } + ) + ) + ; } \ No newline at end of file diff --git a/src/bindings/bind_individuals.h b/src/bindings/bind_individuals.h index 5c5b62a8..d8808bea 100644 --- a/src/bindings/bind_individuals.h +++ b/src/bindings/bind_individuals.h @@ -36,6 +36,11 @@ void bind_individual(py::module& m, string name) .def_property("objectives", &Class::get_objectives, &Class::set_objectives) .def_property_readonly("program", &Class::get_program) .def_property_readonly("fitness", &Class::get_fitness) + .def("get_model", &Class::get_model, + py::arg("fmt") = "compact", + py::arg("pretty") = false) + .def("get_dot_model", &Class::get_dot_model, + py::arg("extras") = "") .def("fit", static_cast(&Class::fit), "fit from Dataset object") diff --git a/src/bindings/bind_selection.cpp b/src/bindings/bind_selection.cpp index f8a8641c..427ead9e 100644 --- a/src/bindings/bind_selection.cpp +++ b/src/bindings/bind_selection.cpp @@ -5,14 +5,8 @@ namespace py = pybind11; namespace br = Brush; namespace nl = nlohmann; -// using Reg = br::Program; -// using Cls = br::Program; -// using Rep = br::Program; -// using MCls = br::Program; - void bind_selections(py::module& m) { - // TODO: make them a single class bind_selection(m, "RegressorSelector"); bind_selection(m, "ClassifierSelector"); diff --git a/src/bindings/bind_selection.h b/src/bindings/bind_selection.h index 2d1ed49f..b8c9d45b 100644 --- a/src/bindings/bind_selection.h +++ b/src/bindings/bind_selection.h @@ -1,4 +1,5 @@ #include "module.h" + // TODO: figure out why im having symbol errors (if i dont include the cpp here as well) #include "../selection/selection.h" #include "../selection/selection.cpp" @@ -12,9 +13,6 @@ #include "../pop/population.cpp" #include "../pop/population.h" -// #include "../individual.h" -//#include "../selection/selection.cpp" - namespace py = pybind11; namespace nl = nlohmann; namespace br = Brush; @@ -31,7 +29,8 @@ void bind_selection(py::module& m, string name) .def(py::init( [](string type, bool survival){ Class s(type, survival); return s; }) ) - .def("select", [](Class &self, std::vector>& individuals, + .def("select", [](Class &self, + std::vector>& individuals, const Parameters& params) { // auto sel = Class("nsga2", false); @@ -53,10 +52,10 @@ void bind_selection(py::module& m, string name) } } - // returns references return pool; }) - .def("survive", [](Class &self, std::vector>& individuals, + .def("survive", [](Class &self, + std::vector>& individuals, const Parameters& params) { // auto sel = Class("nsga2", false); @@ -76,10 +75,10 @@ void bind_selection(py::module& m, string name) } } - // returns references return pool; }) - .def("migrate", [](Class &self, std::vector>& individuals, + .def("migrate", [](Class &self, + std::vector>& individuals, const Parameters& params) { auto pop = br::Pop::Population(); @@ -98,7 +97,6 @@ void bind_selection(py::module& m, string name) pool.push_back(pop[idx]); } } - // returns references return pool; }) ; diff --git a/src/bindings/bind_variation.cpp b/src/bindings/bind_variation.cpp index 0a772c7c..739d115e 100644 --- a/src/bindings/bind_variation.cpp +++ b/src/bindings/bind_variation.cpp @@ -5,14 +5,9 @@ namespace py = pybind11; namespace br = Brush; namespace nl = nlohmann; -// using Reg = br::Program; -// using Cls = br::Program; -// using Rep = br::Program; -// using MCls = br::Program; - void bind_variations(py::module& m) { - bind_variation(m, "RegressorVariator"); + bind_variation(m,"RegressorVariator"); bind_variation(m, "ClassifierVariator"); bind_variation(m, "MultiClassifierVariator"); bind_variation(m, "RepresenterVariator"); diff --git a/src/bindings/bind_variation.h b/src/bindings/bind_variation.h index fe697c95..88cacc3a 100644 --- a/src/bindings/bind_variation.h +++ b/src/bindings/bind_variation.h @@ -1,9 +1,9 @@ #include "module.h" -#include "../vary/variation.h" -#include "../vary/variation.cpp" // TODO: figure out why im having symbol errors (if i dont include the cpp here as well) -#include "../pop/population.cpp" +#include "../vary/variation.h" +#include "../vary/variation.cpp" #include "../pop/population.h" +#include "../pop/population.cpp" namespace py = pybind11; namespace nl = nlohmann; @@ -22,10 +22,17 @@ void bind_variation(py::module& m, string name) return variation; })) .def("mutate", &Class::mutate, py::return_value_policy::automatic) .def("cross", &Class::cross, py::return_value_policy::automatic) - .def("vary_pop", [](Class &self, std::vector>& individuals, const Parameters& params) { - + .def("vary_pop", [](Class &self, + std::vector>& individuals, + const Parameters& params) { if (individuals.size() != params.pop_size) { - throw std::runtime_error("Individual vector has different number of individuals than pop_size. When calling variation, they should be the same. popsize is "+to_string(params.pop_size)+", number of individuals is " + to_string(individuals.size())); + string msg = "Individual vector has different number of " + "individuals than pop_size. When calling " + "variation, they should be the same. popsize is "+ + to_string(params.pop_size)+", number of " + "individuals is "+to_string(individuals.size()); + + throw std::runtime_error(msg); } auto pop = br::Pop::Population(); @@ -37,10 +44,12 @@ void bind_variation(py::module& m, string name) for (int island = 0; island < params.num_islands; ++island) { - // I am assuming the individual vector passed as argument will contain the selected parents already + // I am assuming the individual vector passed as argument + // will contain the selected parents already vector parents = pop.get_island_indexes(island); - // including offspring indexes (the vary method will store the offspring in the second half of the index vector) + // including offspring indexes (the vary method will store the + // offspring in the second half of the index vector) pop.add_offspring_indexes(island); self.vary(pop, island, parents, params); @@ -53,10 +62,8 @@ void bind_variation(py::module& m, string name) // this is where the offspring is saved pool.push_back(pop[indices.at(i)]); } - } - - // returns references + } return pool; }) ; -} \ No newline at end of file +} diff --git a/src/data/data.h b/src/data/data.h index 7dde291b..e145957d 100644 --- a/src/data/data.h +++ b/src/data/data.h @@ -98,6 +98,8 @@ class Dataset const vector& vn = {} ); + // TODO: let the user specify the datatypes + /// turns input into a feature map, with feature types copied from a reference map copy_and_make_features(const ArrayXXf& X, const Dataset& ref_dataset, diff --git a/src/eval/metrics.h b/src/eval/metrics.h index fab075d1..5f4439f9 100644 --- a/src/eval/metrics.h +++ b/src/eval/metrics.h @@ -12,8 +12,6 @@ namespace Eval { float mse(const VectorXf& y, const VectorXf& yhat, VectorXf& loss, const vector& class_weights=vector() ); -// TODO: test cases for the metrics - /// log loss (2 methods below) VectorXf log_loss(const VectorXf& y, const VectorXf& predict_proba, const vector& class_weights=vector()); diff --git a/src/ind/fitness.cpp b/src/ind/fitness.cpp index 9e2de1ca..e3bd2d59 100644 --- a/src/ind/fitness.cpp +++ b/src/ind/fitness.cpp @@ -1 +1,73 @@ -#include "fitness.h" \ No newline at end of file +#include "fitness.h" + +namespace Brush +{ + +void to_json(json &j, const Fitness &f) +{ + j = json{ + {"values", f.values}, + {"weights", f.weights}, + {"wvalues", f.wvalues}, + {"loss", f.loss}, + {"loss_v", f.loss_v}, + {"complexity", f.complexity}, + {"size", f.size}, + {"depth", f.depth}, + {"dcounter", f.dcounter}, + {"dominated", f.dominated}, + {"rank", f.rank}, + {"crowding_dist", f.crowding_dist} + }; +} + +void from_json(const json &j, Fitness& f) +{ + j.at("values").get_to( f.values ); + j.at("weights").get_to( f.weights ); + j.at("wvalues").get_to( f.wvalues ); + j.at("loss").get_to( f.loss ); + j.at("loss_v").get_to( f.loss_v ); + j.at("complexity").get_to( f.complexity ); + j.at("size").get_to( f.size ); + j.at("depth").get_to( f.depth ); + j.at("dcounter").get_to( f.dcounter ); + j.at("dominated").get_to( f.dominated ); + j.at("rank").get_to( f.rank ); + j.at("crowding_dist").get_to( f.crowding_dist ); +} + + +int Fitness::dominates(const Fitness& b) const +{ + int flag1 = 0, // to check if this has a better objective + flag2 = 0; // to check if b has a better objective + + // TODO: replace comparison of individual values by using the overloaded operators (here and in nsga2) + for (int i=0; i b.get_wvalues().at(i) + || std::isnan(b.get_wvalues().at(i)) + ) + flag1 = 1; + if (get_wvalues().at(i) < b.get_wvalues().at(i) + || std::isnan(get_wvalues().at(i)) + ) + flag2 = 1; + } + + // the proper way of comparing weighted values is considering everything as a maximization problem + // (this is like deap does, and our fitness is inspired by them) + if (flag1==1 && flag2==0) + // there is at least one smaller objective for this and none + // for b + return 1; + else if (flag1==0 && flag2==1) + // there is at least one smaller objective for b and none + // for this + return -1; + else + // no smaller objective or both have one smaller + return 0; +} + +} // Brush \ No newline at end of file diff --git a/src/ind/fitness.h b/src/ind/fitness.h index abbcccf4..db0885e8 100644 --- a/src/ind/fitness.h +++ b/src/ind/fitness.h @@ -3,25 +3,15 @@ #include #include "../init.h" - +#include "../util/utils.h" using namespace nlohmann; - -template <> // this is intended to be used with DEAP (so our brush individuals can be hashed and compared to each other in python side) -struct std::hash> { - std::size_t operator()(const std::vector& v) const { - std::size_t seed = v.size(); - for (const auto& elem : v) { - seed ^= std::hash{}(elem) + 0x9e3779b9 + (seed << 6) + (seed >> 2); - } - return seed; - } -}; - namespace Brush{ + struct Fitness { // the loss is used in evolutionary functions + float loss; ///< aggregate loss score float loss_v; ///< aggregate validation loss score @@ -163,18 +153,27 @@ struct Fitness { // String representation std::string toString() const { if (valid()) { - return "TODO: implement string representation"; //std::to_string(wvalues); + string s = "Fitness("; + for (auto& v : values) + s += to_string(v) + " "; + return s+")"; } else { - return "Tuple()"; + return "Fitness()"; } } // Representation for debugging std::string repr() const { - return "TODO: implement string representation"; + if (valid()) { + string s = "Fitness("; + for (auto& v : values) + s += to_string(v) + " "; + return s+")"; + } else { + return "Fitness()"; + } } - /// set obj vector given a string of objective names int dominates(const Fitness& b) const; }; diff --git a/src/ind/individual.cpp b/src/ind/individual.cpp index 6d48b2a7..a08668c0 100644 --- a/src/ind/individual.cpp +++ b/src/ind/individual.cpp @@ -1,75 +1,6 @@ #include "individual.h" namespace Brush{ - -void to_json(json &j, const Fitness &f) -{ - j = json{ - {"values", f.values}, - {"weights", f.weights}, - {"wvalues", f.wvalues}, - {"loss", f.loss}, - {"loss_v", f.loss_v}, - {"complexity", f.complexity}, - {"size", f.size}, - {"depth", f.depth}, - {"dcounter", f.dcounter}, - {"dominated", f.dominated}, - {"rank", f.rank}, - {"crowding_dist", f.crowding_dist} - }; -} - -void from_json(const json &j, Fitness& f) -{ - j.at("values").get_to( f.values ); - j.at("weights").get_to( f.weights ); - j.at("wvalues").get_to( f.wvalues ); - j.at("loss").get_to( f.loss ); - j.at("loss_v").get_to( f.loss_v ); - j.at("complexity").get_to( f.complexity ); - j.at("size").get_to( f.size ); - j.at("depth").get_to( f.depth ); - j.at("dcounter").get_to( f.dcounter ); - j.at("dominated").get_to( f.dominated ); - j.at("rank").get_to( f.rank ); - j.at("crowding_dist").get_to( f.crowding_dist ); -} - - -int Fitness::dominates(const Fitness& b) const -{ - int flag1 = 0, // to check if this has a better objective - flag2 = 0; // to check if b has a better objective - - // TODO: replace comparison of individual values by using the overloaded operators (here and in nsga2) - for (int i=0; i b.get_wvalues().at(i) - || std::isnan(b.get_wvalues().at(i)) - ) - flag1 = 1; - if (get_wvalues().at(i) < b.get_wvalues().at(i) - || std::isnan(get_wvalues().at(i)) - ) - flag2 = 1; - } - - // the proper way of comparing weighted values is considering everything as a maximization problem - // (this is like deap does, and our fitness is inspired by them) - if (flag1==1 && flag2==0) - // there is at least one smaller objective for this and none - // for b - return 1; - else if (flag1==0 && flag2==1) - // there is at least one smaller objective for b and none - // for this - return -1; - else - // no smaller objective or both have one smaller - return 0; -} - - namespace Pop{ diff --git a/src/ind/individual.h b/src/ind/individual.h index f411bd75..472b9f05 100644 --- a/src/ind/individual.h +++ b/src/ind/individual.h @@ -83,11 +83,15 @@ class Individual{ // just getters bool get_is_fitted() const { return this->is_fitted_; }; - string get_model() const { return program.get_model(); }; unsigned int get_size() const { return program.size(); }; unsigned int get_depth() const { return program.depth(); }; unsigned int get_complexity() const { return program.complexity(); }; Program& get_program() { return program; }; + + string get_model(string fmt="compact", bool pretty=false) { + return program.get_model(fmt, pretty); }; + string get_dot_model(string extras="") { + return program.get_dot_model(extras); }; void set_fitness(Fitness &f) { fitness=f; }; Fitness& get_fitness() { return fitness; }; diff --git a/src/pop/population.h b/src/pop/population.h index 98cf5d7f..6871c6e8 100644 --- a/src/pop/population.h +++ b/src/pop/population.h @@ -1,43 +1,10 @@ #ifndef POPULATION_H #define POPULATION_H +#include "../util/utils.h" #include "../util/error.h" #include "../ind/individual.h" -// TODO: move this serialization elsewhere -// serializing vector of shared ptr: https://github.com/nlohmann/json/discussions/2377 -// (this is used by population, which has a shared_ptr vector) -namespace nlohmann -{ -template -struct adl_serializer> -{ - static void to_json(json& j, const std::shared_ptr& opt) - { - if (opt) - { - j = *opt; - } - else - { - j = nullptr; - } - } - - static void from_json(const json& j, std::shared_ptr& opt) - { - if (j.is_null()) - { - opt = nullptr; - } - else - { - opt.reset(new T(j.get())); - } - } -}; -} - namespace Brush { namespace Pop { diff --git a/src/program/node.h b/src/program/node.h index 8092664b..a6265f31 100644 --- a/src/program/node.h +++ b/src/program/node.h @@ -39,7 +39,6 @@ using Brush::Data::Dataset; namespace Brush{ -// TODO: should I move this declaration to another place? template inline auto Isnt(DataType dt) -> bool { return !((dt == T) || ...); } diff --git a/src/util/utils.h b/src/util/utils.h index 932a0cca..f767e653 100644 --- a/src/util/utils.h +++ b/src/util/utils.h @@ -27,6 +27,54 @@ using namespace std; * @brief namespace containing various utility functions */ +// serializing vector of shared ptr: https://github.com/nlohmann/json/discussions/2377 +// (used in population.h, which has a shared_ptr vector) +namespace nlohmann +{ +template +struct adl_serializer> +{ + static void to_json(json& j, const std::shared_ptr& opt) + { + if (opt) + { + j = *opt; + } + else + { + j = nullptr; + } + } + + static void from_json(const json& j, std::shared_ptr& opt) + { + if (j.is_null()) + { + opt = nullptr; + } + else + { + opt.reset(new T(j.get())); + } + } +}; +} + +// to overload operators and compare our individuals, we need to be able to +// serialize vectors. +// this is intended to be used with DEAP (so our brush individuals +// can be hashed and compared to each other in python side) +template <> +struct std::hash> { + std::size_t operator()(const std::vector& v) const { + std::size_t seed = v.size(); + for (const auto& elem : v) { + seed ^= std::hash{}(elem) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; + } +}; + // namespace std // { diff --git a/src/vary/search_space.cpp b/src/vary/search_space.cpp index 943f3821..95a9cf0b 100644 --- a/src/vary/search_space.cpp +++ b/src/vary/search_space.cpp @@ -276,10 +276,6 @@ tree& SearchSpace::PTC2(tree& Tree, // parameters, the real maximum size that can occur is `max_size` plus the // highest operator arity, and the real maximum depth is `max_depth` plus one. - // auto Tree = tree(); - - // fmt::print("building program with max size {}, max depth {}",max_size,max_d); - // Queue of nodes that need children vector> queue; @@ -290,9 +286,6 @@ tree& SearchSpace::PTC2(tree& Tree, Node root = spot.node->data; - // cout << "root " << root.name << endl; - // auto spot = Tree.set_head(n); - // updating size accordingly to root node if (Is(root.node_type)) s += 3; @@ -315,8 +308,6 @@ tree& SearchSpace::PTC2(tree& Tree, Node n; // Now we actually start the PTC2 procedure to create the program tree - // cout << "queue size: " << queue.size() << endl; - // cout << "entering first while loop...\n"; while ( queue.size() + s < max_size && queue.size() > 0) { // including the queue size in the max_size, since each element in queue @@ -334,12 +325,6 @@ tree& SearchSpace::PTC2(tree& Tree, // cout << "current depth: " << d << endl; if (d >= max_d || s >= max_size) { - // choose terminal of matching type - // cout << "getting " << DataTypeName[t] << " terminal\n"; - // qspot = sample_terminal(t); - // Tree.replace(qspot, sample_terminal(t)); - // Tree.append_child(qspot, sample_terminal(t)); - auto opt = sample_terminal(t); // if it returned optional, then there's nothing to sample based on weights. @@ -355,11 +340,7 @@ tree& SearchSpace::PTC2(tree& Tree, else { //choose a nonterminal of matching type - // cout << "getting op of type " << DataTypeName[t] << endl; auto opt = sample_op(t); - // cout << "chose " << n.name << endl; - // TreeIter new_spot = Tree.append_child(qspot, n); - // qspot = n; if (!opt) { // there is no operator for this node. sample a terminal instead opt = sample_terminal(t); @@ -380,8 +361,6 @@ tree& SearchSpace::PTC2(tree& Tree, // For each arg of n, add to queue for (auto a : n.arg_types) { - // cout << "queing a node of type " << DataTypeName[a] << endl; - // queue.push_back(make_tuple(new_spot, a, d+1)); auto child_spot = Tree.append_child(newspot); queue.push_back(make_tuple(child_spot, a, d+1)); @@ -399,25 +378,15 @@ tree& SearchSpace::PTC2(tree& Tree, if ( n.get_is_weighted()==true && Isnt(n.node_type) ) s += 2; - - // cout << "current tree size: " << s << endl; } - // cout << "entering second while loop...\n"; while (queue.size() > 0) { if (queue.size() == 0) break; - // cout << "queue size: " << queue.size() << endl; - auto [qspot, t, d] = RandomDequeue(queue); - // cout << "getting " << DataTypeName[t] << " terminal\n"; - // Tree.append_child(qspot, sample_terminal(t)); - // qspot = sample_terminal(t); - // auto newspot = Tree.replace(qspot, sample_terminal(t)); - auto opt = sample_terminal(t); if (!opt) opt = sample_terminal(t, true); @@ -426,11 +395,6 @@ tree& SearchSpace::PTC2(tree& Tree, auto newspot = Tree.replace(qspot, n); } - - // cout << "final tree:\n" - // << Tree.begin().node->get_model() << "\n" - // << Tree.begin().node->get_tree_model(true) << endl; - return Tree; };