Brush C++ API
A flexible interpretable machine learning framework
Loading...
Searching...
No Matches
engine.cpp
Go to the documentation of this file.
1#include "engine.h"
2
3#include <iostream>
4#include <fstream>
5
6namespace Brush{
7
8using namespace Pop;
9using namespace Sel;
10using namespace Eval;
11using namespace Var;
12using namespace MAB;
13
15template <ProgramType T>
17{
18 r.set_seed(params.get_random_state());
19
20 set_is_fitted(false);
21
22 this->pop = Population<T>();
23 this->evaluator = Evaluation<T>();
24 this->selector = Selection<T>(params.sel, false);
25 this->survivor = Selection<T>(params.surv, true);
26
27 this->archive.set_objectives(params.get_objectives());
28
29 timer.Reset();
30
31 // reset statistics
32 this->stats = Log_Stats();
33}
34
35template <ProgramType T>
36void Engine<T>::print_progress(float percentage)
37{
38 int val = (int) (percentage * 100);
39 int lpad = (int) (percentage * PBWIDTH);
40 int rpad = PBWIDTH - lpad;
41
42 printf ("\rCompleted %3d%% [%.*s%*s]", val, lpad, PBSTR.c_str(), rpad, "");
43
44 fflush (stdout);
45
46 if(val == 100)
47 cout << "\n";
48}
49
50
51template <ProgramType T>
53{
54 int pop_size = 0;
55 for (int island=0; island<params.num_islands; ++island)
56 {
57 auto indices = pop.island_indexes.at(island);
58 pop_size += indices.size();
59 }
60
61 ArrayXf scores(pop_size);
62 ArrayXf scores_v(pop_size);
64 // TODO: change all size_t to unsigned?
65 ArrayXi sizes(pop_size);
66 ArrayXi complexities(pop_size);
67
68 float error_weight = Individual<T>::weightsMap[params.scorer];
69
70 int index = 0;
71 for (int island=0; island<params.num_islands; ++island)
72 {
73 auto indices = pop.island_indexes.at(island);
74 for (unsigned int i=0; i<indices.size(); ++i)
75 {
76 const auto& p = this->pop.individuals.at(indices[i]);
78 // Fitness class will store every information that can be used as
79 // fitness. you just need to access them. Multiplying by weight
80 // so we can find best score. From Fitness::dominates:
81 // the proper way of comparing weighted values is considering
82 // everything as a maximization problem
83 scores(index) = p->fitness.get_loss();
84 scores_v(index) = p->fitness.get_loss_v();
85 sizes(index) = p->get_size();
86 complexities(index) = p->get_complexity();
87 ++index;
88 }
89 }
90
91 assert (pop_size == this->params.pop_size);
92
93 // Multiply by weight to make it a maximization problem.
94 // Then, multiply again to get rid of signal
95 float best_score = (scores*error_weight).maxCoeff()*error_weight;
96 float best_score_v = this->best_ind.fitness.get_loss_v();
97 float med_score = median(scores);
98 float med_score_v = median(scores_v);
99 unsigned med_size = median(sizes);
100 unsigned med_complexity = median(complexities);
101 unsigned max_size = sizes.maxCoeff();
102 unsigned max_complexity = complexities.maxCoeff();
103
104 // update stats
105 stats.update(params.current_gen,
106 timer.Elapsed().count(),
107 best_score,
108 best_score_v,
109 med_score,
110 med_score_v,
111 med_size,
112 med_complexity,
113 max_size,
114 max_complexity);
115}
118template <ProgramType T>
119void Engine<T>::log_stats(std::ofstream& log)
120{
121 // print stats in tabular format
122 string sep = ",";
123 if (params.current_gen == 0) // print header
124 {
125 log << "generation" << sep
126 << "time" << sep
127 << "best_score" << sep
128 << "best_score_val" << sep
129 << "med_score" << sep
130 << "med_score_val" << sep
131 << "med_size" << sep
132 << "med_complexity" << sep
133 << "max_size" << sep
134 << "max_complexity" << "\n";
135 }
136 log << params.current_gen << sep
137 << timer.Elapsed().count() << sep
138 << stats.best_score.back() << sep
139 << stats.best_score_v.back() << sep
140 << stats.med_score.back() << sep
141 << stats.med_score_v.back() << sep
142 << stats.med_size.back() << sep
143 << stats.med_complexity.back() << sep
144 << stats.max_size.back() << sep
145 << stats.max_complexity.back() << "\n";
146}
147
148template <ProgramType T>
149void Engine<T>::print_stats(std::ofstream& log, float fraction)
150{
151 // progress bar
152 string bar, space = "";
153 for (unsigned int i = 0; i<50; ++i)
154 {
155 if (i <= 50*fraction) bar += "/";
156 else space += " ";
157 }
158
159 std::cout.precision(5);
160 std::cout << std::scientific;
161
162 if(params.max_time == -1)
163 std::cout << "Generation " << params.current_gen+1 << "/"
164 << params.max_gens << " [" + bar + space + "]\n";
165 else
166 std::cout << std::fixed << "Time elapsed "<< timer
167 << "/" << params.max_time
168 << " seconds (Generation "<< params.current_gen+1
169 << ") [" + bar + space + "]\n";
170
171 std::cout << std::fixed
172 << "Best model on Val:" << best_ind.program.get_model() << "\n"
173 << "Train Loss (Med): " << stats.best_score.back() << " (" << stats.med_score.back() << ")\n"
174 << "Val Loss (Med): " << stats.best_score_v.back() << " (" << stats.med_score_v.back() << ")\n"
175 << "Median Size (Max): " << stats.med_size.back() << " (" << stats.max_size.back() << ")\n"
176 << "Median complexity (Max): " << stats.med_complexity.back() << " (" << stats.max_complexity.back() << ")\n"
177 << "Time (s): " << timer
178 <<"\n\n";
179}
180
181template <ProgramType T>
183{
184 vector<json> archive_vector;
185 for (const auto& ind : archive.individuals) {
186 json j;
187 to_json(j, ind);
188 archive_vector.push_back(j);
189 }
190
191 return archive_vector;
192}
193
194// TODO: dont have a get_pop and get_serialized_pop --> make them the same name but overloaded.
195// Also fix this in the pybind wrapper
196template <ProgramType T>
198{
199 vector<json> pop_vector;
200 for (const auto& ind : pop.individuals) {
201 if (ind == nullptr) {
202 // HANDLE_ERROR_THROW("get_population found a nullptr individual");
203 continue;
204 }
205
206 json j;
207 to_json(j, *ind);
208 pop_vector.push_back(j);
209 }
210
211 if(pop_vector.size() != params.pop_size)
212 HANDLE_ERROR_THROW("Population size is different from pop_size");
213
214 return pop_vector;
215}
216
217template <ProgramType T>
218vector<Individual<T>> Engine<T>::get_archive()
219{
220 vector<Individual<T>> archive_vector;
221 for (const auto& ind : archive.individuals) {
222 archive_vector.push_back(ind);
223 }
224
225 return archive_vector;
226}
227
228template <ProgramType T>
229vector<Individual<T>> Engine<T>::get_population()
230{
231 vector<Individual<T>> pop_vector;
232 for (const auto& ind : pop.individuals) {
233 if (ind == nullptr) {
234 continue;
235 }
236 pop_vector.push_back(*ind);
237 }
238
239 if(pop_vector.size() != params.pop_size)
240 HANDLE_ERROR_THROW("Population size is different from pop_size");
241
242 return pop_vector;
243}
244
245template <ProgramType T>
246void Engine<T>::set_population_from_json(vector<json> pop_vector)
247{
248 vector<Individual<T>> new_pop;
249
250 // load serialized individuals
251 for (const auto& ind_j : pop_vector) {
252 Individual<T> ind;
253
254 // deserialize individual
255 from_json(ind_j, ind);
256
257 // set reference to search space
259
260 new_pop.push_back(ind);
261 }
262
263 // check if size matches
264 if(new_pop.size() != params.pop_size)
265 HANDLE_ERROR_THROW("set_population size is different from params.pop_size");
266
267 // re-initialize population
268 this->pop.init(new_pop, params);
269}
270
271template <ProgramType T>
273{
274 vector<Individual<T>> new_pop;
275 for (const auto& ind_j : pop_vector) {
276 Individual<T> ind;
277 ind.program = ind_j.program;
278 ind.set_objectives(ind_j.get_objectives());
279
280 new_pop.push_back(ind);
281 }
282
283 if(new_pop.size() != params.pop_size)
284 HANDLE_ERROR_THROW("set_population size is different from params.pop_size");
285
286 this->pop.init(new_pop, params);
287}
288
289
290template <ProgramType T>
291void Engine<T>::lock_nodes(int end_depth, bool keep_leaves_unlocked)
292{
293 // iterate over the population, locking the program's tree nodes
294 for (int island=0; island<pop.num_islands; ++island) {
295 auto indices = pop.get_island_indexes(island);
296
297 for (unsigned i = 0; i<indices.size(); ++i)
298 {
299 const auto& ind = pop.individuals.at(indices.at(i));
300 ind->program.lock_nodes(end_depth, keep_leaves_unlocked);
301 }
302 }
303}
304
305template <ProgramType T>
307{
308 bool updated = false;
309 bool passed;
310
311 vector<size_t> merged_islands(0);
312 for (int j=0;j<pop.num_islands; ++j)
313 {
314 auto indices = pop.island_indexes.at(j);
315 for (int i=0; i<indices.size(); ++i)
316 {
317 merged_islands.push_back(indices.at(i));
318 }
319 }
320
321 for (int i=0; i < merged_islands.size(); ++i)
322 {
323 const auto& ind = *pop.individuals.at(merged_islands[i]);
324
325 // TODO: use intermediary variables for wvalues
326 // Iterate over the weighted values to compare (everything is a maximization problem here)
327 passed = false;
328 for (size_t j = 0; j < ind.fitness.get_wvalues().size(); ++j) {
329 if (ind.fitness.get_wvalues()[j] > this->best_ind.fitness.get_wvalues()[j]) {
330 passed = true;
331 break;
332 }
333 if (ind.fitness.get_wvalues()[j] < this->best_ind.fitness.get_wvalues()[j]) {
334 // it is not better, and it is also not equal. So, it is worse. Stop here.
335 break;
336 }
337 // if no break, then its equal, so we keep going
338 }
339
340 if (passed)
341 {
342 this->best_ind = ind;
343 updated = true;
344 }
345 }
346
347 return updated;
348}
349
350
351template <ProgramType T>
353{
354 // avoid re-initializing stuff so we can perform partial fits
355 if (!this->is_fitted){
356 //TODO: i need to make sure i initialize everything (pybind needs to have constructors
357 // without arguments to work, and i need to handle correcting these values before running)
358
359 // initializing classes that need data references
360 this->ss.init(data, params.functions, params.weights_init);
361
362 // TODO: make init to take necessary arguments and perform all initializations inside that function
363 this->init();
364
365 if (params.load_population != "") {
366 this->pop.load(params.load_population);
367 }
368 else if (this->pop.individuals.size() == 0)
369 {
370 // This only works because the Population constructor resizes individuals to zero.
371 this->pop.init(this->ss, this->params);
372 }
373 }
374
375 // TODO: Should I refit them? or use the values at it is? (the fitness WILL BE recalculated regardless)
376 // invalidating all individuals (so they are fitted with current data)
377 // for (auto& individual : this->pop.individuals) {
378 // if (individual != nullptr) {
379 // // will force re-fit and calc all fitness information
380 // individual->set_is_fitted(false);
381 // }
382 // }
383
384 // This is data dependent so we initialize it everytime, regardless of partial fit
385 // TODO: make variator have a default constructor and make it an attribute of engine
386 Variation<T> variator = Variation<T>(this->params, this->ss, data);
387
388 // log file stream
389 std::ofstream log;
390 if (!params.logfile.empty())
391 log.open(params.logfile, std::ofstream::app);
392
393 evaluator.set_scorer(params.scorer);
394
395 Dataset &batch = data;
396
397 int threads;
398 if (params.n_jobs == -1)
399 threads = std::thread::hardware_concurrency();
400 else if (params.n_jobs == 0)
401 threads = params.num_islands;
402 else
403 threads = params.n_jobs;
404
405 tf::Executor executor(threads);
406
407 assert( (executor.num_workers() > 0) && "Invalid number of workers");
408
409 tf::Taskflow taskflow;
410
411 // stop criteria
412 unsigned generation = 0;
413 unsigned stall_count = 0;
414 float fraction = 0;
415
416 auto stop = [&]() {
417 bool condition = ( (generation == params.max_gens)
418 || (params.max_stall != 0 && stall_count > params.max_stall)
419 || (params.max_time != -1 && timer.Elapsed().count() > params.max_time)
420 );
421
422 return condition;
423 };
424
425 // TODO: check that I dont use pop.size() (or I use correctly, because it will return the size with the slots for the offspring)
426 // vectors to store each island separatedly
427 vector<vector<size_t>> island_parents;
428
429 island_parents.clear();
430 island_parents.resize(pop.num_islands);
431
432 for (int i=0; i< params.num_islands; i++){
433 size_t idx_start = std::floor(i*params.pop_size/params.num_islands);
434 size_t idx_end = std::floor((i+1)*params.pop_size/params.num_islands);
435
436 auto delta = idx_end - idx_start;
437
438 island_parents.at(i).clear();
439 island_parents.at(i).resize(delta);
440 }
441
442 // heavily inspired in https://github.com/heal-research/operon/blob/main/source/algorithms/nsga2.cpp
443 auto [init, cond, body, back, done] = taskflow.emplace(
444 [&](tf::Subflow& subflow) {
445 auto fit_init_pop = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) {
446 // Evaluate the individuals at least once
447 // Set validation loss before calling update best
448
449 evaluator.update_fitness(this->pop, island, data, params, true, true);
450 });
451 auto find_init_best = subflow.emplace([&]() {
452 // Make sure we initialize it. We do this update here because we need to
453 // have the individuals fitted before we can compare them. When update_best
454 // is called, we are garanteed that the individuals are fitted and have valid
455 // fitnesses.
456 this->best_ind = *pop.individuals.at(0);
457 this->update_best(); // at this moment we dont care about update_best return value
458 });
459 fit_init_pop.precede(find_init_best);
460 }, // init (entry point for taskflow)
461
462 stop, // loop condition
463
464 [&](tf::Subflow& subflow) { // loop body (evolutionary main loop)
465 auto prepare_gen = subflow.emplace([&]() {
466 params.set_current_gen(generation);
467 batch = data.get_batch(); // will return the original dataset if it is set to dont use batch
468 }).name("prepare generation");// set generation in params, get batch
469
470 auto run_generation = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) {
471
472 evaluator.update_fitness(this->pop, island, data, params, false, false); // fit the weights with all training data
473
474 // TODO: have some way to set which fitness to use (for example in params, or it can infer based on split size idk)
475 // TODO: if using batch, fitness should be called before selection to set the batch
476 if (data.use_batch) // assign the batch error as fitness (but fit was done with training data)
477 evaluator.update_fitness(this->pop, island, batch, params, false, false);
478
479 vector<size_t> parents = selector.select(this->pop, island, params);
480 for (int i=0; i< parents.size(); i++){
481 island_parents.at(island).at(i) = parents.at(i);
482 }
483
484 this->pop.add_offspring_indexes(island);
485
486 }).name("runs one generation at each island in parallel");
487
488 auto update_pop = subflow.emplace([&]() { // sync point
489 // Variation is not thread safe.
490 // TODO: optimize this and make it work with multiple islands in parallel.
491 for (int island = 0; island < this->params.num_islands; ++island) {
492
493 // TODO: do I have to pass data as an argument here? or can I use the instance reference
494 variator.vary_and_update(this->pop, island, island_parents.at(island),
495 data, evaluator,
496
497 // conditions to apply simplification.
498 // It starts only on the second half of generations,
499 // and it is not applied every generation.
500 // Also, we garantee that the final generation
501 // will be simplified.
502 (generation>=params.max_gens/2) || (stall_count == params.max_stall-1)
503 );
504 }
505
506 // select survivors from combined pool of parents and offspring.
507 // if the same individual exists in different islands, then it will be selected several times and the pareto front will have less diversity.
508 // to avoid this, survive should be unified
509 // TODO: survivor should still take params?
510 // TODO: RETURN SINGLE VECTOR and stop wrapping it into a single-element vector
511
512 auto survivor_indices = survivor.survive(this->pop, 0, params);
513
514 // TODO: do i need these next this-> pointers?
515 variator.update_ss();
516 this->pop.update({survivor_indices});
517 this->pop.migrate();
518 }).name("update, migrate and disentangle indexes between islands");
519
520 auto finish_gen = subflow.emplace([&]() {
521 // Set validation loss before calling update best
522 for (int island = 0; island < this->params.num_islands; ++island) {
523 evaluator.update_fitness(this->pop, island, data, params, false, true);
524 }
525
526 archive.update(pop, params);
527
528 bool updated_best = this->update_best();
529
530 fraction = params.max_time == -1 ? ((generation+1)*1.0)/params.max_gens :
531 timer.Elapsed().count()/params.max_time;
532
533 if ( params.verbosity>1 || !params.logfile.empty()) {
535 }
536
537 if(params.verbosity>1)
538 print_stats(log, fraction);
539 else if(params.verbosity == 1)
540 print_progress(fraction);
541
542 if (!params.logfile.empty())
543 log_stats(log);
544
545 if (generation == 0 || updated_best )
546 stall_count = 0;
547 else
548 ++stall_count;
549
550 ++generation;
551
552 }).name("update best, update ss, log, archive, stall");
553
554 // set-up subflow graph
555 prepare_gen.precede(run_generation);
556 run_generation.precede(update_pop);
557 update_pop.precede(finish_gen);
558 },
559
560 [&]() { return 0; }, // jump back to the next iteration
561
562 [&](tf::Subflow& subflow) {
563 // set VALIDATION loss for archive, without refitting the model
564 for (int island = 0; island < this->params.num_islands; ++island) {
565 evaluator.update_fitness(this->pop, island, data, params, false, true);
566 }
567
568 archive.update(pop, params);
569 // calculate_stats();
570
571 if (params.save_population != "")
572 this->pop.save(params.save_population);
573
574 set_is_fitted(true);
575
576 if (!params.logfile.empty()) {
577 std::ofstream log_simplification;
578 log_simplification.open(params.logfile+"simplification_table", std::ofstream::app);
579 variator.log_simplification_table(log_simplification);
580
581 log_simplification.close();
582 }
583
584 // TODO: open, write, close? (to avoid breaking the file and allow some debugging if things dont work well)
585 if (log.is_open())
586 log.close();
587
588 // getting the updated versions
589 this->ss = variator.search_space;
590 this->params = variator.parameters;
591
592 } // work done, report last gen and stop
593 ); // evolutionary loop
594
595 init.name("init");
596 cond.name("termination");
597 body.name("main loop");
598 back.name("back");
599 done.name("done");
600 taskflow.name("island_gp");
601
602 init.precede(cond);
603 cond.precede(body, done);
604 body.precede(back);
605 back.precede(cond);
606
607 executor.run(taskflow);
608 executor.wait_for_all();
609
610 //When you have tasks that are created at runtime (e.g., subflow,
611 // cudaFlow), you need to execute the graph first to spawn these tasks and dump the entire graph.
612}
613}
holds variable type data.
Definition data.h:51
Dataset get_batch() const
select random subset of data for training weights.
Definition data.cpp:170
Selection< T > selector
selection algorithm
Definition engine.h:143
Timer timer
start time of training
Definition engine.h:149
bool is_fitted
keeps track of whether fit was called
Definition engine.h:141
void calculate_stats()
Definition engine.cpp:52
Individual< T > best_ind
best individual found during training
Definition engine.h:137
void print_progress(float percentage)
Definition engine.cpp:36
void run(Dataset &d)
train the model
Definition engine.cpp:352
Log_Stats stats
runtime stats
Definition engine.h:147
Archive< T > archive
pareto front archive
Definition engine.h:138
Population< T > pop
population of programs
Definition engine.h:139
void set_population(vector< Individual< T > > pop_vector)
Definition engine.cpp:272
Selection< T > survivor
survival algorithm
Definition engine.h:145
void set_is_fitted(bool f)
set flag indicating whether fit has been called
Definition engine.h:154
vector< json > get_archive_as_json()
return archive/population as strings
Definition engine.cpp:182
void lock_nodes(int end_depth=0, bool keep_leaves_unlocked=true)
Definition engine.cpp:291
Evaluation< T > evaluator
evaluation code
Definition engine.h:144
Parameters params
hyperparameters of brush, which the user can interact
Definition engine.h:134
vector< Individual< T > > get_population()
Definition engine.cpp:229
void init()
initialize Feat object for fitting.
Definition engine.cpp:16
vector< json > get_population_as_json()
Definition engine.cpp:197
vector< Individual< T > > get_archive()
return archive/popularion as individuals ready to use
Definition engine.cpp:218
bool update_best()
updates best score by searching in the population for the individual that best fits the given data
Definition engine.cpp:306
void log_stats(std::ofstream &log)
Definition engine.cpp:119
void print_stats(std::ofstream &log, float fraction)
Definition engine.cpp:149
SearchSpace ss
Definition engine.h:135
void set_population_from_json(vector< json > pop_vector)
Definition engine.cpp:246
Class for evaluating the fitness of individuals in a population.
Definition evaluation.h:27
static std::map< std::string, float > weightsMap
set parent ids using id values
Definition individual.h:139
void set_objectives(vector< string > objs)
Definition individual.h:153
Program< T > program
executable data structure
Definition individual.h:17
Class representing the variation operators in Brush.
Definition variation.h:44
void vary_and_update(Population< T > &pop, int island, const vector< size_t > &parents, const Dataset &data, Evaluation< T > &evaluator, bool do_simplification)
Varies a population and updates the selection strategy based on rewards.
Definition variation.h:220
#define HANDLE_ERROR_THROW(err)
Definition error.h:27
Scalar median(const T &v)
calculate median
Definition utils.h:202
static Rnd & r
Definition rnd.h:176
string PBSTR
Definition utils.cpp:13
int PBWIDTH
Definition utils.cpp:14
< nsga2 selection operator for getting the front
Definition bandit.cpp:4
void from_json(const json &j, Fitness &f)
Definition fitness.cpp:31
void to_json(json &j, const Fitness &f)
Definition fitness.cpp:6
Eigen::Array< int, Eigen::Dynamic, 1 > ArrayXi
Definition types.h:40
void set_search_space(const std::reference_wrapper< SearchSpace > s)
Definition program.h:86
interfaces with selection operators.
Definition selection.h:25