Brush C++ API
A flexible interpretable machine learning framework
Loading...
Searching...
No Matches
engine.cpp
Go to the documentation of this file.
1#include "engine.h"
2
3
4#include <iostream>
5#include <fstream>
6
7
8namespace Brush{
9
10
11using namespace Pop;
12using namespace Sel;
13using namespace Eval;
14using namespace Var;
15
17template <ProgramType T>
19{
20 r.set_seed(params.get_random_state());
21
22 set_is_fitted(false);
23
24 this->pop = Population<T>();
25
26 this->evaluator = Evaluation<T>();
27
28 // TODO: make these classes have a default constructor, and stop recreating instances
29 this->variator.init(params, ss);
30
31 this->selector = Selection<T>(params.sel, false);
32 this->survivor = Selection<T>(params.surv, true);
33
34 this->best_score = MAX_FLT;
35 this->best_complexity = MAX_FLT;
36
37 this->archive.set_objectives(params.objectives);
38
39 timer.Reset();
40
41 // reset statistics
42 this->stats = Log_Stats();
43}
44
45template <ProgramType T>
47{
48 int val = (int) (percentage * 100);
49 int lpad = (int) (percentage * PBWIDTH);
50 int rpad = PBWIDTH - lpad;
51
52 printf ("\rCompleted %3d%% [%.*s%*s]", val, lpad, PBSTR.c_str(), rpad, "");
53
54 fflush (stdout);
55
56 if(val == 100)
57 cout << "\n";
58}
59
60
61template <ProgramType T>
63{
64 int pop_size = 0;
65 for (int island=0; island<params.num_islands; ++island)
66 {
67 auto indices = pop.island_indexes.at(island);
68 pop_size += indices.size();
69 }
70
71 ArrayXf scores(pop_size);
72 ArrayXf scores_v(pop_size);
73
74 // TODO: change all size_t to unsigned?
75 ArrayXi sizes(pop_size);
76 ArrayXi complexities(pop_size);
77
78 float error_weight = Individual<T>::weightsMap[params.scorer_];
79
80 int index = 0;
81 for (int island=0; island<params.num_islands; ++island)
82 {
83 auto indices = pop.island_indexes.at(island);
84 for (unsigned int i=0; i<indices.size(); ++i)
85 {
86 const auto& p = this->pop.individuals.at(indices[i]);
87
88 // Fitness class will store every information that can be used as
89 // fitness. you just need to access them. Multiplying by weight
90 // so we can find best score. From Fitness::dominates:
91 // the proper way of comparing weighted values is considering
92 // everything as a maximization problem
94 scores_v(index) = p->fitness.get_loss_v();
95 sizes(index) = p->get_size();
96 complexities(index) = p->get_complexity();
97 ++index;
98 }
99 }
100
101 assert (pop_size == this->params.pop_size);
102
103 // Multiply by weight to make it a maximization problem.
104 // Then, multiply again to get rid of signal
105 float best_score = (scores*error_weight).maxCoeff()*error_weight;
106 float best_score_v = (scores_v*error_weight).maxCoeff()*error_weight;
107 float med_score = median(scores);
108 float med_score_v = median(scores_v);
109 unsigned med_size = median(sizes);
110 unsigned med_complexity = median(complexities);
111 unsigned max_size = sizes.maxCoeff();
112 unsigned max_complexity = complexities.maxCoeff();
113
114 // update stats
115 stats.update(params.current_gen,
116 timer.Elapsed().count(),
117 best_score,
118 best_score_v,
119 med_score,
120 med_score_v,
121 med_size,
122 med_complexity,
123 max_size,
124 max_complexity);
125}
126
127
128template <ProgramType T>
129void Engine<T>::log_stats(std::ofstream& log)
130{
131 // print stats in tabular format
132 string sep = ",";
133 if (params.current_gen == 0) // print header
134 {
135 log << "generation" << sep
136 << "time" << sep
137 << "best_score" << sep
138 << "best_score_val" << sep
139 << "med_score" << sep
140 << "med_score_val" << sep
141 << "med_size" << sep
142 << "med_complexity" << sep
143 << "max_size" << sep
144 << "max_complexity" << "\n";
145 }
146 log << params.current_gen << sep
147 << timer.Elapsed().count() << sep
148 << stats.best_score.back() << sep
149 << stats.best_score_v.back() << sep
150 << stats.med_score.back() << sep
151 << stats.med_score_v.back() << sep
152 << stats.med_size.back() << sep
153 << stats.med_complexity.back() << sep
154 << stats.max_size.back() << sep
155 << stats.max_complexity.back() << "\n";
156}
157
158template <ProgramType T>
159void Engine<T>::print_stats(std::ofstream& log, float fraction)
160{
161 // progress bar
162 string bar, space = "";
163 for (unsigned int i = 0; i<50; ++i)
164 {
165 if (i <= 50*fraction) bar += "/";
166 else space += " ";
167 }
168
169 std::cout.precision(5);
170 std::cout << std::scientific;
171
172 if(params.max_time == -1)
173 std::cout << "Generation " << params.current_gen+1 << "/"
174 << params.max_gens << " [" + bar + space + "]\n";
175 else
176 std::cout << std::fixed << "Time elapsed "<< timer
177 << "/" << params.max_time
178 << " seconds (Generation "<< params.current_gen+1
179 << ") [" + bar + space + "]\n";
180
181 std::cout << std::fixed
182 << "Train Loss (Med): " << stats.best_score.back() << " (" << stats.med_score.back() << ")\n"
183 << "Val Loss (Med): " << stats.best_score_v.back() << " (" << stats.med_score_v.back() << ")\n"
184 << "Median Size (Max): " << stats.med_size.back() << " (" << stats.max_size.back() << ")\n"
185 << "Median complexity (Max): " << stats.med_complexity.back() << " (" << stats.max_complexity.back() << ")\n"
186 << "Time (s): " << timer
187 <<"\n\n";
188}
189
190template <ProgramType T>
192{
193 vector<json> archive_vector; // Use a vector to store serialized individuals
194
195 // TODO: use this front argument (or remove it). I think I can remove
196 for (const auto& ind : archive.individuals) {
197 json j; // Serialize each individual
198 to_json(j, ind);
199 archive_vector.push_back(j);
200 }
201
202 return archive_vector;
203}
204
205// TODO: private function called find_individual that searches for it based on id. Then,
206// use this function in predict_archive and predict_proba_archive.
207template <ProgramType T>
208auto Engine<T>::predict_archive(int id, const Dataset& data)
209{
210 if (id == best_ind.id)
211 return best_ind.predict(data);
212
213 for (int i = 0; i < this->archive.individuals.size(); ++i)
214 {
215 Individual<T>& ind = this->archive.individuals.at(i);
216
217 if (id == ind.id)
218 return ind.predict(data);
219 }
220 for (int island=0; island<pop.num_islands; ++island) {
221 auto indices = pop.get_island_indexes(island);
222
223 for (unsigned i = 0; i<indices.size(); ++i)
224 {
225 const auto& ind = pop.individuals.at(indices.at(i));
226
227 if (id == ind->id)
228 return ind->predict(data);
229 }
230 }
231
232 std::runtime_error("Could not find id = "
233 + to_string(id) + "in archive or population.");
234
235 return best_ind.predict(data);
236}
237
238template <ProgramType T>
240{
241 Dataset d(X);
242 return predict_archive(id, d);
243}
244
245template <ProgramType T>
246template <ProgramType P>
247 requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier))
249{
250 if (id == best_ind.id)
251 return best_ind.predict_proba(data);
252
253 for (int i = 0; i < this->archive.individuals.size(); ++i)
254 {
255 Individual<T>& ind = this->archive.individuals.at(i);
256
257 if (id == ind.id)
258 return ind.predict_proba(data);
259 }
260 for (int island=0; island<pop.num_islands; ++island) {
261 auto indices = pop.get_island_indexes(island);
262
263 for (unsigned i = 0; i<indices.size(); ++i)
264 {
265 const auto& ind = pop.individuals.at(indices.at(i));
266
267 if (id == ind->id)
268 return ind->predict_proba(data);
269 }
270 }
271
272 std::runtime_error("Could not find id = "
273 + to_string(id) + "in archive or population.");
274
275 return best_ind.predict_proba(data);
276}
277
278template <ProgramType T>
279template <ProgramType P>
280 requires((P == PT::BinaryClassifier) || (P == PT::MulticlassClassifier))
282{
283 Dataset d(X);
284 return predict_proba_archive(id, d);
285}
286
287template <ProgramType T>
288bool Engine<T>::update_best(const Dataset& data, bool val)
289{
290 float error_weight = Individual<T>::weightsMap[params.scorer_];
291
292 float f;
293 bool updated = false;
294 float bs = this->best_score;
295
296 vector<size_t> hof = this->pop.hall_of_fame(1);
297
298 for (int i=0; i < hof.size(); ++i)
299 {
300 const auto& ind = *pop.individuals.at(hof[i]);
301
302 // TODO: dataset arg here with null default value. if the user provides a dataset, we use it to update
303 // if there is no validation, then loss_v==loss and this should work just fine
305
307 || (f == bs && ind.fitness.complexity < this->best_complexity) )
308 {
309 bs = f;
310 this->best_ind = ind;
311 this->best_complexity = ind.fitness.complexity;
312
313 updated = true;
314 }
315 }
316
317 this->best_score = bs;
318
319 return updated;
320}
321
322
323template <ProgramType T>
325{
326 //TODO: i need to make sure i initialize everything (pybind needs to have constructors
327 // without arguments to work, and i need to handle correcting these values before running)
328 this->ss = SearchSpace(data, params.functions);
329
330 this->init();
331
332 if (params.load_population != "")
333 this->pop.load(params.load_population);
334 else
335 this->pop.init(this->ss, this->params);
336
337 // log file stream
338 std::ofstream log;
339 if (!params.logfile.empty())
340 log.open(params.logfile, std::ofstream::app);
341
342 evaluator.set_scorer(params.scorer_);
343
344 Dataset &batch = data;
345
346 int threads;
347 if (params.n_jobs == -1)
348 threads = std::thread::hardware_concurrency();
349 else if (params.n_jobs == 0)
350 threads = params.num_islands;
351 else
352 threads = params.n_jobs;
353
354 tf::Executor executor(threads);
355
356 assert( (executor.num_workers() > 0) && "Invalid number of workers");
357
358 tf::Taskflow taskflow;
359
360 // stop criteria
361 unsigned generation = 0;
362 unsigned stall_count = 0;
363 float fraction = 0;
364
365 bool use_arch;
366
367 auto stop = [&]() {
368 return ( (generation == params.max_gens)
369 && ((params.max_stall == 0 || stall_count < params.max_stall)
370 && (params.max_time == -1 || params.max_time > timer.Elapsed().count()) )
371 );
372 };
373
374 // TODO: check that I dont use pop.size() (or I use correctly, because it will return the size with the slots for the offspring)
375 // vectors to store each island separatedly
376 vector<vector<size_t>> island_parents;
377 vector<vector<size_t>> survivors;
378 island_parents.clear();
379 island_parents.resize(pop.num_islands);
380
381 survivors.clear();
382 survivors.resize(pop.num_islands);
383
384 for (int i=0; i< params.num_islands; i++){
385 size_t idx_start = std::floor(i*params.pop_size/params.num_islands);
386 size_t idx_end = std::floor((i+1)*params.pop_size/params.num_islands);
387
388 auto delta = idx_end - idx_start;
389
390 survivors.at(i).clear();
391 island_parents.at(i).clear();
392
393 survivors.at(i).resize(delta);
394 island_parents.at(i).resize(delta);
395 }
396
397 // heavily inspired in https://github.com/heal-research/operon/blob/main/source/algorithms/nsga2.cpp
398 auto [init, cond, body, back, done] = taskflow.emplace(
399 [&]() { /* done nothing to do */ }, // init (entry point for taskflow)
400
401 stop, // loop condition
402
403 [&](tf::Subflow& subflow) { // loop body (evolutionary main loop)
404 auto prepare_gen = subflow.emplace([&]() {
405 params.set_current_gen(generation);
406 batch = data.get_batch(); // will return the original dataset if it is set to dont use batch
407 }).name("prepare generation");// set generation in params, get batch
408
409 auto run_generation = subflow.for_each_index(0, this->params.num_islands, 1, [&](int island) {
410 evaluator.update_fitness(this->pop, island, data, params, true); // fit the weights with all training data
411
412 // TODO: have some way to set which fitness to use (for example in params, or it can infer based on split size idk)
413 // TODO: if using batch, fitness should be called before selection to set the batch
414 if (data.use_batch) // assign the batch error as fitness (but fit was done with training data)
415 evaluator.update_fitness(this->pop, island, batch, params, false);
416
417 vector<size_t> parents = selector.select(this->pop, island, params);
418
419 for (int i=0; i< parents.size(); i++){
420 island_parents.at(island).at(i) = parents.at(i);
421 }
422
423 this->pop.add_offspring_indexes(island);
424 variator.vary(this->pop, island, island_parents.at(island));
425 evaluator.update_fitness(this->pop, island, data, params, true);
426
427 if (data.use_batch) // assign the batch error as fitness (but fit was done with training data)
428 evaluator.update_fitness(this->pop, island, batch, params, false);
429
430 // select survivors from combined pool of parents and offspring
431 vector<size_t> island_survivors = survivor.survive(this->pop, island, params);
432
433 for (int i=0; i< island_survivors.size(); i++){
434 survivors.at(island).at(i) = island_survivors.at(i);
435 }
436 }).name("runs one generation at each island in parallel");
437
438 auto update_pop = subflow.emplace([&]() {
439 this->pop.update(survivors);
440 this->pop.migrate();
441 }).name("update, migrate and disentangle indexes between islands");
442
443 auto finish_gen = subflow.emplace([&]() {
444 bool updated_best = this->update_best(data);
445
446 if ( (params.verbosity>1 || !params.logfile.empty() )
447 || params.use_arch ) {
448 calculate_stats();
449 }
450
451 if (params.use_arch)
452 archive.update(pop, params);
453
454 fraction = params.max_time == -1 ? ((generation+1)*1.0)/params.max_gens :
455 timer.Elapsed().count()/params.max_time;
456
457 if(params.verbosity>1)
458 print_stats(log, fraction);
459 else if(params.verbosity == 1)
460 print_progress(fraction);
461
462 if (!params.logfile.empty())
463 log_stats(log);
464
465 if (generation == 0 || updated_best )
466 stall_count = 0;
467 else
468 ++stall_count;
469
470 ++generation;
471
472 }).name("update best, log, archive, stall");
473
474 // set-up subflow graph
476 run_generation.precede(update_pop);
477 update_pop.precede(finish_gen);
478 },
479
480 [&]() { return 0; }, // jump back to the next iteration
481
482 [&]() {
483 if (params.save_population != "")
484 this->pop.save(params.save_population);
485
486 this->set_is_fitted(true);
487
488 // TODO: open, write, close? (to avoid breaking the file and allow some debugging if things dont work well)
489 if (log.is_open())
490 log.close();
491
492 // if we're not using an archive, let's store the final population in the
493 // archive
494 if (!params.use_arch)
495 {
496 archive.individuals.resize(0);
497 for (int island =0; island< pop.num_islands; ++island) {
498 vector<size_t> indices = pop.get_island_indexes(island);
499
500 for (unsigned i = 0; i<indices.size(); ++i)
501 {
502 archive.individuals.push_back( *pop.individuals.at(indices.at(i)) );
503 }
504 }
505 }
506
507 } // work done, report last gen and stop
508 ); // evolutionary loop
509
510 init.name("init");
511 cond.name("termination");
512 body.name("main loop");
513 back.name("back");
514 done.name("done");
515 taskflow.name("island_gp");
516
517 init.precede(cond);
518 cond.precede(body, done);
519 body.precede(back);
520 back.precede(cond);
521
522 executor.run(taskflow);
523 executor.wait_for_all();
524
525 //When you have tasks that are created at runtime (e.g., subflow,
526 // cudaFlow), you need to execute the graph first to spawn these tasks and dump the entire graph.
527}
528}
void bind_engine(py::module &m, string name)
holds variable type data.
Definition data.h:51
Dataset get_batch() const
select random subset of data for training weights.
Definition data.cpp:146
The Engine class represents the core engine of the brush library.
Definition engine.h:43
void calculate_stats()
Definition engine.cpp:62
auto predict_proba(const Dataset &d)
Definition engine.h:96
void print_progress(float percentage)
Definition engine.cpp:46
void run(Dataset &d)
train the model
Definition engine.cpp:324
bool update_best(const Dataset &data, bool val=false)
updates best score by searching in the population for the individual that best fits the given data
Definition engine.cpp:288
void init()
initialize Feat object for fitting.
Definition engine.cpp:18
auto predict_archive(int id, const Dataset &data)
predict on unseen data from the archive
Definition engine.cpp:208
void log_stats(std::ofstream &log)
Definition engine.cpp:129
vector< json > get_archive(bool front)
return population as string
Definition engine.cpp:191
void print_stats(std::ofstream &log, float fraction)
Definition engine.cpp:159
auto predict(const Dataset &data)
Definition individual.h:66
auto predict_proba(const Dataset &d)
Definition individual.h:75
Fitness fitness
aggregate fitness score
Definition individual.h:33
void set_seed(unsigned int seed)
Definition rnd.cpp:46
static float MAX_FLT
Definition init.h:61
Scalar median(const T &v)
calculate median
Definition utils.h:202
string to_string(const T &value)
template function to convert objects to string for logging
Definition utils.h:369
static Rnd & r
Definition rnd.h:174
string PBSTR
Definition utils.cpp:13
int PBWIDTH
Definition utils.cpp:14
< nsga2 selection operator for getting the front
Definition data.cpp:12
void to_json(json &j, const Fitness &f)
Definition fitness.cpp:6
Eigen::Array< int, Eigen::Dynamic, 1 > ArrayXi
Definition types.h:40
Namespace containing scoring functions for evaluation metrics.
float get_loss() const
Definition fitness.h:51
float loss_v
aggregate validation loss score
Definition fitness.h:29
Holds a search space, consisting of operations and terminals and functions, and methods to sample tha...