Feat C++ API
A feature engineering automation tool
feat.cc
Go to the documentation of this file.
1 /* FEAT
2 copyright 2017 William La Cava
3 license: GNU/GPL v3
4 */
5 
6 #include "feat.h"
7 
8 //shogun initialization
// Shared-library load hook: the GCC/Clang `constructor` attribute runs this
// before main(), initializing the shogun ML backend that Feat depends on.
void __attribute__ ((constructor)) ctor()
{
    init_shogun_with_defaults();
}
13 
// Shared-library unload hook: tears down the shogun backend when the
// library is unloaded, pairing with the constructor hook ctor().
void __attribute__ ((destructor)) dtor()
{
    exit_shogun();
}
20 
21 using namespace FT;
22 
/// Reset per-run state: normalizer, fitted flag, timer, SIGINT handler,
/// and run statistics.
void Feat::init()
{
    // NOTE(review): the bodies of the two conditionals below were lost in
    // doc extraction (presumably thread-count and GPU setup calls) —
    // confirm against the repository source before relying on this text.
    if (params.n_jobs!=0)

    if (GPU)
    // set Feat's Normalizer to only normalize floats by default
    this->N = Normalizer(false);
    set_is_fitted(false);

    // start the clock
    timer.Reset();
    // signal handler so Ctrl-C can interrupt a run cleanly
    signal(SIGINT, my_handler);
    // reset statistics
    this->stats = Log_Stats();
}
45 
46 void Feat::fit(MatrixXf& X, VectorXf& y, LongData& Z)
47 {
48 
68  this->init();
69  std::ofstream log;
70  if (!logfile.empty())
71  log.open(logfile, std::ofstream::app);
72  params.init(X, y);
73 
74  string FEAT;
75  if (params.verbosity == 1)
76  {
77  FEAT = (
78  "/// Feature Engineering Automation Tool "
79  "* \xc2\xa9 La Cava et al 2017 "
80  "* GPL3 \\\\\\\n"
81  );
82  }
83  else if (params.verbosity == 2)
84  {
85  FEAT = (
86  "/////////////////////////////////////////////////////////////////////\n"
87  "// * Feature Engineering Automation Tool * //\n"
88  "// La Cava et al. 2017 //\n"
89  "// License: GPL v3 //\n"
90  "// https://cavalab.org/feat //\n"
91  "/////////////////////////////////////////////////////////////////////\n"
92  );
93  }
94 
95  if (params.use_batch)
96  {
97  if (params.bp.batch_size >= X.cols())
98  {
99  logger.log("turning off batch because X has fewer than "
100  + to_string(params.bp.batch_size) + " samples", 1);
101  params.use_batch = false;
102  }
103  else
104  {
105  logger.log("using batch with batch_size= "
106  + to_string(params.bp.batch_size), 2);
107  }
108  }
109 
110  // if(str_dim.compare("") != 0)
111  // {
112  // string dimension;
113  // dimension = str_dim.substr(0, str_dim.length() - 1);
114  // logger.log("STR DIM IS "+ dimension, 2);
115  // logger.log("Cols are " + std::to_string(X.rows()), 2);
116  // logger.log("Setting dimensionality as " +
117  // std::to_string((int)(ceil(stod(dimension)*X.rows()))), 2);
118  // set_max_dim(ceil(stod(dimension)*X.rows()));
119  // }
120 
121 
122  logger.log(FEAT,1);
123 
125 
126  // normalize data
127  if (params.normalize)
128  {
130  }
131  this->pop = Population(params.pop_size);
133 
134  /* create an archive to save Pareto front,
135  * unless NSGA-2 is being used for survival
136  */
137  /* if (!survival.compare("nsga2")) */
138  /* use_arch = false; */
139  /* else */
140  /* use_arch = true; */
141  use_arch = false;
142 
143  logger.log("scorer: " + params.scorer_, 1);
144 
145  // split data into training and test sets
146  //Data data(X, y, Z, params.classification);
148  //DataRef d;
149  //d.setOriginalData(&data);
151  // define terminals based on size of X
152  params.set_terminals(d.o->X.rows(), d.o->Z);
153 
154  // initial model on raw input
155  logger.log("Setting up data", 2);
156  float t0 = timer.Elapsed().count();
157 
158  //data for batch training
159  MatrixXf Xb;
160  VectorXf yb;
161  LongData Zb;
163 
164  Data *tmp_train;
165 
166  if(params.use_batch)
167  {
168  tmp_train = d.t;
169  d.t->get_batch(db, params.bp.batch_size);
170  d.setTrainingData(&db);
171  }
172 
173  if (params.classification)
175 
176 
177  // initialize population
179  logger.log("Initializing population", 2);
180 
181  bool random = selector.get_type() == "random";
182 
183  // initial model
185  logger.log("Fitting initial model", 2);
186  t0 = timer.Elapsed().count();
187  initial_model(d);
188  logger.log("Initial fitting took "
189  + std::to_string(timer.Elapsed().count() - t0) + " seconds",2);
190 
191  // initialize population with initial model and/or starting pop
192  pop.init(best_ind,params,random, this->starting_pop);
193  logger.log("Initial population:\n"+pop.print_eqns(),3);
194 
195  // evaluate initial population
196  logger.log("Evaluating initial population",2);
199 
200  logger.log("Initial population done",2);
201  logger.log(std::to_string(timer.Elapsed().count()) + " seconds",2);
202 
203  vector<size_t> survivors;
204 
205  if(params.use_batch) // reset d to all training data
206  d.setTrainingData(tmp_train, true);
207 
208  // =====================
209  // main generational loop
210  unsigned g = 0;
211  unsigned stall_count = 0;
212  float fraction = 0;
213  // continue until max gens is reached or max_time is up (if it is set)
214 
215  while(
216  // time limit
217  (params.max_time == -1 || params.max_time > timer.Elapsed().count())
218  // generation limit
219  && g<params.gens
220  // stall limit
221  && (params.max_stall == 0 || stall_count < params.max_stall)
222  )
223  {
224  fraction = params.max_time == -1 ? ((g+1)*1.0)/params.gens :
225  timer.Elapsed().count()/params.max_time;
226  if(params.use_batch)
227  {
228  d.t->get_batch(db, params.bp.batch_size);
229  DataRef dbr; // reference to minibatch data
230  dbr.setTrainingData(&db);
231  dbr.setValidationData(d.v);
232 
234  params.set_sample_weights(dbr.t->y);
235 
236  run_generation(g, survivors, dbr, log, fraction, stall_count);
237  }
238  else
239  {
240  run_generation(g, survivors, d, log, fraction, stall_count);
241  }
242 
243  g++;
244  }
245  // =====================
246  if ( params.max_stall != 0 && stall_count >= params.max_stall)
247  logger.log("learning stalled",2);
248  else if ( g >= params.gens)
249  logger.log("generation limit reached",2);
250  else
251  logger.log("max time reached",2);
252 
253  logger.log("train score: " + std::to_string(this->min_loss), 2);
254  logger.log("validation score: " + std::to_string(min_loss_v), 2);
255  logger.log("fitting final model to all training data...",2);
256 
257 
258  // simplify the final model
259  if (simplify > 0.0)
260  {
261  this->best_ind.fit(*d.o, params);
262  simplify_model(d, this->best_ind);
263  }
264 
265  // fit final model to best features
266  final_model(d);
267 
268  // if we're not using an archive, let's store the final population in the
269  // archive
270  if (!use_arch)
271  {
273  }
274 
275  if (save_pop > 0)
276  {
277  pop.save(this->logfile+".pop.gen" + to_string(params.current_gen)
278  + ".json");
279  this->best_ind.save(this->logfile+".best.json");
280  }
281 
282  if (log.is_open())
283  log.close();
284 
285  set_is_fitted(true);
286  logger.log("Run Completed. Total time taken is "
287  + std::to_string(timer.Elapsed().count()) + " seconds", 1);
288  logger.log("best model: " + this->get_eqn(),1);
289  logger.log("tabular model:\n" + this->get_model(),2);
290  logger.log("/// ----------------------------------------------------------------- \\\\\\",
291  1);
292 
293 }
/// Set the population size used by the evolutionary search.
void Feat::set_pop_size(int pop_size){ params.pop_size = pop_size; }
296 
/// Set the maximum number of generations.
void Feat::set_gens(int gens){ params.gens = gens;}
299 
/// Set the ML pairing used to fit feature representations (e.g. "LR").
void Feat::set_ml(string ml){ params.ml = ml; }
302 
/// Set whether this is a classification (true) or regression (false) task.
void Feat::set_classification(bool classification)
{
    params.classification = classification;
}
308 
/// Set console output verbosity (delegates validation to Parameters).
void Feat::set_verbosity(int verbosity){ params.set_verbosity(verbosity); }
311 
/// Set the max generations without improvement before stopping (0 = off).
void Feat::set_max_stall(int max_stall){ params.max_stall = max_stall; }
314 
/// Set the parent-selection operator (constructed in non-survival mode).
void Feat::set_selection(string sel){ this->selector = Selection(sel, false); }
317 
/// Set the survival operator; stores the name and rebuilds the survivor
/// Selection in survival mode so the two stay consistent.
void Feat::set_survival(string surv)
{
    survival=surv;
    survivor = Selection(surv, true);
}
324 
/// Set the crossover rate, kept in sync between the parameter store and
/// the variation operator.
void Feat::set_cross_rate(float cross_rate)
{
    params.cross_rate = cross_rate;
    variator.set_cross_rate(cross_rate);
}
331 
/// Set the fraction of crossovers performed at program roots.
void Feat::set_root_xo_rate(float cross_rate)
{
    params.root_xo_rate = cross_rate;
}
337 
/// Set the program output type ('b' boolean, 'f' float, etc.).
void Feat::set_otype(char ot){ params.set_otype(ot); }
340 
341 
/// Set the maximum depth of evolved programs.
void Feat::set_max_depth(unsigned int max_depth)
{
    params.set_max_depth(max_depth);
}
347 
/// Set the maximum dimensionality (number of features) of programs.
void Feat::set_max_dim(unsigned int max_dim){ params.set_max_dim(max_dim); }
350 
352 // void Feat::set_max_dim(string str){ str_dim = str; }
353 
// NOTE(review): the signature line was lost in doc extraction; this body
// matches Feat::set_random_state(int rs) — it stores the seed and seeds
// the global RNG. Confirm against the repository source.
{
    params.random_state=rs;
    r.set_seed(rs);
}
360 
/// Enable/disable ephemeral random constants as terminals.
void Feat::set_erc(bool erc){ params.erc = erc; }
363 
366 
/// Set the train/validation split fraction.
void Feat::set_split(float sp){params.split = sp;}
369 
/// Set per-column data types of the input features.
void Feat::set_dtypes(vector<char> dtypes){params.dtypes = dtypes;}
372 
/// Set the feedback strength for probability adaptation.
void Feat::set_fb(float fb){ params.feedback = fb;}
375 
/// Set the path used for run logging (empty string disables file logging).
void Feat::set_logfile(string s){logfile = s;}
378 
/// Set the fitness scoring function by name.
void Feat::set_scorer(string s){params.set_scorer(s);}
/// Return the configured scorer name.
string Feat::get_scorer(){return params.scorer;}
383 
386 
/// Set the simplification tolerance (>0 enables final-model pruning).
void Feat::set_simplify(float s){this->simplify=s;}
388 
390 
392 
/// Set iteration counts for both backprop and hill-climbing.
void Feat::set_iters(int iters){params.bp.iters = iters; params.hc.iters=iters;}
394 
/// Set the backprop learning rate.
void Feat::set_lr(float lr){params.bp.learning_rate = lr;}
396 
// NOTE(review): the signature line was lost in doc extraction; this body
// matches Feat::set_batch_size(int bs) — a nonpositive bs disables
// minibatch training. Confirm against the repository source.
{
    params.bp.batch_size = bs;
    params.use_batch = bs>0;
}
402 
/// Set the number of OpenMP threads (only affects omp, not params).
void Feat::set_n_jobs(unsigned t){ omp_set_num_threads(t); }
405 
/// Set the wall-clock time budget in seconds (-1 = unlimited).
void Feat::set_max_time(int time){ params.max_time = time; }
407 
409 
411 {
413 }
414 /*
415  * getting functions
416  */
417 
420 
/// Return the configured number of generations.
int Feat::get_gens(){ return params.gens; }
423 
/// Return the configured ML pairing name.
string Feat::get_ml(){ return params.ml; }
426 
429 
432 
/// Return the allowed program output types.
vector<char> Feat::get_otypes(){ return params.otypes; }
435 
438 
441 
444 
447 
450 
/// Return whether ephemeral random constants are enabled.
bool Feat::get_erc(){ return params.erc; }
453 
/// Return the configured log file path.
string Feat::get_logfile(){ return logfile; }
456 
459 
462 
/// Return the train/validation split fraction.
float Feat::get_split(){ return params.split; }
465 
467 /* void add_function(unique_ptr<Node> N){ params.functions.push_back(N->clone()); } */
468 
/// Return the per-column feature data types.
vector<char> Feat::get_dtypes(){ return params.dtypes; }
471 
/// Return the feedback strength.
float Feat::get_fb(){ return params.feedback; }
474 
477 
/// Return the best individual's equation string (optionally weight-sorted).
string Feat::get_eqn(bool sort){ return this->get_ind_eqn(sort, this->best_ind); };
479 
480 string Feat::get_ind_eqn(bool sort, Individual& ind)
481 {
482  vector<string> features = ind.get_features();
483  vector<float> weights = ind.ml->get_weights();
484  float offset = ind.ml->get_bias();
485 
486  /* if (params.normalize) */
487  /* { */
488  /* offset = this->N.adjust_offset(weights, offset); */
489  /* this->N.adjust_weights(weights); */
490  /* } */
491 
492  vector<size_t> order(weights.size());
493  if (sort)
494  {
495  vector<float> aweights(weights.size());
496  for (int i =0; i<aweights.size(); ++i)
497  aweights[i] = fabs(weights[i]);
498  order = argsort(aweights, false);
499  }
500  else
501  iota(order.begin(), order.end(), 0);
502 
503  string output;
504  output += to_string(offset);
505  if (weights.size() > 0)
506  {
507  if (weights.at(order.at(0)) > 0)
508  output += "+";
509  }
510  int i = 0;
511  for (const auto& o : order)
512  {
513  output += to_string(weights.at(o), 2);
514  output += "*";
515  output += features.at(o);
516  if (i < order.size()-1)
517  {
518  if (weights.at(order.at(i+1)) > 0)
519  output+= "+";
520  }
521  ++i;
522  }
523 
524  return output;
525 }
526 
527 string Feat::get_model(bool sort)
528 {
529  vector<string> features = best_ind.get_features();
530  vector<float> weights = best_ind.ml->get_weights();
531  float offset = best_ind.ml->get_bias();
532  /* if (params.normalize) */
533  /* { */
534  /* offset = this->N.adjust_offset(weights, offset); */
535  /* this->N.adjust_weights(weights); */
536  /* } */
537 
538  vector<size_t> order(weights.size());
539  if (sort)
540  {
541  vector<float> aweights(weights.size());
542  for (int i =0; i<aweights.size(); ++i)
543  aweights[i] = fabs(weights[i]);
544  order = argsort(aweights, false);
545  }
546  else
547  iota(order.begin(), order.end(), 0);
548 
549  string output;
550  output += "Weight\tFeature\n";
551  output += to_string(offset) + "\toffset" + "\n";
552  for (const auto& o : order)
553  {
554  output += to_string(weights.at(o), 2);
555  output += "\t";
556  output += features.at(o);
557  output += "\n";
558  }
559 
560  return output;
561 }
562 
565 
/// Return the dimensionality (feature count) of the best individual.
int Feat::get_dim(){ return best_ind.get_dim(); }
568 
571 
572 
/// Return the program length (node count) of the best individual.
int Feat::get_n_nodes(){ return best_ind.program.size(); }
575 
577 vector<json> Feat::get_archive(bool front)
578 {
579  /* TODO: maybe this should just return the to_json call of
580  * the underlying population / archive. I guess the problem
581  * is that we don't have to_json defined for vector<Individual>.
582  */
583  vector<Individual>* printed_pop = NULL;
584 
585  string r = "";
586 
587  vector<size_t> idx;
588  bool subset = false;
589  if (front) // only return individuals on the Pareto front
590  {
591  if (use_arch)
592  {
593  printed_pop = &archive.individuals;
594  }
595  else
596  {
597  unsigned n = 1;
598  subset = true;
599  idx = this->pop.sorted_front(n);
600  printed_pop = &this->pop.individuals;
601  }
602  }
603  else
604  printed_pop = &this->pop.individuals;
605 
606  if (!subset)
607  {
608  idx.resize(printed_pop->size());
609  std::iota(idx.begin(), idx.end(), 0);
610  }
611 
612  bool includes_best_ind = false;
613 
614  vector<json> json_archive;
615 
616  for (int i = 0; i < idx.size(); ++i)
617  {
618  Individual& ind = printed_pop->at(idx[i]);
619 
620  json j;
621  to_json(j, ind);
622 
623  // r += j.dump();
624  json_archive.push_back(j);
625 
626  if (i < idx.size() -1)
627  r += "\n";
628  // check if best_ind is in here
629  if (ind.id == best_ind.id)
630  includes_best_ind = true;
631  }
632 
633  // add best_ind, if it is not included
634  if (!includes_best_ind)
635  {
636  json j;
637  to_json(j, best_ind);
638  json_archive.push_back(j);
639  }
640 
641  // delete pop pointer
642  printed_pop = NULL;
643  delete printed_pop;
644 
645  return json_archive;
646 }
647 
// NOTE(review): the signature line was lost in doc extraction; this body
// matches Feat::get_coefs() returning ArrayXf — it copies the best
// model's ML weights into an Eigen array. Confirm against the repository.
{
    auto tmpw = best_ind.ml->get_weights();
    ArrayXf w = ArrayXf::Map(tmpw.data(), tmpw.size());
    return w;
}
655 
657 std::map<string, std::pair<vector<ArrayXf>, vector<ArrayXf>>> Feat::get_Z(string s,
658  int * idx, int idx_size)
659 {
660  LongData Z;
661  vector<int> ids(idx,idx+idx_size);
662  load_partial_longitudinal(s,Z,',',ids);
663 
664  return Z;
665 }
666 
667 
668 void Feat::fit(MatrixXf& X, VectorXf& y)
669 {
670  auto Z = LongData();
671  fit(X,y,Z);
672 }
673 
674 
/// One generation of evolution: selection -> variation -> evaluation ->
/// survival -> bookkeeping (best model, stats, archive, logging).
void Feat::run_generation(unsigned int g,
        vector<size_t> survivors,
        DataRef &d,
        std::ofstream &log,
        float fraction,
        unsigned& stall_count)
{
    d.t->set_protected_groups();

    // select parents
    logger.log("selection..", 2);
    vector<size_t> parents = selector.select(pop, params, *d.t);
    logger.log("parents:\n"+pop.print_eqns(), 3);

    // variation to produce offspring
    logger.log("variation...", 2);
    variator.vary(pop, parents, params,*d.t);
    logger.log("offspring:\n" + pop.print_eqns(true), 3);

    // evaluate offspring
    logger.log("evaluating offspring...", 2);
    evaluator.fitness(pop.individuals, *d.t, params, true);

    // select survivors from combined pool of parents and offspring
    logger.log("survival...", 2);
    survivors = survivor.survive(pop, params, *d.t);

    // reduce population to survivors
    logger.log("shrinking pop to survivors...",2);
    pop.update(survivors);
    logger.log("survivors:\n" + pop.print_eqns(), 3);

    logger.log("update best...",2);
    bool updated_best = update_best(d);

    logger.log("calculate stats...",2);
    calculate_stats(d);

    if (params.max_stall > 0)
        update_stall_count(stall_count, updated_best);

    if ( (use_arch || params.verbosity>1) || !logfile.empty()) {
        // set objectives to make sure they are reported in log/verbose/arch
        #pragma omp parallel for
        for (unsigned int i=0; i<pop.size(); ++i)
            pop.individuals.at(i).set_obj(params.objectives);
    }

    logger.log("update archive...",2);
    // NOTE(review): the body of this conditional (presumably the archive
    // update call) was lost in doc extraction — confirm against source.
    if (use_arch)

    if(params.verbosity>1)
        print_stats(log, fraction);
    else if(params.verbosity == 1)
        printProgress(fraction);

    if (!logfile.empty())
        log_stats(log);

    if (save_pop > 1)
        pop.save(this->logfile+".pop.gen" +
                to_string(params.current_gen) + ".json");

    // tighten learning rate for grad descent as evolution progresses
    if (params.backprop)
    {
        // NOTE(review): the left-hand side of this learning-rate decay
        // assignment and the tail of the following log call were lost in
        // doc extraction — confirm against the repository source.
        (1-1/(1+float(params.gens)))*params.bp.learning_rate;
        logger.log("learning rate: "
    }
    logger.log("finished with generation...",2);

}
753 
754 void Feat::update_stall_count(unsigned& stall_count, bool best_updated)
755 {
756  if (params.current_gen == 0 || best_updated )
757  {
758  /* best_med_score = this->med_loss_v; */
759  stall_count = 0;
760  }
761  else
762  {
763  ++stall_count;
764  }
765 
766  logger.log("stall count: " + std::to_string(stall_count), 2);
767 }
768 
769 
// NOTE(review): the signature line was lost in doc extraction; this body
// matches Feat::final_model(DataRef& d) — confirm against the repository.
{
    // fits final model to the best transformation found, optionally with
    // hyperparameter tuning, then logs its score on the original data.
    shared_ptr<CLabels> yhat;
    if (params.tune_final)
        yhat = best_ind.fit_tune(*d.o, params);
    else
        yhat = best_ind.fit(*d.o, params);

    VectorXf tmp;
    /* params.set_sample_weights(y); // need to set new sample weights for y, */
    // which is probably from a validation set
    float score = evaluator.S.score(d.o->y,yhat,tmp,params.class_weights);
    logger.log("final_model score: " + std::to_string(score),2);
}
785 
787 {
788  /* Simplifies the final model using some expert rules and stochastic hill
789  * climbing.
790  * Expert rules:
791  * - NOT(NOT(x)) simplifies to x
792  * Stochastic hill climbing:
793  * for some number iterations, apply delete mutation to the equation.
794  * if the output of the model doesn't change, keep the mutations.
795  */
796 
798  // check for specific patterns
800  //
801  Individual tmp_ind = ind;
802  int starting_size = ind.size();
803  vector<size_t> roots = tmp_ind.program.roots();
804  vector<size_t> idx_to_remove;
805 
806  logger.log("\n=========\ndoing pattern pruning...",2);
807  logger.log("simplify: " + to_string(this->simplify), 2);
808 
809  for (auto r : roots)
810  {
811  size_t start = tmp_ind.program.subtree(r);
812  int first_occurence = -2;
813 
814  /* cout << "start: " << start << "\n"; */
815  for (int i = start ; i <= r; ++i)
816  {
817  /* cout << "i: " << i << ", first_occurence: " << first_occurence */
818  /* << "\n"; */
819  if (tmp_ind.program.at(i)->name.compare("not")==0)
820  {
821  if (first_occurence == i-1) // indicates two NOTs in a row
822  {
823  /* cout << "pushing back " << first_occurence */
824  /* << " and " << i << " to idx_to_remove\n"; */
825  idx_to_remove.push_back(first_occurence);
826  idx_to_remove.push_back(i);
827  // reset first_occurence so we don't pick up triple nots
828  first_occurence = -2;
829  }
830  else
831  {
832  first_occurence = i;
833  }
834  }
835  }
836  }
837  // remove indices in reverse order so they don't change
838  std::reverse(idx_to_remove.begin(), idx_to_remove.end());
839  for (auto idx: idx_to_remove)
840  {
841  /* cout << "removing " << tmp_ind.program.at(idx)->name */
842  /* << " at " << idx << "\n"; */
843  tmp_ind.program.erase(tmp_ind.program.begin()+idx);
844  }
845  int end_size = tmp_ind.size();
846  logger.log("pattern pruning reduced best model size by "
847  + to_string(starting_size - end_size)
848  + " nodes\n=========\n", 2);
849  if (tmp_ind.size() < ind.size())
850  {
851  ind = tmp_ind;
852  logger.log("new model:" + this->get_ind_eqn(false, ind),2);
853  }
854 
856  // prune dimensions
858  /* set_verbosity(3); */
859  int iterations = ind.get_dim();
860  logger.log("\n=========\ndoing correlation deletion mutations...",2);
861  starting_size = ind.size();
862  VectorXf original_yhat;
864  original_yhat = ind.predict_proba(*d.o).row(0);
865  else
866  original_yhat = ind.yhat;
867 
868  for (int i = 0; i < iterations; ++i)
869  {
870  Individual tmp_ind = ind;
871  bool perfect_correlation = variator.correlation_delete_mutate(
872  tmp_ind, ind.Phi, params, *d.o);
873 
874  if (ind.size() == tmp_ind.size())
875  {
876  continue;
877  }
878 
879  tmp_ind.fit(*d.o, params);
880 
881  VectorXf new_yhat;
883  new_yhat = tmp_ind.predict_proba(*d.o).row(0);
884  else
885  new_yhat = tmp_ind.yhat;
886 
887 
888  if (((original_yhat - new_yhat).norm()/original_yhat.norm()
889  <= this->simplify )
890  or perfect_correlation)
891  {
892  logger.log("\ndelete dimension mutation success: went from "
893  + to_string(ind.size()) + " to "
894  + to_string(tmp_ind.size()) + " nodes. Output changed by "
895  + to_string(100*(original_yhat
896  -new_yhat).norm()/(original_yhat.norm()))
897  + " %", 2);
898  if (perfect_correlation)
899  logger.log("perfect correlation",2);
900  ind = tmp_ind;
901  }
902  else
903  {
904  logger.log("\ndelete dimension mutation failure. Output changed by "
905  + to_string(100*(original_yhat
906  -new_yhat).norm()/(original_yhat.norm()))
907  + " %", 2);
908  // if this mutation fails, it will continue to fail since it
909  // is deterministic. so, break in this case.
910  break;
911  }
912 
913  }
914  end_size = ind.size();
915  logger.log("correlation pruning reduced best model size by "
916  + to_string(starting_size - end_size)
917  + " nodes\n=========\n", 2);
918  if (end_size < starting_size)
919  logger.log("new model:" + this->get_ind_eqn(false, ind),2);
920 
922  // prune subtrees
924  iterations = 1000;
925  logger.log("\n=========\ndoing subtree deletion mutations...", 2);
926  starting_size = ind.size();
927  for (int i = 0; i < iterations; ++i)
928  {
929  Individual tmp_ind = ind;
930  this->variator.delete_mutate(tmp_ind, params);
931  if (ind.size() == tmp_ind.size())
932  continue;
933 
934  tmp_ind.fit(*d.o, params);
935 
936  VectorXf new_yhat;
938  new_yhat = tmp_ind.predict_proba(*d.o).row(0);
939  else
940  new_yhat = tmp_ind.yhat;
941 
942  if ((original_yhat - new_yhat).norm()/original_yhat.norm()
943  <= this->simplify )
944  {
945  logger.log("\ndelete mutation success: went from "
946  + to_string(ind.size()) + " to "
947  + to_string(tmp_ind.size()) + " nodes. Output changed by "
948  + to_string(100*(original_yhat
949  -new_yhat).norm()/(original_yhat.norm()))
950  + " %", 2);
951  ind = tmp_ind;
952  }
953  else
954  {
955  logger.log("\ndelete mutation failure. Output changed by "
956  + to_string(100*(original_yhat
957  -new_yhat).norm()/(original_yhat.norm()))
958  + " %", 2);
959  // if this mutation fails, it will continue to fail since it
960  // is deterministic. so, break in this case.
961  break;
962  }
963 
964  }
965  end_size = ind.size();
966  logger.log("subtree deletion reduced best model size by "
967  + to_string( starting_size - end_size )
968  + " nodes", 2);
969  VectorXf new_yhat;
971  new_yhat = ind.predict_proba(*d.o).row(0);
972  else
973  new_yhat = ind.yhat;
974  VectorXf difference = new_yhat - original_yhat;
975  /* cout << "final % difference: " << difference.norm()/original_yhat.norm() */
976  /* << endl; */
977 }
978 
/// Fit a univariate model (LR for classification, linear ridge regression
/// otherwise) to each input feature and each longitudinal variable
/// (summarized by its per-sample median), returning one weight per
/// candidate feature. Used to rank terminals when the data has more
/// features than params.max_dim.
vector<float> Feat::univariate_initial_model(DataRef &d, int n_feats)
{
    vector<float> univariate_weights(d.t->X.rows() + d.t->Z.size(),0.0);
    int N = d.t->X.cols();

    MatrixXf predictor(1,N);
    string ml_type = this->params.classification?
        "LR" : "LinearRidgeRegression";

    // NOTE(review): the declaration of `ml` (presumably an ML wrapper
    // constructed from ml_type) was lost in doc extraction — confirm
    // against the repository source.

    bool pass = true;

    logger.log("univariate_initial_model",2);
    logger.log("N: " + to_string(N),2);
    logger.log("n_feats: " + to_string(n_feats),2);

    for (unsigned i =0; i<d.t->X.rows(); ++i)
    {
        predictor.row(0) = d.t->X.row(i);
        /* float b = (covariance(predictor,d.t->y) / */
        /*         variance(predictor)); */
        pass = true;
        // weight for feature i is the single coefficient of the fitted
        // univariate model; zero when the fit fails
        shared_ptr<CLabels> yhat = ml.fit(predictor, d.t->y, this->params,
                pass);
        if (pass)
            univariate_weights.at(i) = ml.get_weights().at(0);
        else
            univariate_weights.at(i) = 0;
    }
    int j = d.t->X.rows();
    for (const auto& val: d.t->Z)
    {
        // summarize the longitudinal variable as per-sample medians
        for (int k = 0; k<N; ++k)
            predictor(k) = median(val.second.second.at(k));

        /* float b = (covariance(predictor,d.t->y) / */
        /*         variance(predictor)); */
        /* univariate_weights.at(j) = fabs(b); */

        pass = true;
        shared_ptr<CLabels> yhat = ml.fit(predictor, d.t->y, this->params,
                pass);
        if (pass)
            univariate_weights.at(j) = ml.get_weights().at(0);
        else
            univariate_weights.at(j) = 0;

        ++j;
    }
    return univariate_weights;

}
1044 {
1049  best_ind = Individual();
1050  best_ind.set_id(0);
1051  int j;
1052  int n_x = d.t->X.rows();
1053  int n_z = d.t->Z.size();
1054  int n_feats = std::min(params.max_dim, unsigned(n_x+ n_z));
1055  /* int n_long_feats = std::min(params.max_dim - n_feats, */
1056  /* unsigned(d.t->Z.size())); */
1057  bool univariate_initialization = false;
1058 
1059  if (n_feats < (n_x + n_z))
1060  {
1061  // if the data has more features than params.max_dim, fit a univariate
1062  // linear model to each feature in order to set initial weights
1063  univariate_initialization = true;
1064  vector<float> univariate_weights = univariate_initial_model(d,
1065  n_feats);
1066 
1067  vector<size_t> feature_order = argsort(univariate_weights, false);
1068  feature_order.erase(feature_order.begin()+n_feats,
1069  feature_order.end());
1070 
1071  for (const auto& f : feature_order)
1072  {
1073  if (f < n_x)
1074  best_ind.program.push_back(params.terminals.at(f)->clone());
1075  else
1076  {
1077  best_ind.program.push_back(params.terminals.at(f)->clone());
1078  best_ind.program.push_back(
1079  std::unique_ptr<Node>(new NodeMedian()));
1080  }
1081 
1082  }
1083  params.set_term_weights(univariate_weights);
1084  }
1085  else
1086  {
1087  for (unsigned i =0; i<n_x; ++i)
1088  {
1089  best_ind.program.push_back(params.terminals.at(i)->clone());
1090  }
1091  // if there is longitudinal data, initialize the model with median
1092  // values applied to those variables.
1093  for (unsigned i =0; i<n_z; ++i)
1094  {
1095  best_ind.program.push_back(params.terminals.at(n_x + i)->clone());
1096  best_ind.program.push_back(
1097  std::unique_ptr<Node>(new NodeMedian()));
1098  }
1099  }
1100  // fit model
1101 
1102  shared_ptr<CLabels> yhat;
1103 
1104 
1105  if (univariate_initialization)
1106  {
1107  yhat = best_ind.fit(*d.t,params);
1108  }
1109  else
1110  {
1111  // tune default ML parameters
1112  if (params.tune_initial)
1113  yhat = best_ind.fit_tune(*d.t, params, true);
1114  else
1115  yhat = best_ind.fit(*d.t, params);
1116  // set terminal weights based on model
1117  vector<float> w = best_ind.ml->get_weights();
1118 
1120  }
1121 
1122  this->min_loss = evaluator.S.score(d.t->y, yhat, params.class_weights);
1123 
1124  if (params.split < 1.0)
1125  {
1126  shared_ptr<CLabels> yhat_v = best_ind.predict(*d.v);
1127  this->min_loss_v = evaluator.S.score(d.v->y, yhat_v,
1129  }
1130  else
1131  this->min_loss_v = min_loss;
1132 
1134 
1136 
1137  logger.log("initial model: " + this->get_eqn(), 2);
1138  logger.log("initial training score: " +std::to_string(min_loss),2);
1139  logger.log("initial validation score: " +std::to_string(this->min_loss_v),2);
1140 }
1141 
1142 MatrixXf Feat::transform(MatrixXf& X)
1143 {
1144  LongData Z;
1145  return transform(X,Z);
1146 }
1147 MatrixXf Feat::transform(MatrixXf& X, LongData& Z)
1148 {
1149  return transform(X,Z,nullptr);
1150 }
1151 MatrixXf Feat::transform(MatrixXf& X,
1152  LongData Z,
1153  Individual *ind)
1154 {
1159  if (params.normalize)
1160  N.normalize(X);
1161 
1162  VectorXf y = VectorXf();
1163 
1164  Data d(X, y, Z, get_classification());
1165 
1166  if (ind == 0) // if ind is empty, predict with best_ind
1167  {
1168  if (best_ind.program.size()==0)
1169  THROW_RUNTIME_ERROR("You need to train a model using fit() "
1170  "before making predictions.");
1171 
1172  return best_ind.out(d, true).transpose();
1173  }
1174 
1175  return ind->out(d, true).transpose();
1176 }
1177 
1178 VectorXf Feat::predict(MatrixXf& X)
1179 {
1180  auto Z = LongData();
1181  return predict(X,Z);
1182 }
1183 
1184 VectorXf Feat::predict(MatrixXf& X,
1185  LongData& Z)
1186 {
1187  /* MatrixXf Phi = transform(X, Z); */
1188  if (params.normalize)
1189  N.normalize(X);
1190  VectorXf dummy;
1191  Data d_tmp(X, dummy, Z);
1192  return best_ind.predict_vector(d_tmp);
1193 }
1194 
1195 VectorXf Feat::predict_archive(int id, MatrixXf& X)
1196 {
1197  LongData Z;
1198  return predict_archive(id, X, Z);
1199 }
1200 
1201 VectorXf Feat::predict_archive(int id, MatrixXf& X, LongData& Z)
1202 {
1203  /* cout << "Feat::predict_archive\n"; */
1204  /* return predictions; */
1205  /* cout << "Normalize" << endl; */
1206  if (params.normalize)
1207  N.normalize(X);
1208  /* cout << "params.n_classes:" << params.n_classes << endl; */
1209  /* cout << "X.cols(): " << X.cols() << endl; */
1210  VectorXf predictions(X.cols());
1211  VectorXf empty_y;
1212  /* cout << "tmp_data\n"; */
1213  Data tmp_data(X,empty_y,Z);
1214 
1215  /* cout << "individual prediction id " << id << "\n"; */
1216  if (id == best_ind.id)
1217  {
1218  return best_ind.predict_vector(tmp_data);
1219  }
1220  for (int i = 0; i < this->archive.individuals.size(); ++i)
1221  {
1222  Individual& ind = this->archive.individuals.at(i);
1223 
1224  if (id == ind.id)
1225  return ind.predict_vector(tmp_data);
1226 
1227  }
1228  for (int i = 0; i < this->pop.individuals.size(); ++i)
1229  {
1230  Individual& ind = this->pop.individuals.at(i);
1231 
1232  if (id == ind.id)
1233  return ind.predict_vector(tmp_data);
1234 
1235  }
1236 
1237  THROW_INVALID_ARGUMENT("Could not find id = "
1238  + to_string(id) + "in archive or population.");
1239  return VectorXf();
1240 }
1241 
1242 ArrayXXf Feat::predict_proba_archive(int id, MatrixXf& X)
1243 {
1244  LongData Z;
1245  return predict_proba_archive(id, X, Z);
1246 }
1247 ArrayXXf Feat::predict_proba_archive(int id, MatrixXf& X, LongData& Z)
1248 {
1249  if (params.normalize)
1250  N.normalize(X);
1251  ArrayXXf predictions(X.cols(),params.n_classes);
1252  VectorXf empty_y;
1253  Data tmp_data(X,empty_y,Z);
1254 
1255  for (int i = 0; i < this->archive.individuals.size(); ++i)
1256  {
1257  Individual& ind = this->archive.individuals.at(i);
1258 
1259  if (id == ind.id)
1260  return ind.predict_proba(tmp_data);
1261 
1262  }
1263 
1264  THROW_INVALID_ARGUMENT("Could not find id = "
1265  + to_string(id) + "in archive.");
1266  return ArrayXXf();
1267 
1268 }
1269 shared_ptr<CLabels> Feat::predict_labels(MatrixXf& X, LongData Z)
1270 {
1271  /* MatrixXf Phi = transform(X, Z); */
1272  if (params.normalize)
1273  N.normalize(X);
1274  VectorXf empty_y;
1275  Data tmp_data(X,empty_y,Z);
1276 
1277  return best_ind.predict(tmp_data);
1278 }
1279 
1280 ArrayXXf Feat::predict_proba(MatrixXf& X, LongData& Z)
1281 {
1282  if (params.normalize)
1283  N.normalize(X);
1284  VectorXf dummy;
1285  Data d_tmp(X, dummy, Z);
1286  return best_ind.predict_proba(d_tmp);
1287 }
1288 
1289 ArrayXXf Feat::predict_proba(MatrixXf& X)
1290 {
1291  LongData Z;
1292  return predict_proba(X,Z);
1293 }
1294 
1295 
1296 bool Feat::update_best(const DataRef& d, bool validation)
1297 {
1298  float bs;
1299  bs = this->min_loss_v;
1300  float f;
1301  vector<Individual>& pop_ref = (use_arch ?
1302  archive.individuals : this->pop.individuals);
1303 
1304  bool updated = false;
1305 
1306  for (const auto& ind: pop_ref)
1307  {
1308  if (!val_from_arch || ind.rank == 1)
1309  {
1310  f = ind.fitness_v;
1311 
1312  if (f < bs
1313  || (f == bs && ind.get_complexity() < this->best_complexity)
1314  )
1315  {
1316  bs = f;
1317  this->best_ind = ind; // should this be ind.clone(best_ind); ?
1318  /* ind.clone(best_ind); */
1319  this->best_complexity = ind.get_complexity();
1320  updated = true;
1321  logger.log("better model found!", 2);
1322  }
1323  }
1324  }
1325  logger.log("current best model: " + this->get_eqn(), 2);
1326  this->min_loss_v = bs;
1327 
1328  return updated;
1329 }
1330 
1331 float Feat::score(MatrixXf& X, const VectorXf& y, LongData Z)
1332 {
1333  shared_ptr<CLabels> labels = predict_labels(X, Z);
1334  VectorXf loss;
1335  return evaluator.S.score(y,labels,loss,params.class_weights);
1336 }
1337 
1339 {
1340 
1341  VectorXf losses(this->pop.size());
1342  int i=0;
1343  for (const auto& p: this->pop.individuals)
1344  {
1345  losses(i) = p.fitness;
1346  ++i;
1347  }
1348  // min loss
1349  float min_loss = losses.minCoeff();
1350 
1351  // median loss
1352  float med_loss = median(losses.array());
1353 
1354  // median program size
1355  ArrayXf Sizes(this->pop.size());
1356 
1357  i = 0;
1358 
1359  for (const auto& p : this->pop.individuals)
1360  {
1361  Sizes(i) = p.size();
1362  ++i;
1363  }
1364  unsigned med_size = median(Sizes);
1365 
1366  // complexity
1367  ArrayXf Complexities(this->pop.size());
1368  i = 0;
1369  for (auto& p : this->pop.individuals)
1370  {
1371  // Calculate to assure it gets reported in stats (even if's not used as an obj)
1372  Complexities(i) = p.get_complexity();
1373  ++i;
1374  }
1375 
1376  // number of parameters
1377  ArrayXf Nparams(this->pop.size());
1378  i = 0;
1379  for (auto& p : this->pop.individuals)
1380  {
1381  Nparams(i) = p.get_n_params();
1382  ++i;
1383  }
1384 
1385  // dimensions
1386  ArrayXf Dims(this->pop.size());
1387  i = 0;
1388  for (auto& p : this->pop.individuals)
1389  {
1390  Dims(i) = p.get_dim();
1391  ++i;
1392  }
1393 
1394  /* unsigned med_size = median(Sizes); */
1395  unsigned med_complexity = median(Complexities);
1396  unsigned med_num_params = median(Nparams);
1397  unsigned med_dim = median(Dims);
1398 
1399  // calculate the median valiation loss
1400  ArrayXf val_fitnesses(this->pop.individuals.size());
1401  for (unsigned i = 0; i < this->pop.individuals.size(); ++i)
1402  val_fitnesses(i) = this->pop.individuals.at(i).fitness_v;
1403  float med_loss_v = median(val_fitnesses);
1404  /* fitnesses.push_back(pop.individuals.at(i).fitness); */
1405  /* int idx = argmiddle(fitnesses); */
1406 
1407  /* if (params.split < 1.0) */
1408  /* { */
1409  /* Individual& med_ind = pop.individuals.at(idx); */
1410  /* VectorXf tmp; */
1411  /* shared_ptr<CLabels> yhat_v = med_ind.predict(*d.v, params); */
1412  /* this->med_loss_v = p_eval->S.score(d.v->y, yhat_v, tmp, */
1413  /* params.class_weights); */
1414  /* } */
1415 
1416  /* ///////////////////////////////////////////// */
1417 
1418  // update stats
1420  timer.Elapsed().count(),
1421  min_loss,
1422  this->min_loss_v,
1423  med_loss,
1424  med_loss_v,
1425  med_size,
1426  med_complexity,
1427  med_num_params,
1428  med_dim);
1429 }
1430 
void Feat::print_stats(std::ofstream& log, float fraction)
{
    /// Print a per-generation progress report to stdout: a progress bar,
    /// train/validation loss summaries, and up to 40 Pareto-front models.
    // NOTE(review): the log parameter is unused here; stats go to std::cout.
    unsigned num_models = std::min(50,this->pop.size());
    //float med_loss = median(F.colwise().mean().array()); // median loss
    // collect program sizes
    ArrayXf Sizes(this->pop.size());
    unsigned i = 0;
    for (const auto& p : this->pop.individuals)
    {
        Sizes(i) = p.size(); ++i;
    }
    unsigned max_size = Sizes.maxCoeff();
    // progress bar: 50 characters, filled proportionally to fraction
    string bar, space = "";
    for (unsigned int i = 0; i<50; ++i)
    {
        if (i <= 50*fraction) bar += "/";
        else space += " ";
    }
    std::cout.precision(5);
    std::cout << std::scientific;

    // Header line: generation count when gens-limited, elapsed time when
    // running against a wall-clock budget (max_time != -1).
    if(params.max_time == -1)
        std::cout << "Generation " << params.current_gen+1 << "/"
            << params.gens << " [" + bar + space + "]\n";
    else
        std::cout << std::fixed << "Time elapsed "<< timer
            << "/" << params.max_time
            << " seconds (Generation "<< params.current_gen+1
            << ") [" + bar + space + "]\n";

    // Loss/size summary, printed in fixed notation.
    std::cout << std::fixed << "Train Loss (Med): "
        << stats.min_loss.back() << " ("
        << stats.med_loss.back() << ")\n"
        << "Val Loss (Med): "
        << this->min_loss_v << " (" << stats.med_loss_v.back() << ")\n"
        << "Median Size (Max): "
        << stats.med_size.back() << " (" << max_size << ")\n"
        << "Time (s): " << timer << "\n";
    std::cout << "Representation Pareto Front--------------------------------------\n";
    std::cout << "Rank\t"; //Complexity\tLoss\tRepresentation\n";
    /* for (const auto& o : params.objectives) */
    /* std::cout << o << "\t"; */
    cout << "fitness\tfitness_v\tcomplexity\t";
    cout << "Representation\n";

    // Switch back to scientific notation for the model table.
    std::cout << std::scientific;
    // printing max 40 individuals from the pareto front
    unsigned n = 1;
    if (use_arch)
    {
        // Archive in use: the archive already holds the front.
        num_models = std::min(40, int(archive.individuals.size()));

        for (unsigned i = 0; i < num_models; ++i)
        {
            std::string lim_model;

            std::string model = this->get_ind_eqn(false, archive.individuals[i]);
            /* std::string model = archive.individuals[i].get_eqn(); */
            // Truncate the equation display to 60 characters, with ellipsis.
            for (unsigned j = 0; j< std::min(model.size(),size_t(60)); ++j)
            {
                lim_model.push_back(model.at(j));
            }
            if (lim_model.size()==60)
                lim_model += "...";

            std::cout << archive.individuals[i].rank << "\t"
                /* for (const auto& o : archive.individuals[i].obj) */
                /* std::cout << o << "\t"; */
                << archive.individuals[i].fitness << "\t"
                << archive.individuals[i].fitness_v << "\t"
                << archive.individuals[i].get_complexity() << "\t" ;
            cout << lim_model << "\n";
        }
    }
    else
    {
        // No archive: collect fronts of increasing rank n until we have
        // num_models entries (or a front comes back empty-ish).
        vector<size_t> f = this->pop.sorted_front(n);
        vector<size_t> fnew(2,0);
        while (f.size() < num_models && fnew.size()>1)
        {
            fnew = this->pop.sorted_front(++n);
            f.insert(f.end(),fnew.begin(),fnew.end());
        }

        for (unsigned j = 0; j < std::min(num_models,unsigned(f.size())); ++j)
        {
            std::string lim_model;
            std::string model = this->get_ind_eqn(false,pop.individuals[f[j]]);
            /* std::string model = this->pop.individuals[f[j]].get_eqn(); */
            // NOTE(review): the inner loop variable j shadows the outer j;
            // harmless here, but worth renaming.
            for (unsigned j = 0; j< std::min(model.size(),size_t(60)); ++j)
                lim_model.push_back(model.at(j));
            if (lim_model.size()==60)
                lim_model += "...";
            std::cout << pop.individuals[f[j]].rank << "\t"
                << pop.individuals[f[j]].fitness << "\t"
                << pop.individuals[f[j]].fitness_v << "\t"
                << pop.individuals[f[j]].get_complexity() << "\t" ;
            cout << "\t" << lim_model << "\n";
        }
    }

    std::cout <<"\n\n";
}
1535 
1536 void Feat::log_stats(std::ofstream& log)
1537 {
1538  // print stats in tabular format
1539  string sep = ",";
1540  if (params.current_gen == 0) // print header
1541  {
1542  log << "generation" << sep
1543  << "time" << sep
1544  << "min_loss" << sep
1545  << "min_loss_val" << sep
1546  << "med_loss" << sep
1547  << "med_loss_val" << sep
1548  << "med_size" << sep
1549  << "med_complexity" << sep
1550  << "med_num_params" << sep
1551  << "med_dim" << "\n";
1552  }
1553  log << params.current_gen << sep
1554  << timer.Elapsed().count() << sep
1555  << stats.min_loss.back() << sep
1556  << this->min_loss_v << sep
1557  << stats.med_loss.back() << sep
1558  << stats.med_loss_v.back() << sep
1559  << stats.med_size.back() << sep
1560  << stats.med_complexity.back() << sep
1561  << stats.med_num_params.back() << sep
1562  << stats.med_dim.back() << "\n";
1563 }
1564 
1565 //TODO: replace these with json
1567 {
1568  json j;
1569  to_json(j, this->stats);
1570  return j;
1571 }
1572 
void Feat::load_best_ind(string filename)
{
    /// Restore only the best individual from a saved file; the rest of the
    /// Feat state (e.g. the Normalizer N) is left untouched.
    //TODO: need to load/save normalizer
    this->best_ind.load(filename);
}
1578 
void Feat::load_population(string filename, bool justfront)
{
    /// Load a population of individuals from filename.
    // NOTE(review): the justfront flag is currently ignored — the full
    // population is always loaded; confirm whether Pareto-front filtering
    // was intended here.
    this->pop.load(filename);
}
1583 
1584 void Feat::load(const json& j)
1585 {
1586  // json j = json::parse(feat_state);
1587  from_json(j, *this);
1588 }
1589 
1590 json Feat::save() const
1591 {
1592  json j;
1593  to_json(j, *this);
1594  return j;
1595 }
1596 
1597 void Feat::load_from_file(string filename)
1598 {
1599  std::ifstream indata;
1600  indata.open(filename);
1601  if (!indata.good())
1602  THROW_INVALID_ARGUMENT("Invalid input file " + filename + "\n");
1603 
1604  std::string line;
1605  indata >> line;
1606 
1607  this->load(line);
1608 
1609  logger.log("Loaded Feat state from " + filename,1);
1610 
1611  indata.close();
1612 }
1613 
1614 void Feat::save_to_file(string filename)
1615 {
1616  std::ofstream out;
1617  if (!filename.empty())
1618  out.open(filename);
1619  else
1620  out.open("Feat.json");
1621 
1622  out << this->save();
1623  out.close();
1624  logger.log("Saved Feat to file " + filename, 1);
1625 }
void setTrainingData(MatrixXf &X_t, VectorXf &y_t, LongData &Z_t, bool c=false, vector< bool > protect=vector< bool >())
Definition: data.cc:195
Data * t
Definition: data.h:93
Data * v
Definition: data.h:92
Data * o
Definition: data.h:91
void train_test_split(bool shuffle, float split)
splits data into training and validation folds.
Definition: data.cc:362
void setValidationData(MatrixXf &X_v, VectorXf &y_v, LongData &Z_v, bool c=false, vector< bool > protect=vector< bool >())
Definition: data.cc:214
data holding X, y, and Z data
Definition: data.h:42
VectorXf & y
Definition: data.h:46
void get_batch(Data &db, int batch_size) const
select random subset of data for training weights.
Definition: data.cc:79
LongData & Z
Definition: data.h:47
MatrixXf & X
Definition: data.h:45
void set_protected_groups()
Definition: data.cc:29
evaluation mixin class for Feat
Definition: evaluation.h:34
void validation(vector< Individual > &individuals, const Data &d, const Parameters &params, bool offspring=false)
validation of population.
Definition: evaluation.cc:22
void fitness(vector< Individual > &individuals, const Data &d, const Parameters &params, bool offspring=false)
fitness of population.
Definition: evaluation.cc:71
float score(const VectorXf &y_true, const shared_ptr< CLabels > &yhat, VectorXf &loss, const vector< float > &w)
Definition: scorer.cc:41
void set_backprop(bool bp)
set constant optimization options
Definition: feat.cc:385
int get_dim()
get dimensionality of best
Definition: feat.cc:567
int get_max_size()
return max size of programs
Definition: feat.cc:446
void calculate_stats(const DataRef &d)
calculate and print stats
Definition: feat.cc:1338
void set_selection(string sel)
set selection method
Definition: feat.cc:316
void set_root_xo_rate(float cross_rate)
set root xo rate in variation
Definition: feat.cc:333
ArrayXXf predict_proba(MatrixXf &X, LongData &Z)
predict probabilities of each class.
Definition: feat.cc:1280
void load(const json &j)
load Feat state from a json string.
Definition: feat.cc:1584
void set_random_state(int random_state)
set dimensionality as multiple of the number of columns
Definition: feat.cc:355
void set_corr_delete_mutate(bool s)
Definition: feat.cc:389
void update_stall_count(unsigned &stall_count, bool updated)
updates stall count for early stopping
Definition: feat.cc:754
string get_model(bool sort=true)
return best model, in tabular form
Definition: feat.cc:527
void set_gens(int gens)
set size of max generations
Definition: feat.cc:298
void set_split(float sp)
set train fraction of dataset
Definition: feat.cc:368
float score(MatrixXf &X, const VectorXf &y, LongData Z=LongData())
scoring function
Definition: feat.cc:1331
void load_best_ind(string filename)
load best_ind from file
Definition: feat.cc:1573
void set_dtypes(vector< char > dtypes)
set data types for input parameters
Definition: feat.cc:371
int save_pop
controls whether pop is printed each gen
Definition: feat.h:423
void set_erc(bool erc)
flag to set whether to use variable or constants for terminals
Definition: feat.cc:362
void save_to_file(string filename)
save Feat state to file.
Definition: feat.cc:1614
string starting_pop
file with starting population
Definition: feat.h:420
float simplify
post-run simplification
Definition: feat.h:425
void set_classification(bool classification)
set EProblemType for shogun
Definition: feat.cc:304
float get_split()
return fraction of data to use for training
Definition: feat.cc:464
void set_scorer(string s)
set scoring function
Definition: feat.cc:380
int get_max_depth()
return max_depth of programs
Definition: feat.cc:440
void set_fb(float fb)
set feedback
Definition: feat.cc:374
int best_complexity
complexity of the best model
Definition: feat.h:418
string get_ind_eqn(bool sort, Individual &ind)
return best model as a single line equation
Definition: feat.cc:480
void load_population(string filename, bool justfront=false)
load population from file, optionall just Pareto front
Definition: feat.cc:1579
void set_max_time(int time)
set max time in seconds for fit method
Definition: feat.cc:406
float min_loss
current best score
Definition: feat.h:415
void set_is_fitted(bool f)
set flag indicating whether fit has been called
Definition: feat.h:107
Population pop
population of programs
Definition: feat.h:405
json save() const
save and return a json Feat state as string.
Definition: feat.cc:1590
bool use_arch
internal control over use of archive
Definition: feat.h:411
string get_representation()
return best model
Definition: feat.cc:476
ArrayXf get_coefs()
return the coefficients or importance scores of the best model.
Definition: feat.cc:649
vector< float > univariate_initial_model(DataRef &d, int n_feats)
Definition: feat.cc:979
int get_n_params()
get number of parameters in best
Definition: feat.cc:564
vector< char > get_otypes()
return program output type ('f', 'b')
Definition: feat.cc:434
string get_scorer_()
Definition: feat.cc:381
void print_stats(std::ofstream &log, float fraction)
Definition: feat.cc:1431
void set_protected_groups(string pg)
set protected groups for fairness
Definition: feat.cc:410
void set_use_batch()
set flag to use batch for training
Definition: feat.cc:408
void set_simplify(float s)
Definition: feat.cc:387
void initial_model(DataRef &d)
method to fit inital ml model
Definition: feat.cc:1043
bool get_erc()
return boolean value of erc flag
Definition: feat.cc:452
int get_verbosity()
return current verbosity level set
Definition: feat.cc:437
void run_generation(unsigned int g, vector< size_t > survivors, DataRef &d, std::ofstream &log, float percentage, unsigned &stall_count)
Definition: feat.cc:675
MatrixXf transform(MatrixXf &X)
transform an input matrix using a program.
Definition: feat.cc:1142
bool val_from_arch
model selection only uses Pareto front
Definition: feat.h:424
bool get_classification()
return type of classification flag set
Definition: feat.cc:428
Individual best_ind
best individual
Definition: feat.h:421
int get_max_stall()
return maximum stall in learning, in generations
Definition: feat.cc:431
VectorXf predict_archive(int id, MatrixXf &X)
predict on unseen data from the whole archive
Definition: feat.cc:1195
nl::json get_stats()
return statistics from the run as a json string
Definition: feat.cc:1566
void load_from_file(string filename)
load Feat state from file.
Definition: feat.cc:1597
int get_complexity()
get dimensionality of best
Definition: feat.cc:570
bool update_best(const DataRef &d, bool val=false)
updates best score
Definition: feat.cc:1296
void final_model(DataRef &d)
fits final model to best transformation
Definition: feat.cc:770
Log_Stats stats
runtime stats
Definition: feat.h:426
int get_pop_size()
return population size
Definition: feat.cc:419
void set_iters(int iters)
Definition: feat.cc:393
string get_eqn(bool sort=false)
Definition: feat.cc:478
vector< nl::json > get_archive(bool front)
return population as string
Definition: feat.cc:577
void set_pop_size(int pop_size)
set size of population
Definition: feat.cc:295
void set_n_jobs(unsigned t)
set number of threads
Definition: feat.cc:404
Variation variator
variation operators
Definition: feat.h:408
string survival
stores survival mode
Definition: feat.h:412
void set_hillclimb(bool hc)
Definition: feat.cc:391
void set_otype(char ot)
set program output type ('f', 'b')
Definition: feat.cc:339
LongData get_Z(string s, int *idx, int idx_size)
get longitudinal data from file s
Definition: feat.cc:657
void set_ml(string ml)
set ML algorithm to use
Definition: feat.cc:301
int get_max_dim()
return max dimensionality of programs
Definition: feat.cc:449
void set_lr(float lr)
Definition: feat.cc:395
Evaluation evaluator
evaluation code
Definition: feat.h:407
void set_logfile(string s)
set name for files
Definition: feat.cc:377
void log_stats(std::ofstream &log)
Definition: feat.cc:1536
shared_ptr< CLabels > predict_labels(MatrixXf &X, LongData Z=LongData())
predict on unseen data. return CLabels.
Definition: feat.cc:1269
Selection selector
selection algorithm
Definition: feat.h:406
int get_gens()
return size of max generations
Definition: feat.cc:422
void set_shuffle(bool sh)
flag to shuffle the input samples for train/test splits
Definition: feat.cc:365
float min_loss_v
best validation score
Definition: feat.h:416
int get_n_nodes()
return the number of nodes in the best model
Definition: feat.cc:574
string get_logfile()
get name
Definition: feat.cc:455
string logfile
log filename
Definition: feat.h:422
Parameters params
hyperparameters of Feat
Definition: feat.h:401
string get_ml()
return ML algorithm string
Definition: feat.cc:425
void set_max_dim(unsigned int max_dim)
set maximum dimensionality of programs
Definition: feat.cc:349
int get_num_features()
return number of features
Definition: feat.cc:458
Archive archive
pareto front archive
Definition: feat.h:410
void set_max_depth(unsigned int max_depth)
set max depth of programs
Definition: feat.cc:343
void init()
initialize Feat object for fitting.
Definition: feat.cc:24
void set_verbosity(int verbosity)
set level of debug info
Definition: feat.cc:310
void set_survival(string surv)
set survivability
Definition: feat.cc:319
ArrayXXf predict_proba_archive(int id, MatrixXf &X, LongData &Z)
Definition: feat.cc:1247
void set_cross_rate(float cross_rate)
set cross rate in variation
Definition: feat.cc:326
void set_batch_size(int bs)
Definition: feat.cc:397
string get_scorer()
Definition: feat.cc:382
vector< char > get_dtypes()
return data types for input parameters
Definition: feat.cc:470
Selection survivor
survival algorithm
Definition: feat.h:409
float get_fb()
get feedback setting
Definition: feat.cc:473
Timer timer
start time of training
Definition: feat.h:403
bool get_shuffle()
return whether option to shuffle the data is set or not
Definition: feat.cc:461
float get_cross_rate()
return cross rate for variation
Definition: feat.cc:443
VectorXf predict(MatrixXf &X, LongData &Z)
predict on unseen data.
Definition: feat.cc:1184
void simplify_model(DataRef &d, Individual &)
simplifies final model to best transformation
Definition: feat.cc:786
Normalizer N
scales training data.
Definition: feat.h:413
void fit(MatrixXf &X, VectorXf &y)
train a model.
Definition: feat.cc:668
void set_max_stall(int max_stall)
set maximum stall in learning, in generations
Definition: feat.cc:313
class that specifies the machine learning algorithm to pair with Feat.
Definition: ml.h:80
vector< float > get_weights(bool norm_adjust=true) const
Definition: ml.cc:211
shared_ptr< CLabels > fit(const MatrixXf &X, const VectorXf &y, const Parameters &params, bool &pass, const vector< char > &dtypes=vector< char >())
Definition: ml.cc:282
individual programs in the population
Definition: individual.h:31
vector< string > get_features()
return vectorized representation of program
Definition: individual.cc:817
int size() const
return size of program
Definition: individual.cc:93
MatrixXf out(const Data &d, bool predict=false)
calculate program output matrix Phi
Definition: individual.cc:391
VectorXf yhat
current output
Definition: individual.h:35
string get_eqn()
return symbolic representation of program
Definition: individual.cc:748
ArrayXXf predict_proba(const Data &d)
Definition: individual.cc:293
unsigned id
tracking id
Definition: individual.h:53
MatrixXf Phi
transformation output of program
Definition: individual.h:34
int get_n_params()
get number of params in program
Definition: individual.cc:96
float fitness
aggregate fitness score
Definition: individual.h:38
NodeVector program
executable data structure
Definition: individual.h:33
shared_ptr< ML > ml
ML model, trained on Phi.
Definition: individual.h:37
void save(string filename)
save individual as a json object.
Definition: individual.cc:1050
shared_ptr< CLabels > predict(const Data &d)
Definition: individual.cc:271
shared_ptr< CLabels > fit(const Data &d, const Parameters &params, bool &pass)
fits an ML model to the data after transformation
Definition: individual.cc:234
void set_id(unsigned i)
Definition: individual.cc:112
unsigned int get_complexity() const
get the program complexity without updating it.
Definition: individual.cc:109
VectorXf predict_vector(const Data &d)
Definition: individual.cc:311
void load(string filename)
load individual from a file.
Definition: individual.cc:1061
unsigned int get_dim()
grab sub-tree locations given starting point.
Definition: individual.cc:873
shared_ptr< CLabels > fit_tune(const Data &d, const Parameters &params, bool set_default=false)
fits and tunes an ML model to the data after transformation
Definition: individual.cc:1019
string log(string m, int v, string sep="\n") const
print message with verbosity control.
Definition: logger.cc:54
static void destroy()
Definition: logger.cc:25
static void destroy()
Definition: rnd.cc:33
void set_seed(int new_seed)
Definition: rnd.cc:41
std::chrono::duration< float > Elapsed() const
Definition: utils.cc:211
void Reset()
Definition: utils.cc:207
void delete_mutate(Individual &child, const Parameters &params)
Definition: variation.cc:359
void vary(Population &pop, const vector< size_t > &parents, const Parameters &params, const Data &d)
method to handle variation of population
Definition: variation.cc:40
bool correlation_delete_mutate(Individual &child, MatrixXf Phi, const Parameters &params, const Data &d)
Definition: variation.cc:465
void set_cross_rate(float cr)
update cross rate
Definition: variation.cc:14
std::map< string, std::pair< vector< ArrayXf >, vector< ArrayXf > > > LongData
Definition: data.h:23
#define THROW_RUNTIME_ERROR(err)
Definition: error.h:30
#define THROW_INVALID_ARGUMENT(err)
Definition: error.h:31
void __attribute__((constructor)) ctor()
Definition: feat.cc:9
#define initialize_cuda()
Definition: feat.h:35
#define GPU
Definition: feat.h:34
#define omp_set_num_threads(x)
Definition: init.h:15
void my_handler(int s)
handle signals (ctr-c etc.)
Definition: error.cc:43
void load_partial_longitudinal(const std::string &path, std::map< string, std::pair< vector< ArrayXf >, vector< ArrayXf > > > &Z, char sep, const vector< int > &idx)
load partial longitudinal csv file into matrix according to idx vector
Definition: io.cc:175
static Logger & logger
Definition: logger.h:46
float median(const ArrayXf &v)
calculate median
Definition: utils.cc:89
vector< size_t > argsort(const vector< T > &v, bool ascending=true)
return indices that sort a vector
Definition: utils.h:81
void printProgress(float percentage)
outputs a progress bar, filled according to
Definition: io.cc:15
static Rnd & r
Definition: rnd.h:135
std::string to_string(const T &value)
template function to convert objects to string for logging
Definition: utils.h:422
main Feat namespace
Definition: data.cc:13
int i
Definition: params.cc:552
void from_json(const nl::json &, Feat &)
void to_json(nl::json &, const Feat &)
float learning_rate
Definition: params.h:86
bool use_batch
whether to use mini batch for training
Definition: params.h:68
bool backprop
turns on backpropagation
Definition: params.h:65
void set_terminals(int nf, const LongData &Z)
set the terminals with longitudinal data
Definition: params.cc:659
void set_sample_weights(VectorXf &y)
sets the weights of each sample (and class weights)
Definition: params.cc:749
vector< char > dtypes
data types of input parameters
Definition: params.h:55
unsigned int max_size
max size of programs (length)
Definition: params.h:48
void set_current_gen(int g)
sets current generation
Definition: params.cc:109
unsigned int max_dim
maximum dimensionality of programs
Definition: params.h:49
bool classification
flag to conduct classification rather than
Definition: params.h:32
int max_time
max time for fit method
Definition: params.h:67
int max_stall
maximum stall in learning, in generations
Definition: params.h:33
int n_jobs
number of parallel jobs
Definition: params.h:81
float cross_rate
cross rate for variation
Definition: params.h:58
float feedback
strength of ml feedback on probabilities
Definition: params.h:56
vector< float > class_weights
weights for each class
Definition: params.h:60
bool hillclimb
turns on parameter hill climbing
Definition: params.h:66
unsigned int n_classes
number of classes for classification
Definition: params.h:57
string scorer
loss function argument
Definition: params.h:62
void init(const MatrixXf &X, const VectorXf &y)
Definition: params.cc:82
void set_otype(char ot)
Definition: params.cc:223
bool erc
whether to include constants for terminals
Definition: params.h:50
void set_max_dim(unsigned int max_dim)
set maximum dimensionality of programs
Definition: params.cc:217
unsigned int max_depth
max depth of programs
Definition: params.h:47
int gens
max generations
Definition: params.h:29
string ml
machine learner used with Feat
Definition: params.h:31
void set_max_depth(unsigned int max_depth)
set max depth of programs
Definition: params.cc:210
int pop_size
population size
Definition: params.h:28
HC hc
stochastic hill climbing parameters
Definition: params.h:102
unsigned num_features
number of features
Definition: params.h:51
int verbosity
Definition: params.h:39
bool tune_initial
tune initial ML model
Definition: params.h:77
int current_gen
holds current generation
Definition: params.h:30
BP bp
backprop parameters
Definition: params.h:92
bool normalize
whether to normalize the input data
Definition: params.h:75
vector< string > objectives
Pareto objectives.
Definition: params.h:52
float root_xo_rate
crossover
Definition: params.h:73
NodeVector terminals
terminal nodes available in programs vector storing longitudinal data keys
Definition: params.h:43
void set_verbosity(int verbosity)
set level of debug info
Definition: params.cc:712
vector< bool > protected_groups
protected attributes in X
Definition: params.h:76
void set_protected_groups(string fn)
Definition: params.cc:386
vector< char > otypes
program output types ('f', 'b')
Definition: params.h:34
float split
fraction of data to use for training
Definition: params.h:54
bool shuffle
option to shuffle the data
Definition: params.h:53
void set_term_weights(const vector< float > &w)
sets weights for terminals.
Definition: params.cc:144
void set_scorer(string sc="", bool initialized=false)
sets scorer type
Definition: params.cc:112
bool tune_final
tune final ML model string of comma-delimited operator names, used to choose functions
Definition: params.h:78
string scorer_
actual loss function used, determined by scorer
Definition: params.h:63
bool corr_delete_mutate
use correlation delete mutation
Definition: params.h:72
int random_state
random seed
Definition: params.h:27
void update(const Population &pop, const Parameters &params)
Definition: archive.cc:76
vector< Individual > individuals
individual programs in the archive
Definition: archive.h:28
void set_objectives(vector< string > objectives)
Definition: archive.cc:14
vector< size_t > roots() const
returns indices of root nodes
Definition: nodevector.cc:55
size_t subtree(size_t i, char otype='0', string indent="> ") const
Definition: nodevector.cc:80
Defines a population of programs and functions for constructing them.
Definition: population.h:28
void load(string filename)
Definition: population.cc:165
void update(vector< size_t > survivors)
reduce programs to the indices in survivors.
Definition: population.cc:97
string print_eqns(bool just_offspring=false, string sep="\n")
return population equations.
Definition: population.cc:121
int size()
returns population size
Definition: population.cc:31
vector< size_t > sorted_front(unsigned)
return complexity-sorted Pareto front indices.
Definition: population.cc:135
void init(const Individual &starting_model, const Parameters &params, bool random=false, string filename="")
initialize population of programs with a starting model and/or from file
Definition: population.cc:38
vector< Individual > individuals
individual programs
Definition: population.h:29
void save(string filename)
Definition: population.cc:150
interfaces with selection operators.
Definition: selection.h:36
string get_type()
return type of selectionoperator
Definition: selection.cc:55
vector< size_t > survive(Population &pop, const Parameters &params, const Data &d)
perform survival
Definition: selection.cc:68
vector< size_t > select(Population &pop, const Parameters &params, const Data &d)
perform selection
Definition: selection.cc:61
vector< unsigned > med_size
Definition: utils.h:389
vector< float > med_loss_v
Definition: utils.h:388
vector< unsigned > med_num_params
Definition: utils.h:391
vector< unsigned > med_dim
Definition: utils.h:392
void update(int index, float timer_count, float bst_score, float bst_score_v, float md_score, float md_loss_v, unsigned md_size, unsigned md_complexity, unsigned md_num_params, unsigned md_dim)
Definition: utils.cc:274
vector< float > min_loss
Definition: utils.h:385
vector< unsigned > med_complexity
Definition: utils.h:390
vector< float > med_loss
Definition: utils.h:387
normalizes a matrix to unit variance, 0 mean centered.
Definition: utils.h:147
void fit_normalize(MatrixBase< T > &X, const vector< char > &dtypes)
fit then normalize
Definition: utils.h:315
void normalize(MatrixBase< T > &X) const
normalize matrix.
Definition: utils.h:191