Feat C++ API
A feature engineering automation tool
variation.cc
Go to the documentation of this file.
1 /* FEAT
2 copyright 2017 William La Cava
3 license: GNU/GPL v3
4 */
5 
6 #include "variation.h"
7 
8 namespace FT{
9 namespace Vary{
11 Variation::Variation(float cr): cross_rate(cr) {}
12 
15 {
16  cross_rate = cr;
17 }
18 
21 {
22  return cross_rate;
23 }
24 
27 
28 std::unique_ptr<Node> random_node(const NodeVector & v)
29 {
33  assert(v.size()>0 && " attemping to return random choice from empty vector");
34  std::vector<size_t> vi(v.size());
35  std::iota(vi.begin(), vi.end(), 0);
36  size_t idx = r.random_choice(vi);
37  return v.at(idx)->clone();
38 }
39 
40 void Variation::vary(Population& pop, const vector<size_t>& parents,
41  const Parameters& params, const Data& d)
42 {
52  unsigned start= pop.size();
53  pop.resize(2*params.pop_size);
54  #pragma omp parallel for
55  for (unsigned i = start; i<pop.size(); ++i)
56  {
57  // pass check for children undergoing variation
58  bool pass=false;
59  while (!pass)
60  {
61  Individual child; // new individual
62  child.set_id(params.current_gen*params.pop_size+i-start);
63 
64  if ( r() < cross_rate) // crossover
65  {
66  // get random mom and dad, make copies
67  Individual& mom = pop.individuals.at(r.random_choice(parents));
68  Individual& dad = pop.individuals.at(r.random_choice(parents));
69  /* int dad = r.random_choice(parents); */
70  // create child
71 
72  // perform crossover
73  logger.log("\n===\ncrossing\n" + mom.get_eqn() + "\nwith\n " +
74  dad.get_eqn() , 3);
75  logger.log("programs:\n" + mom.program_str() + "\nwith\n " +
76  dad.program_str() , 3);
77 
78  pass = cross(mom, dad, child, params, d);
79 
80  logger.log("crossing " + mom.get_eqn() + "\nwith\n " +
81  dad.get_eqn() + "\nproduced " + child.get_eqn() +
82  ", pass: " + std::to_string(pass) + "\n===\n",3);
83 
84  child.set_parents({mom, dad});
85  }
86  else // mutation
87  {
88  // get random mom
89  Individual& mom = pop.individuals.at(r.random_choice(parents));
90  /* int mom = r.random_choice(parents); */
91  // create child
92  /* #pragma omp critical */
93  /* { */
94  logger.log("mutating " + mom.get_eqn() + "(" +
95  mom.program_str() + ")", 3);
96  pass = mutate(mom,child,params,d);
97  logger.log("mutating " + mom.get_eqn() + " produced " +
98  child.get_eqn() + ", pass: " + std::to_string(pass),3);
99  /* } */
100  child.set_parents({mom});
101  }
102  if (pass)
103  {
104  assert(child.size()>0);
105  logger.log("assigning " + child.program_str() +
106  " to pop.individuals[" + std::to_string(i) + "]",3);
107 
108  pop.individuals.at(i) = child;
109  }
110  }
111  }
112 }
113 
114 bool Variation::mutate(const Individual& mom, Individual& child,
115  const Parameters& params, const Data& d)
116 {
127  // make child a copy of mom
128  mom.clone(child, false);
129  float rf = r();
130  if (rf < 1.0/3.0 && child.get_dim() > 1)
131  {
132  if (r() < 0.5)
133  {
134  logger.log("\tdeletion mutation",3);
135  delete_mutate(child,params);
136  }
137  else
138  {
139  if (params.corr_delete_mutate)
140  {
141  logger.log("\tcorrelation_delete_mutate",3);
142  bool perfect_correlation = correlation_delete_mutate(
143  child,mom.Phi,params,d);
144  }
145  else
146  {
147  logger.log("\tdelete_dimension_mutate",3);
148  delete_dimension_mutate(child, params);
149  }
150  }
151  assert(child.program.is_valid_program(params.num_features,
152  params.longitudinalMap));
153  }
154  else if (rf < 2.0/3.0 && child.size() < params.max_size)
155  {
156  logger.log("\tinsert mutation",3);
157  insert_mutate(child,params);
158  assert(child.program.is_valid_program(params.num_features,
159  params.longitudinalMap));
160  }
161  else
162  {
163  logger.log("\tpoint mutation",3);
164  point_mutate(child,params);
165  assert(child.program.is_valid_program(params.num_features,
166  params.longitudinalMap));
167  }
168 
169  // check child depth and dimensionality
170  return child.size()>0 && child.size() <= params.max_size
171  && child.get_dim() <= params.max_dim;
172 }
173 
174 void Variation::point_mutate(Individual& child, const Parameters& params)
175 {
181  float n = child.size();
182  unsigned i = 0;
183  // loop thru child's program
184  for (auto& p : child.program)
185  {
186  /* cout << "child.get_p(i): "; */
187  /* cout << child.get_p(i) << "\n"; */
188  if (r() < child.get_p(i)) // mutate p.
189  {
190  logger.log("\t\tmutating node " + p->name, 3);
191  NodeVector replacements; // potential replacements for p
192 
193  if (p->total_arity() > 0) // then it is an instruction
194  {
195  // find instructions with matching in/out types and arities
196  for (const auto& f: params.functions)
197  {
198  if (f->otype == p->otype &&
199  f->arity == p->arity)
200  /* f->arity.at('f')==p->arity.at('f') && */
201  /* f->arity.at('b')==p->arity.at('b') && */
202  /* f->arity.at('c')==p->arity.at('c') && */
203  /* f->arity.at('z')==p->arity.at('z')) */
204  replacements.push_back(f->rnd_clone());
205  }
206  }
207  else // otherwise it is a terminal
208  {
209  // TODO: add terminal weights here
210  // find terminals with matching output types
211 
212  for (const auto& t : params.terminals)
213  {
214  if (t->otype == p->otype)
215  replacements.push_back(t->clone());
216  }
217  }
218  // replace p with a random one
219  if (replacements.size() == 0)
220  {
221  WARN("WARNING: couldn't mutate " +
222  to_string(p->name)+ ", no valid replacements found\n");
223  }
224  else
225  p = random_node(replacements);
226  }
227  ++i;
228  }
229 
230 }
231 
233  const Parameters& params)
234 {
241  float n = child.size();
242 
243  if (r()<0.5 || child.get_dim() == params.max_dim)
244  {
245  // loop thru child's program
246  for (unsigned i = 0; i< child.program.size(); ++i)
247  {
248  // mutate with weighted probability
249  if (r() < child.get_p(i))
250  {
251  logger.log("\t\tinsert mutating node " +
252  child.program.at(i)->name + " with probability " +
253  std::to_string(child.get_p(i)), 3);
254  NodeVector insertion; // inserted segment
255  NodeVector fns; // potential fns
256 
257  // find instructions with matching output types and a
258  // matching arity to i
259  for (const auto& f: params.functions)
260  {
261  // find fns with matching output types that take
262  // this node type as arg
263  if (f->otype==child.program.at(i)->otype
264  && f->arity.at(child.program.at(i)->otype) > 0)
265  {
266  // make sure there are satisfactory types in
267  // terminals to fill fns' args
268  map<char,unsigned> fn_arity = f->arity;
269  --fn_arity.at(child.program.at(i)->otype);
270  bool valid_function = true;
271  for (auto kv : fn_arity)
272  {
273  if (kv.second > 0)
274  {
275  if (!in(params.dtypes, kv.first))
276  {
277  valid_function=false;
278  }
279  }
280 
281  }
282  if (valid_function)
283  fns.push_back(f->rnd_clone());
284  }
285  }
286 
287  if (fns.size()==0) // if no insertion functions match, skip
288  continue;
289 
290  // grab chosen node's subtree
291  int end = i;
292  int start = child.program.subtree(end);
293  logger.log("\t\tinsert mutation from " + to_string(end)
294  + " to " + to_string(start), 3);
295 
296 
297  // choose a function to insert
298  insertion.push_back(random_node(fns));
299  // now we need to manually construct a subtree with this
300  // insertion node, with the child program stiched in as an
301  // argument.
302  map<char, unsigned> insert_arity = insertion.back()->arity;
303 
304  // decrement function arity by one for child node
305  --insert_arity.at(child.program.at(i)->otype);
306 
307  vector<char> type_order = {'f','b','c','z'};
308  for (auto type : type_order)
309  {
310  // add the child now if type matches
311  if (child.program.at(i)->otype==type)
312  {
313  /* cout << "adding " << type << " child program: [" ; */
314  for (int k = end; k != start-1; --k)
315  {
316  /* cout << child.program.at(k)->name ; */
317  /* cout << " (k=" << k << ")\t"; */
318  insertion.push_back(child.program.at(k)->clone());
319  }
320  /* cout << "]\n"; */
321  }
322  // push back new arguments for the rest of the function
323  for (unsigned j = 0; j< insert_arity.at(type); ++j)
324  {
325  insertion.make_tree(params.functions,params.terminals,
326  0, params.term_weights,params.op_weights,
327  type, params.ttypes);
328  }
329  }
330  // post-fix notation
331  std::reverse(insertion.begin(),insertion.end());
332 
333  string s;
334  for (const auto& ins : insertion) s += ins->name + " ";
335  logger.log("\t\tinsertion: " + s + "\n", 3);
336  NodeVector new_program;
337  splice_programs(new_program,
338  child.program, start, end,
339  insertion, size_t(0), insertion.size()-1);
340  child.program=new_program;
341  i += insertion.size()-1;
342  }
343  /* std::cout << "i: " << i << "\n"; */
344  }
345  /* cout << child.get_eqn() << "\n"; */
346  }
347  else // add a dimension
348  {
349  NodeVector insertion; // new dimension
350  insertion.make_program(params.functions, params.terminals, 1,
351  params.term_weights,params.op_weights,1,
352  r.random_choice(params.otypes), params.longitudinalMap,
353  params.ttypes);
354  for (const auto& ip : insertion)
355  child.program.push_back(ip->clone());
356  }
357 }
358 
360  const Parameters& params)
361 {
367  logger.log("\t\tprogram: " + child.program_str(),4);
368  // loop thru child's program
369  for (unsigned i = 0; i< child.program.size(); ++i)
370  {
371  // mutate with weighted probability
372  if (child.program.subtree(i) != i && r() < child.get_p(i))
373  {
374  // get subtree indices of program to delete
375  size_t end = i;
376  size_t start = child.program.subtree(end);
377  string portion="";
378  for (int j=start; j<=end; ++j)
379  {
380  portion += child.program.at(j)->name + " ";
381  }
382  logger.log("\t\tdelete mutating [ " +
383  portion + " ] from program " +
384  child.program_str(), 4);
385 
386  NodeVector terms; // potential fns
387 
388  // find terminals with matching output types to i
389  for (const auto& t: params.terminals)
390  {
391  // find terms with matching output types that take
392  // this node type as arg
393  if ( t->otype==child.program.at(i)->otype )
394  {
395  terms.push_back(t->rnd_clone());
396  }
397  }
398 
399  if (terms.size()==0) // if no insertion terminals match, skip
400  {
401  logger.log("\t\tnevermind, couldn't find a matching terminal",
402  4);
403  continue;
404  }
405 
406  // choose a function to insert
407  std::unique_ptr<Node> insertion = random_node(terms);
408 
409  string s;
410  logger.log("\t\tinsertion: " + insertion->name + "\n", 4);
411 
412  // delete portion of program
413  if (logger.get_log_level() >=4)
414  {
415  std::string s="";
416  for (unsigned i = start; i<=end; ++i)
417  {
418  s+= child.program.at(i)->name + " ";
419  }
420  logger.log("\t\tdeleting " + std::to_string(start) + " to " +
421  std::to_string(end) + ": " + s, 4);
422  }
423  child.program.erase(child.program.begin()+start,
424  child.program.begin()+end+1);
425 
426  // insert the terminal that was chosen
427  child.program.insert(child.program.begin()+start,
428  insertion->clone());
429  logger.log("\t\tresult of delete mutation: " +
430  child.program_str(), 4);
431  continue;
432  }
433  /* std::cout << "i: " << i << "\n"; */
434  }
435 }
436 
438  const Parameters& params)
439 {
445  logger.log("\t\tprogram: " + child.program_str(),3);
446  vector<size_t> roots = child.program.roots();
447 
448  size_t end = r.random_choice(roots,child.p);
449  size_t start = child.program.subtree(end);
450  if (logger.get_log_level() >= 3)
451  {
452  std::string s="";
453  for (unsigned i = start; i<=end; ++i)
454  {
455  s+= child.program.at(i)->name + " ";
456  }
457  logger.log("\t\tdeleting " + std::to_string(start) + " to " +
458  std::to_string(end) + ": " + s, 3);
459  }
460  child.program.erase(child.program.begin()+start,
461  child.program.begin()+end+1);
462  logger.log("\t\tresult of delete mutation: " + child.program_str(), 3);
463 }
464 
466  MatrixXf Phi, const Parameters& params, const Data& d)
467 {
478  logger.log("\t\tprogram: " + child.program_str(),3);
479  // mean center features
480  for (int i = 0; i < Phi.rows(); ++i)
481  {
482  Phi.row(i) = Phi.row(i).array() - Phi.row(i).mean();
483  }
484  /* cout << "Phi: " << Phi.rows() << "x" << Phi.cols() << "\n"; */
485  // calculate highest pairwise correlation and store feature indices
486  float highest_corr = 0;
487  int f1=0, f2 = 0;
488  for (int i = 0; i < Phi.rows()-1; ++i)
489  {
490  for (int j = i+1; j < Phi.rows(); ++j)
491  {
492  float corr = pearson_correlation(Phi.row(i).array(),
493  Phi.row(j).array());
494 
495  /* cout << "correlation (" << i << "," << j << "): " */
496  /* << corr << "\n"; */
497 
498  if (corr > highest_corr)
499  {
500  highest_corr = corr;
501  f1 = i;
502  f2 = j;
503  }
504  }
505  }
506  logger.log("chosen pair: " + to_string(f1) + ", " + to_string(f2)
507  + "; corr = " + to_string(highest_corr), 3);
508  if (f1 == 0 && f2 == 0)
509  {
510  return false;
511  }
512  // pick the feature, f1 or f2, that is less correlated with y
513  float corr_f1 = pearson_correlation(d.y.array()-d.y.mean(),
514  Phi.row(f1).array());
515  float corr_f2 = pearson_correlation(d.y.array()-d.y.mean(),
516  Phi.row(f2).array());
517  logger.log( "corr (" + to_string(f1) + ", y): " + to_string(corr_f1), 3);
518  logger.log( "corr (" + to_string(f2) + ", y): " + to_string(corr_f2), 3);
519  int choice = corr_f1 <= corr_f2 ? f1 : f2;
520  logger.log( "chose (" + to_string(choice), 3);
521  // pick the subtree starting at roots(choice) and delete it
522  vector<size_t> roots = child.program.roots();
523  size_t end = roots.at(choice);
524  size_t start = child.program.subtree(end);
525  if (logger.get_log_level() >=3)
526  {
527  std::string s="";
528  for (unsigned i = start; i<=end; ++i)
529  s+= child.program.at(i)->name + " ";
530  logger.log("\t\tdeleting " + std::to_string(start) + " to " +
531  std::to_string(end) + ": " + s, 3);
532  }
533  child.program.erase(child.program.begin()+start,
534  child.program.begin()+end+1);
535 
536  logger.log("\t\tresult of corr delete mutation: "
537  + child.program_str(), 3);
538 
539  if (!child.program.is_valid_program(d.X.rows(),
540  params.longitudinalMap))
541  {
542  cout << "Error in correlation_delete_mutate: child is not a valid "
543  << "program.\n";
544  cout << child.program_str() << endl;
545  cout << child.get_eqn() << endl;
546  }
547 
548  return highest_corr > 0.999;
549 }
550 
551 bool Variation::cross(const Individual& mom, const Individual& dad,
552  Individual& child, const Parameters& params, const Data& d)
553 {
565  // some of the time, do subtree xo. the reset of the time,
566  // do some version of root crossover.
567  bool subtree = r() > params.root_xo_rate;
568  vector<size_t> mlocs, dlocs; // mom and dad locations for consideration
569  size_t i1, j1, i2, j2; // i1-j1: mom portion, i2-j2: dad portion
570 
571  if (subtree)
572  {
573  logger.log("\tsubtree xo",3);
574  // limit xo choices to matching output types in the programs.
575  vector<char> d_otypes;
576  for (const auto& p : dad.program)
577  if(!in(d_otypes,p->otype))
578  d_otypes.push_back(p->otype);
579 
580  // get valid subtree locations
581  for (size_t i =0; i<mom.size(); ++i)
582  if (in(d_otypes,mom.program.at(i)->otype))
583  mlocs.push_back(i);
584  // mom and dad have no overlapping types, can't cross
585  if (mlocs.size()==0)
586  {
587  logger.log("WARNING: no overlapping types between " +
588  mom.program_str() + "," + dad.program_str() + "\n", 3);
589  return 0;
590  }
591  j1 = r.random_choice(mlocs,mom.get_p(mlocs,true));
592 
593  // get locations in dad's program that match the subtree type picked
594  // from mom
595  for (size_t i =0; i<dad.size(); ++i)
596  {
597  if (dad.program.at(i)->otype == mom.program.at(j1)->otype)
598  dlocs.push_back(i);
599  }
600  }
601  else // half the time, pick a root node
602  {
603  if (params.residual_xo)
604  return residual_cross(mom, dad, child, params, d);
605  else if (params.stagewise_xo)
606  return stagewise_cross(mom, dad, child, params, d);
607  else
608  {
609  logger.log("\troot xo",3);
610  mlocs = mom.program.roots();
611  dlocs = dad.program.roots();
612  logger.log("\t\trandom choice mlocs (size "+
613  std::to_string(mlocs.size())+"), p size: "+
614  std::to_string(mom.p.size()),3);
615  // weighted probability choice
616  j1 = r.random_choice(mlocs,mom.get_p(mlocs));
617  }
618  }
619  /* cout << "mom subtree\t" << mom.program_str() << " starting at " */
620  /* << j1 << "\n"; */
621  // get subtree
622  i1 = mom.program.subtree(j1);
623  /* cout << "mom i1: " << i1 << endl; */
624  /* cout << "dad subtree\n" << dad.program_str() << "\n"; */
625  /* cout << "dad subtree\n"; */
626  // get dad subtree
627  j2 = r.random_choice(dlocs);
628  i2 = dad.program.subtree(j2);
629 
630  /* cout << "splice programs\n"; */
631  // make child program by splicing mom and dad
632  splice_programs(child.program, mom.program, i1, j1, dad.program, i2, j2 );
633 
634  if (logger.get_log_level() >= 3)
635  print_cross(mom,i1,j1,dad,i2,j2,child);
636 
637  assert(child.program.is_valid_program(params.num_features,
638  params.longitudinalMap));
639  // check child depth and dimensionality
640  return (child.size() > 0 && child.size() <= params.max_size
641  && child.get_dim() <= params.max_dim);
642 }
643 
645 bool Variation::residual_cross(const Individual& mom, const Individual& dad,
646  Individual& child, const Parameters& params, const Data& d)
647 {
660  logger.log("\tresidual xo",3);
661  vector<size_t> mlocs, dlocs; // mom and dad locations for consideration
662  // i1-j1: mom portion, i2-j2: dad portion
663  size_t i1, j1, j1_idx, i2, j2;
664 
665  mlocs = mom.program.roots();
666  vector<int> mlocs_indices(mlocs.size());
667  std::iota(mlocs_indices.begin(),mlocs_indices.end(),0);
668 
669  dlocs = dad.program.roots();
670  logger.log("\t\trandom choice mlocs (size "+
671  std::to_string(mlocs.size())+"), p size: "+
672  std::to_string(mom.p.size()),3);
673  // weighted probability choice
674  j1_idx = r.random_choice(mlocs_indices,mom.get_p(mlocs));
675  j1 = mlocs.at(j1_idx);
676  // get subtree
677  i1 = mom.program.subtree(j1);
678 
679  // get dad subtree
680  // choose root in dad that is most correlated with the residual of
681  // j2 = index in dad.Phi that maximizes R2(d.y, w*mom.Phi - w_i1*Phi_i1)
682  /* cout << "mom: " << mom.get_eqn() << "\n"; */
683  /* cout << "mom yhat: " << mom.yhat.transpose() << "\n"; */
684  VectorXf tree = (mom.ml->get_weights().at(j1_idx) *
685  mom.Phi.row(j1_idx).array());
686  /* cout << "tree (idx=" << j1_idx << "): " << tree.transpose() << "\n"; */
687  VectorXf mom_pred_minus_tree = mom.yhat - tree;
688  /* #pragma omp critical */
689  /* { */
690  /* VectorXf mom_pred_minus_tree = mom.predict_drop(d,params,j1_idx); */
691  /* } */
692  /* cout << "mom_pred_minus_tree: " << mom_pred_minus_tree.transpose()
693  * << "\n"; */
694  VectorXf mom_residual = d.y - mom_pred_minus_tree;
695  /* cout << "mom_residual: " << mom_residual.transpose() << "\n"; */
696 
697  // get correlations of dad's features with the residual from mom,
698  // less the swap choice
699  vector<float> corrs(dad.Phi.rows());
700  int best_corr_idx = 0;
701  float best_corr = -1;
702  float corr;
703 
704  for (int i = 0; i < dad.Phi.rows(); ++i)
705  {
706  corr = pearson_correlation(mom_residual.array(),
707  dad.Phi.row(i).array());
708  /* cout << "corr( " << i << "): " << corr << "\n"; */
709  corrs.at(i) = corr; // this can be removed
710  if (corr > best_corr )
711  {
712  best_corr_idx = i;
713  best_corr = corr;
714  }
715  }
716  /* cout << "best_corr_idx: " << best_corr_idx << ", R^2: "
717  * << best_corr << "\n"; */
718  /* cout << "corrs: "; */
719  /* for (auto c : corrs) cout << c << ", "; cout << "\n"; */
720  /* cout << "chose corr at " << best_corr_idx << "\n"; */
721  j2 = dlocs.at(best_corr_idx);
722  i2 = dad.program.subtree(j2);
723 
724  if (logger.get_log_level() >= 3)
725  print_cross(mom,i1,j1,dad,i2,j2,child,false);
726 
727  // make child program by splicing mom and dad
728  splice_programs(child.program, mom.program, i1, j1, dad.program, i2, j2 );
729 
730  if (logger.get_log_level() >= 3)
731  print_cross(mom,i1,j1,dad,i2,j2,child,true);
732 
733 
734  assert(child.program.is_valid_program(params.num_features,
735  params.longitudinalMap));
736  // check child depth and dimensionality
737  return child.size()>0 && child.size() <= params.max_size
738  && child.get_dim() <= params.max_dim;
739 
740 }
741 
743 bool Variation::stagewise_cross(const Individual& mom, const Individual& dad,
744  Individual& child, const Parameters& params, const Data& d)
745 {
767  logger.log("\tstagewise xo",3);
768  // normalize the residual
769  VectorXf R = d.y.array() - d.y.mean();
770  /* cout << "R: " << R.norm() << "\n"; */
771  if (mom.Phi.cols() != dad.Phi.cols())
772  {
773  cout << "!!WARNING!! mom.Phi.cols() = " << mom.Phi.cols()
774  << " and dad.Phi.cols() = " << dad.Phi.cols() << "\n";
775  cout << " d.y size: " << d.y.size() << "\n";
776  cout << "mom: " << mom.program_str() << "\n";
777  cout << "dad: " << dad.program_str() << "\n";
778  return false;
779  }
780  MatrixXf PhiA(mom.Phi.rows()+dad.Phi.rows(), mom.Phi.cols());
781  PhiA << mom.Phi,
782  dad.Phi;
783  /* cout << "mom Phi: " << mom.Phi.rows() << "x" << mom.Phi.cols() << "\n"; */
784  /* cout << "dad Phi: " << dad.Phi.rows() << "x" << dad.Phi.cols() << "\n"; */
785  /* cout << "PhiA: " << PhiA.rows() << "x" << PhiA.cols() << "\n"; */
786  // normalize Phi
787  for (int i = 0; i < PhiA.rows(); ++i)
788  {
789  PhiA.row(i) = PhiA.row(i).array() - PhiA.row(i).mean();
790  /*cout << "PhiA( " << i << ").mean(): " << PhiA.row(i).mean() << "\n";*/
791  }
792  vector<int> sel_idx;
793  float best_corr_idx;
794  unsigned nsel = 0;
795  float deltaR = 1; // keep track of changes to the residual
796  // only keep going when residual is reduced by at least tol
797  float tol = 0.01;
798  /* cout << "R norm\t\tdeltaR\n"; */
799 
800  bool condition = true;
801  while (condition)
802  {
803  float best_corr = -1;
804  float corr;
805  // correlation of PhiA with residual
806  // pick most correlated (store index)
807  for (int i = 0; i < PhiA.rows(); ++i)
808  {
809  if (!in(sel_idx,i))
810  {
811  corr = pearson_correlation(R.array(), PhiA.row(i).array());
812  /* cout << "corr( " << i << "): " << corr << "\n"; */
813  if (corr > best_corr)
814  {
815  best_corr_idx = i;
816  best_corr = corr;
817  }
818  }
819  }
820  if (best_corr > 0)
821  {
822  /* cout << "best_corr_idx: " << best_corr_idx << ", R^2: "
823  * << best_corr << "\n"; */
824  // least squares coefficient of phi* w.r.t. R
825  float b = (covariance(PhiA.row(best_corr_idx),R) /
826  variance(PhiA.row(best_corr_idx)));
827  /* cout << "b: " << b << "\n"; */
828  /* cout << "b*phi: " << b*PhiA.row(best_corr_idx) << "\n"; */
829  deltaR = R.norm();
830  R = R - b*PhiA.row(best_corr_idx).transpose();
831  deltaR = (deltaR - R.norm()) / deltaR;
832  /* cout << "R: " << R.transpose() << "\n"; */
833  /* cout << R.norm() << "\t\t" << deltaR << "\n"; */
834  // select best correlation index
835  if (!params.stagewise_xo_tol || deltaR >= tol)
836  {
837  sel_idx.push_back(best_corr_idx);
838  }
839  }
840  ++nsel;
841  /* if (deltaR < tol) */
842  /* { */
843  /* cout << "!!!!!!!!!!!!!!!\n!!!!!!!!!!!!!!!!!\n!!!!!!!!!!!!!!" */
844  /* << "\nHAH! I caught you, fiend!\n" */
845  /* << deltaR << " < " << tol << "\n"; */
846 
847  /* } */
848  if (params.stagewise_xo_tol)
849  {
850  condition = (deltaR > tol
851  && nsel <= (mom.Phi.rows() + dad.Phi.rows()));
852  }
853  else
854  {
855  condition = nsel < mom.Phi.rows() ;
856  }
857  /* cout << "condition: " << condition << "\n"; */
858  }
859 
860  // compose a child from each feature referenced in sel_idx.
861  vector<size_t> mlocs, dlocs; // mom and dad locations for consideration
862 
863  mlocs = mom.program.roots();
864  dlocs = dad.program.roots();
865  /* cout << "mlocs size: " << mlocs.size() << ", dlocs size: "
866  * << dlocs.size() << "\n"; */
867  child.program.clear();
868 
869  for (int idx : sel_idx)
870  {
871  /* cout << "idx: " << idx << "\n"; */
872 
873  if (idx < mom.Phi.rows())
874  {
875  int stop = mlocs.at(idx);
876  // get subtree indices
877  int start = mom.program.subtree(stop);
878  // construct child program
879  /* cout << "adding mom.program (len= " << mom.program.size()
880  * << ") from " << start */
881  /* << " to " << stop << "\n"; */
882  for (unsigned i = start; i <= stop ; ++i)
883  {
884  child.program.push_back(mom.program.at(i)->clone());
885  }
886  }
887  else
888  {
889  int stop = dlocs.at(idx - mom.Phi.rows());
890  // get subtree indices
891  int start = dad.program.subtree(stop);
892  // construct child program
893  /* cout << "adding dad.program (len= " << dad.program.size()
894  * << ") from " << start */
895  /* << " to " << stop << "\n"; */
896  for (unsigned i = start; i <= stop ; ++i)
897  {
898  child.program.push_back(dad.program.at(i)->clone());
899  }
900  }
901 
902  }
903 
904  /* cout << "child program size: " << child.program.size() << "\n"; */
905  /* if (logger.get_log_level() >= 3) */
906  /* print_cross(mom,i1,j1,dad,i2,j2,child,false); */
907 
908  /* // make child program by splicing mom and dad */
909  /* splice_programs(child.program, mom.program, i1, j1, dad.program,
910  * i2, j2 ); */
911 
912  /* if (logger.get_log_level() >= 3) */
913  /* print_cross(mom,i1,j1,dad,i2,j2,child,true); */
914 
915  /* cout << "asserting validity\n"; */
916  assert(child.program.is_valid_program(params.num_features,
917  params.longitudinalMap));
918  /* cout << "returning \n"; */
919  // check child depth and dimensionality
920  return child.size()>0 && child.size() <= params.max_size
921  && child.get_dim() <= params.max_dim;
922 
923 }
924 // swap vector subsets with different sizes.
926  const NodeVector& v1, size_t i1, size_t j1,
927  const NodeVector& v2, size_t i2, size_t j2)
928 {
942  logger.log("splice_programs",3);
943  if (i1 >= v1.size())
944  cout << "i1 ( " << i1 << ") >= v1 size (" << v1.size() << ")\n";
945  if (i2 >= v2.size())
946  cout << "i2 ( " << i2 << ") >= v2 size (" << v2.size() << ")\n";
947  if (j1+1 < 0)
948  cout << "j1+1 < 0 (j1 = " << j1 << endl;
949  if (j2+1 < 0)
950  cout << "j2+1 < 0 (j2 = " << j2 << endl;
951  // size difference between subtrees
952  // beginning of v1
953  try
954  {
955  for (unsigned i = 0; i < i1 ; ++i)
956  vnew.push_back(v1.at(i)->clone());
957  // spliced in v2 portion
958  for (unsigned i = i2; i <= j2 ; ++i)
959  vnew.push_back(v2.at(i)->clone());
960  // end of v1
961  for (unsigned i = j1+1; i < v1.size() ; ++i)
962  vnew.push_back(v1.at(i)->clone());
963  }
964  catch (std::bad_alloc& ba)
965  {
966  std::cerr << "bad_alloc caught: " << ba.what() << "\n";
967  }
968 }
969 
970 void Variation::print_cross(const Individual& mom, size_t i1, size_t j1,
971  const Individual& dad, size_t i2, size_t j2, Individual& child, bool after)
972 {
973  std::cout << "\t\tattempting the following crossover:\n\t\t";
974  for (int i =0; i<mom.program.size(); ++i){
975  if (i== i1)
976  std::cout << "[";
977  std::cout << mom.program.at(i)->name << " ";
978  if (i==j1)
979  std::cout <<"]";
980  }
981  std::cout << "\n\t\t";
982 
983  for (int i =0; i<dad.program.size(); ++i){
984  if (i== i2)
985  std::cout << "[";
986  std::cout << dad.program.at(i)->name << " ";
987  if (i==j2)
988  std::cout <<"]";
989  }
990  std::cout << "\n\t\t";
991  if (after)
992  {
993  std::cout << "child after cross: ";
994  for (unsigned i = 0; i< child.program.size(); ++i){
995  if (i==i1) std::cout << "[";
996  std::cout << child.program.at(i)->name << " ";
997  if (i==i1+j2-i2) std::cout << "]";
998  }
999  std::cout << "\n";
1000  }
1001 }
1002 
1003 }
1004 }
data holding X, y, and Z data
Definition: data.h:42
VectorXf & y
Definition: data.h:46
MatrixXf & X
Definition: data.h:45
individual programs in the population
Definition: individual.h:31
void set_parents(const vector< Individual > &parents)
set parent ids using parents
Definition: individual.cc:114
int size() const
return size of program
Definition: individual.cc:93
vector< float > p
probability of variation of subprograms
Definition: individual.h:43
VectorXf yhat
current output
Definition: individual.h:35
string get_eqn()
return symbolic representation of program
Definition: individual.cc:748
void clone(Individual &cpy, bool sameid=true) const
clone this individual
Definition: individual.cc:82
MatrixXf Phi
transformation output of program
Definition: individual.h:34
NodeVector program
executable data structure
Definition: individual.h:33
shared_ptr< ML > ml
ML model, trained on Phi.
Definition: individual.h:37
string program_str() const
return program name list
Definition: individual.cc:982
vector< float > get_p() const
get probabilities of variation
Definition: individual.cc:122
void set_id(unsigned i)
Definition: individual.cc:112
unsigned int get_dim()
grab sub-tree locations given starting point.
Definition: individual.cc:873
int get_log_level()
Definition: logger.cc:48
string log(string m, int v, string sep="\n") const
print message with verbosity control.
Definition: logger.cc:54
T random_choice(const vector< T > &v)
Definition: rnd.h:73
float get_cross_rate()
return current cross rate
Definition: variation.cc:20
void print_cross(const Individual &, size_t, size_t, const Individual &, size_t, size_t, Individual &, bool after=true)
debugging printout of crossover operation.
Definition: variation.cc:970
bool cross(const Individual &mom, const Individual &dad, Individual &child, const Parameters &params, const Data &d)
crossover
Definition: variation.cc:551
~Variation()
destructor
Definition: variation.cc:26
void delete_mutate(Individual &child, const Parameters &params)
Definition: variation.cc:359
void splice_programs(NodeVector &vnew, const NodeVector &v1, size_t i1, size_t j1, const NodeVector &v2, size_t i2, size_t j2)
splice two programs together
Definition: variation.cc:925
void vary(Population &pop, const vector< size_t > &parents, const Parameters &params, const Data &d)
method to handle variation of population
Definition: variation.cc:40
bool correlation_delete_mutate(Individual &child, MatrixXf Phi, const Parameters &params, const Data &d)
Definition: variation.cc:465
bool mutate(const Individual &mom, Individual &child, const Parameters &params, const Data &d)
mutation
Definition: variation.cc:114
void set_cross_rate(float cr)
update cross rate
Definition: variation.cc:14
float cross_rate
fraction of crossover in total variation
Definition: variation.h:96
void insert_mutate(Individual &child, const Parameters &params)
Definition: variation.cc:232
bool stagewise_cross(const Individual &mom, const Individual &dad, Individual &child, const Parameters &params, const Data &d)
stagewise crossover
Definition: variation.cc:743
void delete_dimension_mutate(Individual &child, const Parameters &params)
Definition: variation.cc:437
bool residual_cross(const Individual &mom, const Individual &dad, Individual &child, const Parameters &params, const Data &d)
residual crossover
Definition: variation.cc:645
void point_mutate(Individual &child, const Parameters &params)
Definition: variation.cc:174
#define WARN(err)
Definition: error.h:33
T pop(vector< T > *v)
Definition: auto_backprop.h:49
float covariance(const ArrayXf &x, const ArrayXf &y)
covariance of x and y
Definition: utils.cc:164
float pearson_correlation(const ArrayXf &x, const ArrayXf &y)
the normalized covariance of x and y
Definition: utils.cc:184
static Logger & logger
Definition: logger.h:46
bool in(const vector< T > v, const T &i)
check if element is in vector.
Definition: utils.h:47
static Rnd & r
Definition: rnd.h:135
std::string to_string(const T &value)
template function to convert objects to string for logging
Definition: utils.h:422
float variance(const ArrayXf &v, float mean)
calculate variance when mean provided
Definition: utils.cc:127
std::unique_ptr< Node > random_node(const NodeVector &v)
Definition: variation.cc:28
main Feat namespace
Definition: data.cc:13
int i
Definition: params.cc:552
holds the hyperparameters for Feat.
Definition: params.h:25
vector< char > dtypes
data types of input parameters
Definition: params.h:55
unsigned int max_size
max size of programs (length)
Definition: params.h:48
unsigned int max_dim
maximum dimensionality of programs
Definition: params.h:49
bool stagewise_xo_tol
use stagewise crossover
Definition: params.h:71
NodeVector functions
function nodes available in programs
Definition: params.h:42
bool stagewise_xo
use stagewise crossover
Definition: params.h:70
bool residual_xo
use residual crossover
Definition: params.h:69
int pop_size
population size
Definition: params.h:28
vector< float > term_weights
probability weighting of terminals
Definition: params.h:40
unsigned num_features
number of features
Definition: params.h:51
int current_gen
holds current generation
Definition: params.h:30
vector< std::string > longitudinalMap
Definition: params.h:45
float root_xo_rate
crossover
Definition: params.h:73
NodeVector terminals
terminal nodes available in programs vector storing longitudinal data keys
Definition: params.h:43
vector< char > ttypes
program terminal types ('f', 'b')
Definition: params.h:35
vector< char > otypes
program output types ('f', 'b')
Definition: params.h:34
vector< float > op_weights
probability weighting of functions
Definition: params.h:41
bool corr_delete_mutate
use correlation delete mutation
Definition: params.h:72
an extension of a vector of unique pointers to nodes
Definition: nodevector.h:23
vector< size_t > roots() const
returns indices of root nodes
Definition: nodevector.cc:55
void make_program(const NodeVector &functions, const NodeVector &terminals, int max_d, const vector< float > &term_weights, const vector< float > &op_weights, int dim, char otype, vector< string > longitudinalMap, const vector< char > &term_types)
Definition: nodevector.cc:368
bool is_valid_program(unsigned num_features, vector< string > longitudinalMap)
Definition: nodevector.cc:202
void make_tree(const NodeVector &functions, const NodeVector &terminals, int max_d, const vector< float > &term_weights, const vector< float > &op_weights, char otype, const vector< char > &term_types)
Definition: nodevector.cc:243
size_t subtree(size_t i, char otype='0', string indent="> ") const
Definition: nodevector.cc:80
Defines a population of programs and functions for constructing them.
Definition: population.h:28