Feat C++ API
A feature engineering automation tool
nodevector.cc
Go to the documentation of this file.
1 /* FEAT
2 copyright 2017 William La Cava
3 license: GNU/GPL v3
4 */
5 
6 #include "nodevector.h"
7 using namespace FT::Pop::Op;
8 namespace FT{
9 
10 namespace Pop{
11 
12 NodeVector::NodeVector() = default;
13 
14 NodeVector::~NodeVector() = default;
15 
16 NodeVector::NodeVector(NodeVector && other) = default;
17 
18 NodeVector& NodeVector::operator=(NodeVector && other) = default;
19 
21 {
22  /* std::cout<<"in NodeVector(const NodeVector& other)\n"; */
23  this->resize(0);
24  for (const auto& p : other)
25  this->push_back(p->clone());
26 }
27 
29 {
30 
31  /* std::cout << "in NodeVector& operator=(NodeVector const& other)\n"; */
32  this->resize(0);
33  for (const auto& p : other)
34  this->push_back(p->clone());
35  return *this;
36 }
37 
38 vector<Node*> NodeVector::get_data(int start,int end)
39 {
40  vector<Node*> v;
41  if (end == 0)
42  {
43  if (start == 0)
44  end = this->size();
45  else
46  end = start;
47  }
48  for (unsigned i = start; i<=end; ++i)
49  v.push_back(this->at(i).get());
50 
51  return v;
52 }
53 
55 vector<size_t> NodeVector::roots() const
56 {
57  // find "root" nodes of program, where roots are final values that output
58  // something directly to the state
59  // assumes a program's subtrees to be contiguous
60 
61  vector<size_t> indices; // returned root indices
62  int total_arity = -1; //end node is always a root
63  for (size_t i = this->size(); i>0; --i) // reverse loop thru program
64  {
65  if (total_arity <= 0 ){ // root node
66  indices.push_back(i-1);
67  total_arity=0;
68  }
69  else
70  --total_arity;
71 
72  total_arity += this->at(i-1)->total_arity();
73 
74  }
75 
76  std::reverse(indices.begin(), indices.end());
77  return indices;
78 }
79 
80 size_t NodeVector::subtree(size_t i, char otype, string indent) const
81 {
82 
97  size_t tmp = i;
98  if (i<0 || i > this->size())
99  THROW_LENGTH_ERROR("Attempting got grab subtree with index "
100  + to_string(i) + " and program size "
101  + to_string(this->size()));
102 
103  /* cout << indent << "getting subtree(" << i << "," */
104  /* << otype << ") for " << this->at(i)->name */
105  /* << " of type " << this->at(i)->otype << endl; */
106  // return this index if it is a terminal
107  if (this->at(i)->total_arity()==0)
108  {
109  return i;
110  }
111 
112  std::map<char, unsigned int> arity = this->at(i)->arity;
113 
114  /* cout << indent << "otype: " << otype << endl; */
115 
116  if (otype!='0') // if we are recursing (otype!='0'), we need to find
117  // where the nodes to recurse are.
118  {
119  while (i>0 && this->at(i)->otype != otype) --i;
120 
121  if (this->at(i)->otype != otype)
122  THROW_INVALID_ARGUMENT("invalid subtree arguments");
123  }
124 
125  /* cout << indent << "i at 125: " << i << "\n"; */
126  // recurse for floating arguments
127  for (unsigned int j = 0; j<arity.at('f'); ++j)
128  i = subtree(--i,'f', indent+indent);
129  /* cout << indent << "i at 129: " << i << "\n"; */
130  // recurse for boolean
131  size_t i2 = i;
132  for (unsigned int j = 0; j<arity.at('b'); ++j)
133  i2 = subtree(--i2,'b', indent+indent);
134  /* cout << indent << "i2 at 134: " << i2 << "\n"; */
135  // recurse for categorical arguments
136  size_t i3 = i2;
137  for (unsigned int j = 0; j<arity.at('c'); ++j)
138  i3 = subtree(--i3,'c', indent+indent);
139  /* cout << indent << "i3 at 139: " << i3 << "\n"; */
140  // recurse for longitudinal arguments
141  size_t i4 = i3;
142  for (unsigned int j = 0; j<arity.at('z'); ++j)
143  i4 = subtree(--i4,'z', indent+indent);
144  /* cout << indent << "i4 at 145: " << i4 << "\n"; */
145  /* cout << indent << "returning min(" << i << "," << i4 << ")\n"; */
146 
147  return std::min(i,i4);
148 }
149 
150 void NodeVector::set_weights(vector<vector<float>>& weights)
151 {
152  if (weights.size()==0) return;
153  int count = 0;
154  /* int dx_node_count = 0; */
155  /* for (unsigned i = 0; i< this->size(); ++i) */
156  /* { */
157  /* if (this->at(i)->isNodeDx()) */
158  /* { */
159  /* ++dx_node_count; */
160  /* } */
161  /* } */
162  /* if (weights.size() != dx_node_count) */
163  /* {} */
164  for (unsigned i = 0; i< this->size(); ++i)
165  {
166  if (this->at(i)->isNodeDx())
167  {
168  NodeDx* nd = dynamic_cast<NodeDx*>(this->at(i).get());
169 
170  if (weights.at(count).size() == nd->W.size())
171  nd->W = weights.at(count);
172  else
173  {
174  string error = "mismatch in size btw weights[" +
175  to_string(count) +
176  "] and W\n";
177 
178  error += "weights[" + to_string(count) +
179  "].size() (" + to_string(weights.at(count).size()) +
180  ") != W.size() ("+ to_string(nd->W.size()) + "\n";
181 
182  THROW_LENGTH_ERROR(error);
183  }
184  ++count;
185  }
186  }
187 }
188 
189 vector<vector<float>> NodeVector::get_weights()
190 {
191  vector<vector<float>> weights;
192  for (unsigned i = 0; i< this->size(); ++i)
193  {
194  if (this->at(i)->isNodeDx())
195  {
196  weights.push_back(dynamic_cast<NodeDx*>(this->at(i).get())->W);
197  }
198  }
199  return weights;
200 }
201 
202 bool NodeVector::is_valid_program(unsigned num_features,
203  vector<string> longitudinalMap)
204 {
206  State state;
207 
208  std::map<string, std::pair<vector<ArrayXf>, vector<ArrayXf>>> Z;
209 
210  MatrixXf X = MatrixXf::Zero(num_features,2);
211  VectorXf y = VectorXf::Zero(2);
212 
213  for(auto key : longitudinalMap)
214  {
215  Z[key].first.push_back(ArrayXf::Zero(2));
216  Z[key].first.push_back(ArrayXf::Zero(2));
217  Z[key].second.push_back(ArrayXf::Zero(2));
218  Z[key].second.push_back(ArrayXf::Zero(2));
219  }
220 
221  Data data(X, y, Z, false);
222 
223  unsigned i = 0;
224  for (const auto& n : *this){
225  // learning nodes are set for fit or predict mode
226  if (n->isNodeTrain())
227  dynamic_cast<NodeTrain*>(n.get())->train = false;
228  if (state.check(n->arity))
229  n->evaluate(data, state);
230  else
231  {
232  std::cout << "Error: ";
233  for (const auto& p: *this) std::cout << p->name << " ";
234  std::cout << "is not a valid program because ";
235  std::cout << n->name << " at pos " << i << "is not satisfied\n";
236  return false;
237  }
238  ++i;
239  }
240  return true;
241 }
242 
243 void NodeVector::make_tree(const NodeVector& functions,
244  const NodeVector& terminals, int max_d,
245  const vector<float>& term_weights,
246  const vector<float>& op_weights,
247  char otype, const vector<char>& term_types)
248 {
249 
253  // debugging output
254  /* #ifdef NDEBUG */
255  /* #else */
256  /* // debug code */
257  /* std::cout << "current program: ["; */
258  /* for (const auto& p : *(this) ) std::cout << p->name << " "; */
259  /* std::cout << "]\n"; */
260  /* std::cout << "otype: " << otype << "\n"; */
261  /* std::cout << "max_d: " << max_d << "\n"; */
262  /* #endif */
263 
264  if (max_d == 0 || r.rnd_flt() < terminals.size()/(terminals.size()+functions.size()))
265  {
266  // append terminal
267  vector<size_t> ti; // indices of valid terminals
268  vector<float> tw; // weights of valid terminals
269  /* cout << "terminals: " ; */
270  /* for (const auto& t : terminals) cout << t->name << "(" << t->otype << "),"; */
271  /* cout << "\n"; */
272 
273  for (size_t i = 0; i<terminals.size(); ++i)
274  {
275  if (terminals.at(i)->otype == otype) // grab terminals matching output type
276  {
277  ti.push_back(i);
278  tw.push_back(term_weights.at(i));
279  }
280 
281  }
282  /* cout << "valid terminals: "; */
283  /* for (unsigned i = 0; i < ti.size(); ++i) */
284  /* cout << terminals.at(ti.at(i))->name << "(" << terminals.at(ti.at(i))->otype << ", " */
285  /* << tw.at(i) << "), "; */
286  /* cout << "\n"; */
287 
288  if(ti.size() > 0 && tw.size() > 0)
289  {
290  auto t = terminals.at(r.random_choice(ti,tw))->clone();
291  /* std::cout << "chose " << t->name << " "; */
292  push_back(t->rnd_clone());
293  }
294  else
295  {
296  string ttypes = "";
297  for (const auto& t : terminals)
298  ttypes += t->name + ": " + t->otype + "\n";
299  std::ostringstream msg;
300  msg << "Error: make_tree couldn't find a terminal of "
301  << "type " << otype << ". terminal types:\n" << ttypes ;
302  THROW_RUNTIME_ERROR(msg.str());
303  }
304  }
305  else
306  {
307  // let fi be indices of functions whose output type matches otype and, if max_d==1,
308  // with no boolean inputs (assuming all input data is floating point)
309  vector<size_t> fi;
310  vector<float> fw; // function weights
311  bool fterms = in(term_types, 'f'); // are there floating terminals?
312  bool bterms = in(term_types, 'b'); // are there boolean terminals?
313  bool cterms = in(term_types, 'c'); // are there categorical terminals?
314  bool zterms = in(term_types, 'z'); // are there long terminals?
315  /* std::cout << "fterms: " << fterms << ", bterms: " << bterms */
316  /* << ",cterms: " << cterms */
317  /* << ",zterms: " << zterms << "\n"; */
318  for (size_t i = 0; i<functions.size(); ++i)
319  if (functions.at(i)->otype==otype &&
320  (max_d>1 || functions.at(i)->arity.at('f')==0 || fterms) &&
321  (max_d>1 || functions.at(i)->arity.at('b')==0 || bterms) &&
322  (max_d>1 || functions.at(i)->arity.at('c')==0 || cterms) &&
323  (max_d>1 || functions.at(i)->arity.at('z')==0 || zterms))
324  {
325  fi.push_back(i);
326  fw.push_back(op_weights.at(i));
327  }
328 
329  // if there are no valid functions, add a terminal
330  if (fi.size()==0)
331  {
332  make_tree(functions, terminals, 0, term_weights,
333  op_weights, otype, term_types);
334  return;
335 
336  }
337  if (fi.size() == 0)
338  THROW_RUNTIME_ERROR("The operator set specified "
339  "results in incomplete programs.");
340 
341  // append a random choice from fs
342  /* cout << "function choices: \n"; */
343  /* for (unsigned fis =0; fis < fi.size(); ++fis) */
344  /* cout << "(" << functions[fi[fis]]->name << "," << fw[fis] << ") ,"; */
345  /* cout << "\n"; */
346 
347  push_back(functions.at(r.random_choice(fi,fw))->rnd_clone());
348 
349  /* std::unique_ptr<Node> chosen(back()->clone()); */
350  map<char, unsigned> chosen_arity = back()->arity;
351  // recurse to fulfill the arity of the chosen function
352  vector<char> type_order = {'f','b','c','z'};
353  for (auto type : type_order)
354  {
355  for (size_t i = 0; i < chosen_arity.at(type); ++i)
356  {
357  make_tree(functions, terminals, max_d-1, term_weights,
358  op_weights, type, term_types);
359  }
360 
361  }
362  }
363 
364  /* std::cout << "finished program: ["; */
365  /* for (const auto& p : *(this) ) std::cout << p->name << " "; */
366 }
367 
368 void NodeVector::make_program(const NodeVector& functions,
369  const NodeVector& terminals, int max_d,
370  const vector<float>& term_weights,
371  const vector<float>& op_weights,
372  int dim, char otype,
373  vector<string> longitudinalMap,
374  const vector<char>& term_types)
375 {
376  for (int i = 0; i<dim; ++i) // build trees
377  {
378  make_tree(functions, terminals, max_d, term_weights, op_weights, otype,
379  term_types);
380  }
381 
382  // reverse program so that it is post-fix notation
383  std::reverse(begin(), end());
384  assert(is_valid_program(terminals.size(), longitudinalMap));
385  if (!is_valid_program(terminals.size(), longitudinalMap))
386  {
387  THROW_RUNTIME_ERROR("make_program produced invalid program");
388  }
389 }
390 
391 //serialization
392 void to_json(json& j, const NodeVector& nv)
393 {
394  /* vector<json> program; */
395  for (const auto& n : nv)
396  {
397  json k;
398  // cast different types of nodes
399  if (typeid(*n) == typeid(NodeSplit<float>))
400  Op::to_json(k, *dynamic_cast<NodeSplit<float>*>(n.get()));
401 
402  else if (typeid(*n) == typeid(NodeSplit<int>))
403  Op::to_json(k, *dynamic_cast<NodeSplit<int>*>(n.get()));
404 
405  else if (typeid(*n) == typeid(NodeFuzzySplit<float>))
406  Op::to_json(k, *dynamic_cast<NodeFuzzySplit<float>*>(n.get()));
407 
408  else if (typeid(*n) == typeid(NodeFuzzySplit<int>))
409  Op::to_json(k, *dynamic_cast<NodeFuzzySplit<int>*>(n.get()));
410 
411  else if (typeid(*n) == typeid(NodeFuzzyFixedSplit<float>))
412  Op::to_json(k, *dynamic_cast<NodeFuzzyFixedSplit<float>*>(n.get()));
413 
414  else if (typeid(*n) == typeid(NodeFuzzyFixedSplit<int>))
415  Op::to_json(k, *dynamic_cast<NodeFuzzyFixedSplit<int>*>(n.get()));
416 
417  else if (n->isNodeTrain())
418  Op::to_json(k, *dynamic_cast<NodeTrain*>(n.get()));
419 
420  else if (n->isNodeDx())
421  Op::to_json(k, *dynamic_cast<NodeDx*>(n.get()));
422 
423  else if (typeid(*n) == typeid(NodeVariable<float>))
424  Op::to_json(k, *dynamic_cast<NodeVariable<float>*>(n.get()));
425 
426  else if (typeid(*n) == typeid(NodeVariable<int>))
427  Op::to_json(k, *dynamic_cast<NodeVariable<int>*>(n.get()));
428 
429  else if (typeid(*n) == typeid(NodeVariable<bool>))
430  Op::to_json(k, *dynamic_cast<NodeVariable<bool>*>(n.get()));
431 
432  else if (typeid(*n) == typeid(NodeConstant))
433  Op::to_json(k, *dynamic_cast<NodeConstant*>(n.get()));
434 
435  else
436  Op::to_json(k, *n);
437 
438  j.push_back(k);
439  }
440 
441 }
442 void from_json(const json& j, NodeVector& nv)
443 {
444  for (const auto& k : j)
445  {
446  string node_name = k.at("name").get<string>();
447 
448  if (Op::NM.node_map.find(node_name) == Op::NM.node_map.end())
449  {
450  node_name = k.at("name").get<string>() + "_"
451  + to_string(k.at("otype").get<char>());
452  if (Op::NM.node_map.find(node_name) == Op::NM.node_map.end())
453  THROW_INVALID_ARGUMENT(node_name + " not found");
454  }
455 
456  auto n = NM.node_map[node_name]->clone();
457  // cast different types of nodes
458  if (typeid(*n) == typeid(NodeSplit<float>))
459  Op::from_json(k, *dynamic_cast<NodeSplit<float>*>(n.get()));
460 
461  else if (typeid(*n) == typeid(NodeSplit<int>))
462  Op::from_json(k, *dynamic_cast<NodeSplit<int>*>(n.get()));
463 
464  else if (typeid(*n) == typeid(NodeFuzzySplit<int>))
465  Op::from_json(k, *dynamic_cast<NodeFuzzySplit<int>*>(n.get()));
466 
467  else if (typeid(*n) == typeid(NodeFuzzySplit<float>))
468  Op::from_json(k, *dynamic_cast<NodeFuzzySplit<float>*>(n.get()));
469 
470  else if (typeid(*n) == typeid(NodeFuzzyFixedSplit<float>))
471  Op::from_json(k, *dynamic_cast<NodeFuzzyFixedSplit<float>*>(n.get()));
472 
473  else if (typeid(*n) == typeid(NodeFuzzyFixedSplit<int>))
474  Op::from_json(k, *dynamic_cast<NodeFuzzyFixedSplit<int>*>(n.get()));
475 
476  else if (n->isNodeTrain())
477  Op::from_json(k, *dynamic_cast<NodeTrain*>(n.get()));
478 
479  else if (n->isNodeDx())
480  Op::from_json(k, *dynamic_cast<NodeDx*>(n.get()));
481 
482  else if (typeid(*n) == typeid(NodeVariable<float>))
483  Op::from_json(k, *dynamic_cast<NodeVariable<float>*>(n.get()));
484 
485  else if (typeid(*n) == typeid(NodeVariable<int>))
486  Op::from_json(k, *dynamic_cast<NodeVariable<int>*>(n.get()));
487 
488  else if (typeid(*n) == typeid(NodeVariable<bool>))
489  Op::from_json(k, *dynamic_cast<NodeVariable<bool>*>(n.get()));
490 
491  else if (typeid(*n) == typeid(NodeConstant))
492  Op::from_json(k, *dynamic_cast<NodeConstant*>(n.get()));
493 
494  else
495  Op::from_json(k, *n);
496 
497  // check
498  json k2;
499  nv.push_back(n->clone());
500  /* json k2; */
501  /* Op::to_json(k2, *nv.back()); */
502  /* cout << "after from_json call: " << k2.dump() << endl; */
503 
504  }
505  json check;
506  to_json(check, nv);
507 }
508 
509 } // Pop
510 } // FT
data holding X, y, and Z data
Definition: data.h:42
std::vector< float > W
Definition: n_Dx.h:16
T random_choice(const vector< T > &v)
Definition: rnd.h:73
float rnd_flt(float min=0.0, float max=1.0)
Definition: rnd.cc:83
#define THROW_LENGTH_ERROR(err)
Definition: error.h:32
#define THROW_RUNTIME_ERROR(err)
Definition: error.h:30
#define THROW_INVALID_ARGUMENT(err)
Definition: error.h:31
namespace representing various operations on population individuals used in Feat
Definition: cuda_utils.h:24
static NodeMap NM
Definition: nodemap.h:93
void from_json(const json &j, NodeVector &nv)
Definition: nodevector.cc:442
void to_json(json &j, const NodeVector &nv)
Definition: nodevector.cc:392
bool in(const vector< T > v, const T &i)
check if element is in vector.
Definition: utils.h:47
static Rnd & r
Definition: rnd.h:135
std::string to_string(const T &value)
template function to convert objects to string for logging
Definition: utils.h:422
main Feat namespace
Definition: data.cc:13
int i
Definition: params.cc:552
contains various types of State actually used by feat
Definition: state.h:102
bool check(std::map< char, unsigned int > &arity)
checks if arity of node provided satisfies the node names in various string State
Definition: state.cc:19
an extension of a vector of unique pointers to nodes
Definition: nodevector.h:23
NodeVector & operator=(NodeVector const &other)
Definition: nodevector.cc:28
vector< size_t > roots() const
returns indices of root nodes
Definition: nodevector.cc:55
vector< Node * > get_data(int start=0, int end=0)
returns vector of raw pointers to nodes in [start,end], or all if both are zero
Definition: nodevector.cc:38
void make_program(const NodeVector &functions, const NodeVector &terminals, int max_d, const vector< float > &term_weights, const vector< float > &op_weights, int dim, char otype, vector< string > longitudinalMap, const vector< char > &term_types)
Definition: nodevector.cc:368
bool is_valid_program(unsigned num_features, vector< string > longitudinalMap)
Definition: nodevector.cc:202
void set_weights(vector< vector< float >> &weights)
Definition: nodevector.cc:150
void make_tree(const NodeVector &functions, const NodeVector &terminals, int max_d, const vector< float > &term_weights, const vector< float > &op_weights, char otype, const vector< char > &term_types)
Definition: nodevector.cc:243
vector< vector< float > > get_weights()
Definition: nodevector.cc:189
size_t subtree(size_t i, char otype='0', string indent="> ") const
Definition: nodevector.cc:80
std::map< std::string, Node * > node_map
Definition: nodemap.h:16