Feat C++ API
A feature engineering automation tool
n_split.cc
Go to the documentation of this file.
1 /* FEAT
2 copyright 2017 William La Cava
3 license: GNU/GPL v3
4 */
5 #include "n_split.h"
6 
7 namespace FT{
8 namespace Pop{
9 namespace Op{
10 
11  template <>
13  {
14  name = "split";
15  arity['f'] = 1;
16  otype = 'b';
17  complexity = 2;
18  threshold = 0;
19  train = false;
20 
21  }
22 
23  template <>
25  {
26  name = "split_c";
27  arity['c'] = 1;
28  otype = 'b';
29  complexity = 2;
30  threshold = 0;
31  train = false;
32  }
33 
34  #ifndef USE_CUDA
35  template <class T>
36  void NodeSplit<T>::evaluate(const Data& data, State& state)
37  {
38  ArrayXf x1;
39 
40  x1 = state.pop<T>().template cast<float>();
41 
42  if (!data.validation && !data.y.size()==0 && train)
43  set_threshold(x1,data.y, data.classification);
44 
45  if(arity['f'])
46  state.push<bool>(x1 < threshold);
47  else
48  state.push<bool>(x1 == threshold);
49  }
50  #else
51  template <class T>
52  void NodeSplit<T>::evaluate(const Data& data, State& state)
53  {
54  ArrayXf x1(state.N);
55 
56  if(arity['f'])
57  {
58  ArrayXf x(state.N);
59  state.copy_to_host(x.data(), (state.idx['f']-1)*state.N);
60  x1 = x.cast<float>();
61  }
62  else
63  {
64  ArrayXi x(state.N);
65  state.copy_to_host(x.data(), (state.idx['c']-1)*state.N);
66  x1 = x.cast<float>();
67  }
68 
69 
70  if (!data.validation && !data.y.size()==0 && train)
71  set_threshold(x1,data.y, data.classification);
72 
73  if(arity['f'])
74  GPU_Split(state.dev_f, state.dev_b, state.idx['f'],
75  state.idx[otype], state.N, threshold);
76  else
77  GPU_Split(state.dev_c, state.dev_b, state.idx['c'],
78  state.idx[otype], state.N, threshold);
79  }
80  #endif
81 
83  template <class T>
85  {
86  if(arity['f'])
87  state.push<bool>("(" + state.popStr<T>() + "<" +
88  to_string(threshold, 4) + ")");
89  else
90  state.push<bool>("(" + state.popStr<T>() + "==" +
91  to_string(threshold, 4) + ")");
92  }
93 
94  template <class T>
96  return new NodeSplit<T>(*this); };
97 
98  template <class T>
100  return new NodeSplit<T>(); };
101 
102  template <class T>
103  void NodeSplit<T>::set_threshold(ArrayXf& x, VectorXf& y,
104  bool classification)
105  {
106  /* cout << "setting threshold\n"; */
107  // for each unique value in x, calculate the reduction in the
108  // heuristic brought about by
109  // splitting between that value and the next.
110  // set threshold according to the biggest reduction.
111  vector<float> s = unique(x);
112  vector<float> unique_classes = unique(y);
113  vector<int> idx(x.size());
114  std::iota(idx.begin(),idx.end(), 0);
115  Map<ArrayXi> midx(idx.data(),idx.size());
116  float score = 0;
117  float best_score = 0;
118  /* cout << "s: " ; */
119  /* for (auto ss : s) cout << ss << " " ; cout << "\n"; */
120  /* cout << "x: " << x << "\n"; */
121  /* cout << "y: " << y << "\n"; */
122  /* cout << "threshold,score\n"; */
123 
124  for (unsigned i =0; i<s.size()-1; ++i)
125  {
126 
127  float val;
128  ArrayXi split_idx;
129 
130  if(arity['f'])
131  {
132  val = (s.at(i) + s.at(i+1)) / 2;
133  split_idx = (x < val).select(midx,-midx-1);
134  }
135  else
136  {
137  val = s.at(i);
138  split_idx = (x == val).select(midx,-midx-1);
139  }
140 
141  /* cout << "split val: " << val << "\n"; */
142 
143  // split data
144  vector<float> d1, d2;
145  for (unsigned j=0; j< split_idx.size(); ++j)
146  {
147  if (split_idx(j) <0)
148  d2.push_back(y(-1-split_idx(j)));
149  else
150  d1.push_back(y(split_idx(j)));
151  }
152  if (d1.empty() || d2.empty())
153  continue;
154 
155  Map<VectorXf> map_d1(d1.data(), d1.size());
156  Map<VectorXf> map_d2(d2.data(), d2.size());
157  /* cout << "d1: " << map_d1.transpose() << "\n"; */
158  /* cout << "d2: " << map_d2.transpose() << "\n"; */
159  score = gain(map_d1, map_d2, classification,
160  unique_classes);
161  /* cout << "score: " << score << "\n"; */
162  if (score < best_score || i == 0)
163  {
164  best_score = score;
165  threshold = val;
166  }
167  /* cout << val << "," << score << "\n"; */
168  }
169 
171  0 : std::isnan(threshold)?
172  0 : threshold;
173 
174  /* cout << "final threshold set to " << threshold */
175  /* << " with score " */
176  /* << best_score << "\n"; */
177  }
178 
179  template <class T>
180  float NodeSplit<T>::gain(const VectorXf& lsplit,
181  const VectorXf& rsplit,
182  bool classification, vector<float> unique_classes)
183  {
184  float lscore, rscore, score;
185  if (classification)
186  {
187  lscore = gini_impurity_index(lsplit, unique_classes);
188  rscore = gini_impurity_index(rsplit, unique_classes);
189  /* cout << "lscore: " << lscore << "\n"; */
190  /* cout << "rscore: " << rscore << "\n"; */
191  score = (lscore*float(lsplit.size()) +
192  rscore*float(rsplit.size()))
193  /(float(lsplit.size()) + float(rsplit.size()));
194  }
195  else
196  {
197  lscore = variance(lsplit.array())/float(lsplit.size());
198  rscore = variance(rsplit.array())/float(rsplit.size());
199  score = lscore + rscore;
200  }
201 
202  return score;
203  }
204 
205  template <class T>
206  float NodeSplit<T>::gini_impurity_index(const VectorXf& classes,
207  vector<float> uc)
208  {
209  VectorXf class_weights(uc.size());
210  for (auto c : uc){
211  class_weights(static_cast<Eigen::Index>(c)) = 0;
212  class_weights(static_cast<Eigen::Index>(c)) = float(
213  (classes.cast<int>().array() == int(c)).count()
214  )/classes.size();
215  /* cout << "class_weights for " << c << ": "
216  * << class_weights(c) << "\n"; */
217  }
218  /* float total_weight=class_weights.sum(); */
219  float gini = 1 - class_weights.dot(class_weights);
220 
221  return gini;
222  }
223 }
224 }
225 }
data holding X, y, and Z data
Definition: data.h:42
VectorXf & y
Definition: data.h:46
bool classification
Definition: data.h:48
bool validation
Definition: data.h:49
NodeSplit * rnd_clone_impl() const override
Definition: n_split.cc:99
NodeSplit * clone_impl() const override
Definition: n_split.cc:95
float gain(const VectorXf &lsplit, const VectorXf &rsplit, bool classification=false, vector< float > unique_classes=vector< float >())
returns the gain of a split
Definition: n_split.cc:180
float gini_impurity_index(const VectorXf &classes, vector< float > uc)
gini impurity of classes in classes
Definition: n_split.cc:206
void evaluate(const Data &data, State &state)
Evaluates the node and updates the state.
Definition: n_split.cc:36
void set_threshold(ArrayXf &x, VectorXf &y, bool classification)
Uses a heuristic to set a splitting threshold.
Definition: n_split.cc:103
void eval_eqn(State &state)
Evaluates the node symbolically.
Definition: n_split.cc:84
void GPU_Split(float *xf, bool *xb, size_t idxf, size_t idxb, size_t N, float threshold)
ArrayXb isinf(const ArrayXf &x)
returns true for elements of x that are infinite
Definition: utils.cc:217
ArrayXb isnan(const ArrayXf &x)
returns true for elements of x that are NaN
Definition: utils.cc:226
vector< T > unique(vector< T > w)
returns unique elements in vector
Definition: utils.h:336
std::string to_string(const T &value)
template function to convert objects to string for logging
Definition: utils.h:422
float variance(const ArrayXf &v, float mean)
calculate variance when mean provided
Definition: utils.cc:127
main Feat namespace
Definition: data.cc:13
int i
Definition: params.cc:552
contains various types of State actually used by feat
Definition: state.h:102
string popStr()
Definition: state.h:143
Eigen::Array< T, Eigen::Dynamic, 1 > pop()
Definition: state.h:128
void push(Eigen::Array< T, Eigen::Dynamic, 1 > value)
Definition: state.h:123