Feat C++ API
A feature engineering automation tool
n_fuzzy_split.cc
Go to the documentation of this file.
1 /* FEAT
2 copyright 2017 William La Cava
3 license: GNU/GPL v3
4 */
5 #include "n_fuzzy_split.h"
6 
7 namespace FT{
8 
9  namespace Pop{
10  namespace Op{
11 
12  template <>
14  {
15  name = "fuzzy_split";
16  arity['f'] = 1;
17  otype = 'b';
18  complexity = 2;
19  threshold = 0;
20  train = false;
21  }
22 
23  template <>
25  {
26  name = "fuzzy_split_c";
27  arity['c'] = 1;
28  otype = 'b';
29  complexity = 2;
30  threshold = 0;
31  train = false;
32  }
33 
34  #ifndef USE_CUDA
35  template <class T>
36  void NodeFuzzySplit<T>::evaluate(const Data& data, State& state)
37  {
38  ArrayXf x1;
39 
40  x1 = state.pop<T>().template cast<float>();
41 
42  if (!data.validation && !data.y.size()==0 && train)
43  set_threshold(x1,data.y, data.classification);
44 
45  if(arity['f'])
46  state.push<bool>(x1 < threshold);
47  else
48  state.push<bool>(x1 == threshold);
49  }
50  #else
51  template <class T>
52  void NodeFuzzySplit<T>::evaluate(const Data& data, State& state)
53  {
54  ArrayXf x1(state.N);
55 
56  if(arity['f'])
57  {
58  ArrayXf x(state.N);
59  state.copy_to_host(x.data(), (state.idx['f']-1)*state.N);
60  x1 = x.cast<float>();
61  }
62  else
63  {
64  ArrayXi x(state.N);
65  state.copy_to_host(x.data(), (state.idx['c']-1)*state.N);
66  x1 = x.cast<float>();
67  }
68 
69 
70  if (!data.validation && !data.y.size()==0 && train)
71  set_threshold(x1,data.y, data.classification);
72 
73  if(arity['f'])
74  GPU_FuzzySplit(state.dev_f, state.dev_b, state.idx['f'],
75  state.idx[otype], state.N, threshold);
76  else
77  GPU_FuzzySplit(state.dev_c, state.dev_b, state.idx['c'],
78  state.idx[otype], state.N, threshold);
79  }
80  #endif
81 
83  template <class T>
85  {
86  if(arity['f'])
87  state.push<bool>("(" + state.popStr<T>() + "<" +
88  to_string(threshold, 4) + ")");
89  else
90  state.push<bool>("(" + state.popStr<T>() + "==" +
91  to_string(threshold, 4) + ")");
92  }
93 
94  template <class T>
96  return new NodeFuzzySplit<T>(*this); };
97 
98  template <class T>
100  return new NodeFuzzySplit<T>(); };
101 
102  template <class T>
103  void NodeFuzzySplit<T>::set_threshold(ArrayXf& x, VectorXf& y,
104  bool classification)
105  {
106  /* cout << "setting threshold\n"; */
107  // for each unique value in x, calculate the reduction in the
108  // heuristic brought about by
109  // splitting between that value and the next.
110  // set threshold according to the biggest reduction.
111  vector<float> s;
112  for (unsigned i = 0; i < x.size(); ++i)
113  s.push_back(x(i)); //(x.data(),x.size());
114 
115  vector<float> unique_classes = unique(y);
116  vector<int> idx(s.size());
117  std::iota(idx.begin(),idx.end(), 0);
118  Map<ArrayXi> midx(idx.data(),idx.size());
119  s = unique(s);
120  if (s.size() == 1)
121  {
122  // if there is only one value, just set the threshold to
123  // that
124  threshold = s.at(0);
125  return;
126  }
127  float score = 0;
128  float best_score = 0;
129  vector<float> neg_scores; // holds all scores for sampling
130  vector<float> thresholds; // holds all scores for sampling
131  /* cout << "s: " ; */
132  /* for (auto ss : s) cout << ss << " " ; cout << "\n"; */
133  /* cout << "x: " << x << "\n"; */
134  /* cout << "y: " << y << "\n"; */
135  /* cout << "threshold,score\n"; */
136 
137  for (unsigned i =0; i<s.size()-1; ++i)
138  {
139 
140  float val;
141  ArrayXi split_idx;
142 
143  if(arity['f'])
144  {
145  val = (s.at(i) + s.at(i+1)) / 2;
146  split_idx = (x < val).select(midx,-midx-1);
147  }
148  else
149  {
150  val = s.at(i);
151  split_idx = (x == val).select(midx,-midx-1);
152  }
153  // split data
154  vector<float> d1, d2;
155  for (unsigned j=0; j< split_idx.size(); ++j)
156  {
157  if (split_idx(j) <0)
158  d2.push_back(y(-1-split_idx(j)));
159  else
160  d1.push_back(y(split_idx(j)));
161  }
162  if (d1.empty() || d2.empty())
163  continue;
164 
165  Map<VectorXf> map_d1(d1.data(), d1.size());
166  Map<VectorXf> map_d2(d2.data(), d2.size());
167  /* cout << "d1: " << map_d1.transpose() << "\n"; */
168  /* cout << "d2: " << map_d2.transpose() << "\n"; */
169  score = gain(map_d1, map_d2, classification,
170  unique_classes);
171 
172  neg_scores.push_back(-score);
173  thresholds.push_back(val);
174  /* cout << "score: " << score << "\n"; */
175  /* cout << val << "," << score << "\n"; */
176  if (score < best_score || i == 0)
177  {
178  best_score = score;
179  }
180  }
181  if (thresholds.empty())
182  {
183  /* cout << "threshold set to zero\n"; */
184  threshold = 0;
185  return;
186  }
187  else
188  {
189  // choose a random threshold weighted by the scores
190  threshold = r.random_choice(thresholds, neg_scores);
191  int index = distance(thresholds.begin(),
192  find(thresholds.begin(), thresholds.end(),
193  threshold));
194  /* cout << "index: " << index << "\n"; */
195  /* cout << "final threshold set to " << threshold */
196  /* << " with score " << -neg_scores.at(index)<< "\n"; */
197  }
198  }
199 
200  template <class T>
201  float NodeFuzzySplit<T>::gain(const VectorXf& lsplit,
202  const VectorXf& rsplit,
203  bool classification, vector<float> unique_classes)
204  {
205  float lscore, rscore, score;
206  if (classification)
207  {
208  lscore = gini_impurity_index(lsplit, unique_classes);
209  rscore = gini_impurity_index(rsplit, unique_classes);
210  /* cout << "lscore: " << lscore << "\n"; */
211  /* cout << "rscore: " << rscore << "\n"; */
212  score = (lscore*float(lsplit.size()) +
213  rscore*float(rsplit.size()))
214  /(float(lsplit.size()) + float(rsplit.size()));
215  }
216  else
217  {
218  lscore = variance(lsplit.array())/float(lsplit.size());
219  rscore = variance(rsplit.array())/float(rsplit.size());
220  score = lscore + rscore;
221  }
222 
223  return score;
224  }
225 
226  template <class T>
228  const VectorXf& classes, vector<float> uc)
229  {
230  VectorXf class_weights(uc.size());
231  for (auto c : uc){
232  class_weights(static_cast<Eigen::Index>(c)) = 0;
233  class_weights(static_cast<Eigen::Index>(c)) = float(
234  (classes.cast<int>().array() == int(c)).count()
235  )/classes.size();
236  /* cout << "class_weights for " << c << ": "
237  * << class_weights(c) << "\n"; */
238  }
239  /* float total_weight=class_weights.sum(); */
240  float gini = 1 - class_weights.dot(class_weights);
241 
242  return gini;
243  }
244  }
245  }
246 }
data holding X, y, and Z data
Definition: data.h:42
VectorXf & y
Definition: data.h:46
bool classification
Definition: data.h:48
bool validation
Definition: data.h:49
void eval_eqn(State &state)
Evaluates the node symbolically.
void evaluate(const Data &data, State &state)
Evaluates the node and updates the state.
NodeFuzzySplit * clone_impl() const override
void set_threshold(ArrayXf &x, VectorXf &y, bool classification)
Uses a heuristic to set a splitting threshold.
float gini_impurity_index(const VectorXf &classes, vector< float > uc)
Gini impurity of the class labels in `classes`
NodeFuzzySplit * rnd_clone_impl() const override
float gain(const VectorXf &lsplit, const VectorXf &rsplit, bool classification=false, vector< float > unique_classes=vector< float >())
returns the gain of a split
T random_choice(const vector< T > &v)
Definition: rnd.h:73
vector< T > unique(vector< T > w)
returns unique elements in vector
Definition: utils.h:336
static Rnd & r
Definition: rnd.h:135
std::string to_string(const T &value)
template function to convert objects to string for logging
Definition: utils.h:422
float variance(const ArrayXf &v, float mean)
calculate variance when mean provided
Definition: utils.cc:127
main Feat namespace
Definition: data.cc:13
int i
Definition: params.cc:552
contains various types of State actually used by feat
Definition: state.h:102
string popStr()
Definition: state.h:143
Eigen::Array< T, Eigen::Dynamic, 1 > pop()
Definition: state.h:128
void push(Eigen::Array< T, Eigen::Dynamic, 1 > value)
Definition: state.h:123