Feat C++ API
A feature engineering automation tool
n_fuzzy_fixed_split.cc
Go to the documentation of this file.
1 /* FEAT
2 copyright 2017 William La Cava
3 license: GNU/GPL v3
4 */
5 #include "n_fuzzy_fixed_split.h"
6 
7 namespace FT{
8 
9  namespace Pop{
10  namespace Op{
11 
// Explicit specialization (`template <>`) that initialises the variant of
// this node taking a single 'f' argument: it is named "fuzzy_fixed_split",
// produces output type 'b', has complexity 2, and starts with threshold 0,
// no threshold fitted yet (threshold_set == false) and train == false.
// NOTE(review): the declarator that belongs between `template <>` and `{`
// (listing line 13) is absent from this view — presumably the class
// constructor; restore it from the original source before compiling.
12  template <>
14  {
15  name = "fuzzy_fixed_split";
16  arity['f'] = 1;
17  otype = 'b';
18  complexity = 2;
19  threshold = 0;
20  threshold_set = false;
21  train = false;
22 
23  }
24 
// Explicit specialization (`template <>`) that initialises the variant of
// this node taking a single 'c' argument: it is named
// "fuzzy_fixed_split_c", produces output type 'b', has complexity 2, and
// starts with threshold 0, threshold_set == false and train == false.
// NOTE(review): the declarator that belongs between `template <>` and `{`
// (listing line 26) is absent from this view — presumably the class
// constructor; restore it from the original source before compiling.
25  template <>
27  {
28  name = "fuzzy_fixed_split_c";
29  arity['c'] = 1;
30  otype = 'b';
31  complexity = 2;
32  threshold = 0;
33  threshold_set = false;
34  train = false;
35  }
36 
37  #ifndef USE_CUDA
38  template <class T>
39  void NodeFuzzyFixedSplit<T>::evaluate(const Data& data, State& state)
40  {
41  ArrayXf x1;
42 
43  x1 = state.pop<T>().template cast<float>();
44 
45  if (!data.validation
46  && !data.y.size()==0
47  && train
48  && !threshold_set)
49  {
50  set_threshold(x1,data.y, data.classification);
51  threshold_set = true;
52  }
53 
54  if(arity['f'])
55  state.push<bool>(x1 < threshold);
56  else
57  state.push<bool>(x1 == threshold);
58  }
59  #else
60  template <class T>
61  void NodeFuzzyFixedSplit<T>::evaluate(const Data& data, State& state)
62  {
63  ArrayXf x1(state.N);
64 
65  if(arity['f'])
66  {
67  ArrayXf x(state.N);
68  state.copy_to_host(x.data(), (state.idx['f']-1)*state.N);
69  x1 = x.cast<float>();
70  }
71  else
72  {
73  ArrayXi x(state.N);
74  state.copy_to_host(x.data(), (state.idx['c']-1)*state.N);
75  x1 = x.cast<float>();
76  }
77 
78 
79  if (!data.validation && !data.y.size()==0 && train)
80  set_threshold(x1,data.y, data.classification);
81 
82  if(arity['f'])
83  GPU_FuzzyFixedSplit(state.dev_f, state.dev_b, state.idx['f'],
84  state.idx[otype], state.N, threshold);
85  else
86  GPU_FuzzyFixedSplit(state.dev_c, state.dev_b, state.idx['c'],
87  state.idx[otype], state.N, threshold);
88  }
89  #endif
90 
// Symbolic counterpart of evaluate(): pops the operand's string form from
// `state` and pushes "(<operand><threshold)" when arity['f'] is non-zero,
// or "(<operand>==threshold)" otherwise. The threshold is rendered with
// to_string(threshold, 4) — presumably 4 digits of precision; confirm
// against the to_string helper.
// NOTE(review): the declarator (listing line 93) is absent from this view;
// the member index of this listing names it `void eval_eqn(State &state)`.
92  template <class T>
94  {
95  if(arity['f'])
96  state.push<bool>("(" + state.popStr<T>() + "<" +
97  to_string(threshold, 4) + ")");
98  else
99  state.push<bool>("(" + state.popStr<T>() + "==" +
100  to_string(threshold, 4) + ")");
101  }
102 
// Returns a new NodeFuzzyFixedSplit<T> copy-constructed from *this, as the
// raw pointer produced by `new`.
// NOTE(review): the declarator (listing line 104) is absent from this view;
// the member index of this listing names it `clone_impl() const override`.
103  template <class T>
105  return new NodeFuzzyFixedSplit<T>(*this); };
106 
// Returns a new, default-constructed NodeFuzzyFixedSplit<T> (nothing is
// copied from *this), as the raw pointer produced by `new`.
// NOTE(review): the declarator (listing line 108) is absent from this view;
// the member index of this listing names it `rnd_clone_impl() const override`.
107  template <class T>
109  return new NodeFuzzyFixedSplit<T>(); };
110 
111  template <class T>
112  void NodeFuzzyFixedSplit<T>::set_threshold(ArrayXf& x, VectorXf& y,
113  bool classification)
114  {
115  /* cout << "setting threshold\n"; */
116  // for each unique value in x, calculate the reduction in the
117  // heuristic brought about by
118  // splitting between that value and the next.
119  // set threshold according to the biggest reduction.
120  vector<float> s;
121  for (unsigned i = 0; i < x.size(); ++i)
122  s.push_back(x(i)); //(x.data(),x.size());
123 
124  vector<float> unique_classes = unique(y);
125  vector<int> idx(s.size());
126  std::iota(idx.begin(),idx.end(), 0);
127  Map<ArrayXi> midx(idx.data(),idx.size());
128  s = unique(s);
129  /* cout << "unique values of x: "; */
130  /* for (auto si : s) cout << si << ", "; */
131  /* cout << "\n"; */
132  if (s.size() == 1)
133  {
134  // if there is only one value, just set the threshold to
135  // that
136  threshold = s.at(0);
137  return;
138  }
139  float score = 0;
140  float best_score = 0;
141  vector<float> neg_scores; // holds all scores for sampling
142  vector<float> thresholds; // holds all scores for sampling
143  /* cout << "s: " ; */
144  /* for (auto ss : s) cout << ss << " " ; cout << "\n"; */
145  /* cout << "x: " << x << "\n"; */
146  /* cout << "y: " << y << "\n"; */
147  /* cout << "threshold,score\n"; */
148 
149  for (unsigned i =0; i<s.size()-1; ++i)
150  {
151 
152  float val;
153  ArrayXi split_idx;
154 
155  if(arity['f'])
156  {
157  val = (s.at(i) + s.at(i+1)) / 2;
158  /* cout << "val: " << val << "\n"; */
159  split_idx = (x < val).select(midx,-midx-1);
160  }
161  else
162  {
163  val = s.at(i);
164  split_idx = (x == val).select(midx,-midx-1);
165  }
166  // split data
167  vector<float> d1, d2;
168  for (unsigned j=0; j< split_idx.size(); ++j)
169  {
170  if (split_idx(j) <0)
171  d2.push_back(y(-1-split_idx(j)));
172  else
173  d1.push_back(y(split_idx(j)));
174  }
175  if (d1.empty() || d2.empty())
176  {
177  /* cout << "d1 size: " << d1.size() */
178  /* << "d2 size: " << d2.size() << "; exiting\n"; */
179  continue;
180  }
181 
182  Map<VectorXf> map_d1(d1.data(), d1.size());
183  Map<VectorXf> map_d2(d2.data(), d2.size());
184  /* cout << "d1: " << map_d1.transpose() << "\n"; */
185  /* cout << "d2: " << map_d2.transpose() << "\n"; */
186  score = gain(map_d1, map_d2, classification,
187  unique_classes);
188 
189  neg_scores.push_back(-score);
190  thresholds.push_back(val);
191  /* cout << val << "," << score << "\n"; */
192  if (score < best_score || i == 0)
193  {
194  best_score = score;
195  }
196  }
197  if (thresholds.empty())
198  {
199  /* cout << "threshold set to zero\n"; */
200  threshold = 0;
201  return;
202  }
203  else
204  {
205  // choose a random threshold weighted by the scores
206  threshold = r.random_choice(thresholds, neg_scores);
207  int index = distance(thresholds.begin(),
208  find(thresholds.begin(), thresholds.end(),
209  threshold));
210  /* cout << "index: " << index << "\n"; */
211  /* cout << "final threshold set to " << threshold */
212  /* << " with score " << -neg_scores.at(index)<< "\n"; */
213  }
214  }
215 
216  template <class T>
217  float NodeFuzzyFixedSplit<T>::gain(const VectorXf& lsplit,
218  const VectorXf& rsplit,
219  bool classification, vector<float> unique_classes)
220  {
221  float lscore, rscore, score;
222  if (classification)
223  {
224  lscore = gini_impurity_index(lsplit, unique_classes);
225  rscore = gini_impurity_index(rsplit, unique_classes);
226  /* cout << "lscore: " << lscore << "\n"; */
227  /* cout << "rscore: " << rscore << "\n"; */
228  score = (lscore*float(lsplit.size()) +
229  rscore*float(rsplit.size()))
230  /(float(lsplit.size()) + float(rsplit.size()));
231  }
232  else
233  {
234  lscore = variance(lsplit.array())/float(lsplit.size());
235  rscore = variance(rsplit.array())/float(rsplit.size());
236  score = lscore + rscore;
237  }
238 
239  return score;
240  }
241 
242  template <class T>
244  const VectorXf& classes, vector<float> uc)
245  {
246  VectorXf class_weights(uc.size());
247  for (auto c : uc){
248  class_weights(static_cast<Eigen::Index>(c)) = 0;
249  class_weights(static_cast<Eigen::Index>(c)) = float(
250  (classes.cast<int>().array() == int(c)).count()
251  )/classes.size();
252  /* cout << "class_weights for " << c << ": "
253  * << class_weights(c) << "\n"; */
254  }
255  /* float total_weight=class_weights.sum(); */
256  float gini = 1 - class_weights.dot(class_weights);
257 
258  return gini;
259  }
260  }
261  }
262 }
data holding X, y, and Z data
Definition: data.h:42
VectorXf & y
Definition: data.h:46
bool classification
Definition: data.h:48
bool validation
Definition: data.h:49
void eval_eqn(State &state)
Evaluates the node symbolically.
void set_threshold(ArrayXf &x, VectorXf &y, bool classification)
Uses a heuristic to set a splitting threshold.
float gain(const VectorXf &lsplit, const VectorXf &rsplit, bool classification=false, vector< float > unique_classes=vector< float >())
returns the gain of a split
void evaluate(const Data &data, State &state)
Evaluates the node and updates the state.
NodeFuzzyFixedSplit * clone_impl() const override
NodeFuzzyFixedSplit * rnd_clone_impl() const override
float gini_impurity_index(const VectorXf &classes, vector< float > uc)
Gini impurity of the class labels in `classes`
T random_choice(const vector< T > &v)
Definition: rnd.h:73
vector< T > unique(vector< T > w)
returns unique elements in vector
Definition: utils.h:336
static Rnd & r
Definition: rnd.h:135
std::string to_string(const T &value)
template function to convert objects to string for logging
Definition: utils.h:422
float variance(const ArrayXf &v, float mean)
calculate variance when mean provided
Definition: utils.cc:127
main Feat namespace
Definition: data.cc:13
int i
Definition: params.cc:552
contains various types of State actually used by feat
Definition: state.h:102
string popStr()
Definition: state.h:143
Eigen::Array< T, Eigen::Dynamic, 1 > pop()
Definition: state.h:128
void push(Eigen::Array< T, Eigen::Dynamic, 1 > value)
Definition: state.h:123