Feat C++ API
A feature engineering automation tool
data.cc
Go to the documentation of this file.
1 /* FEAT
2 copyright 2017 William La Cava
3 license: GNU/GPL v3
4 */
5 
6 #include "data.h"
7 #include "../util/rnd.h"
8 #include "../util/logger.h"
9 
10 //#include "node/node.h"
11 //external includes
12 
13 namespace FT{
14 
15  using namespace Util;
16 
17  namespace Dat{
18 
19  Data::Data(MatrixXf& X, VectorXf& y, LongData& Z, bool c,
20  vector<bool> protect):
21  X(X), y(y), Z(Z), classification(c) , protect(protect)
22  {
23  validation=false;
25  if (X.size() != 0)
27  }
28 
30  {
31  this->cases.resize(0);
33  // store levels of protected attributes in X
34  if (!protect.empty())
35  {
36  logger.log("storing protected attributes...",2);
37  for (int i = 0; i < protect.size(); ++i)
38  {
39  if (protect.at(i))
40  {
41  protect_levels[i] = unique(VectorXf(X.row(i)));
42  protected_groups.push_back(i);
43  group_intersections += protect_levels.at(i).size();
44  }
45  }
46  for (auto pl : protect_levels)
47  {
48  int group = pl.first;
49  logger.log("\tfeature " + to_string( group) + ":"
50  + to_string(pl.second.size()) + " values; ",3);
51  }
52  // if there aren't that many group interactions, we might as
53  // well enumerate them to save time during execution.
54  if (group_intersections < 100)
55  {
56  logger.log("storing group intersections...",3);
57  for (auto pl : protect_levels)
58  {
59  int group = pl.first;
60  for (auto level : pl.second)
61  {
62  ArrayXb x = (X.row(group).array() == level);
63  this->cases.push_back(x);
64  /* cout << "new case with : " << x.count() */
65  /* << "samples\n"; */
66  }
67 
68  }
69  logger.log("stored " + to_string(this->cases.size())
70  +" cases",3);
71  }
72  /* else */
73  /* cout << "there are " << group_intersections */
74  /* << " group intersections, so not storing\n"; */
75  }
76  }
78 
79  void Data::get_batch(Data &db, int batch_size) const
80  {
81 
82  batch_size = std::min(batch_size,int(y.size()));
83  if (batch_size < 1)
84  WARN("batch_size is set to "
85  + to_string(batch_size) + " when getting batch");
86 
87  vector<size_t> idx(y.size());
88  std::iota(idx.begin(), idx.end(), 0);
89  // r.shuffle(idx.begin(), idx.end());
90  db.X.resize(X.rows(),batch_size);
91  db.y.resize(batch_size);
92  for (const auto& val: Z )
93  {
94  db.Z[val.first].first.resize(batch_size);
95  db.Z[val.first].second.resize(batch_size);
96  }
97  for (unsigned i = 0; i<batch_size; ++i)
98  {
99 
100  db.X.col(i) = X.col(idx.at(i));
101  db.y(i) = y(idx.at(i));
102 
103  for (const auto& val: Z )
104  {
105  db.Z.at(val.first).first.at(i) = \
106  Z.at(val.first).first.at(idx.at(i));
107  db.Z.at(val.first).second.at(i) = \
108  Z.at(val.first).second.at(idx.at(i));
109  }
110  }
112  }
113 
115  {
116  oCreated = false;
117  tCreated = false;
118  vCreated = false;
119  }
120 
121  DataRef::DataRef(MatrixXf& X, VectorXf& y,
122  LongData& Z, bool c, vector<bool> protect)
123  {
124  this->init(X, y, Z, c, protect);
125  }
126 
127  void DataRef::init(MatrixXf& X, VectorXf& y,
128  LongData& Z, bool c, vector<bool> protect)
129  {
130  o = new Data(X, y, Z, c, protect);
131  oCreated = true;
132 
133  t = new Data(X_t, y_t, Z_t, c, protect);
134  tCreated = true;
135 
136  v = new Data(X_v, y_v, Z_v, c, protect);
137  vCreated = true;
138 
139  classification = c;
140 
141  // split data into training and test sets
142  //train_test_split(params.shuffle, params.split);
143  }
144 
146  {
147  if(o != NULL && oCreated)
148  {
149  delete(o);
150  o = NULL;
151  }
152 
153  if(t != NULL && tCreated)
154  {
155  delete(t);
156  t = NULL;
157  }
158 
159  if(v != NULL && vCreated)
160  {
161  delete(v);
162  v = NULL;
163  }
164  }
165 
166  void DataRef::setOriginalData(MatrixXf& X, VectorXf& y, LongData& Z,
167  bool c, vector<bool> protect)
168  {
169  o = new Data(X, y, Z, c, protect);
170  oCreated = true;
171 
172  t = new Data(X_t, y_t, Z_t, c, protect);
173  tCreated = true;
174 
175  v = new Data(X_v, y_v, Z_v, c, protect);
176  vCreated = true;
177 
178  classification = c;
179  }
180 
182  {
183  o = d;
184  oCreated = false;
185 
186  t = new Data(X_t, y_t, Z_t, d->classification, d->protect);
187  tCreated = true;
188 
189  v = new Data(X_v, y_v, Z_v, d->classification, d->protect);
190  vCreated = true;
191 
193  }
194 
195  void DataRef::setTrainingData(MatrixXf& X_t, VectorXf& y_t,
196  LongData& Z_t,
197  bool c, vector<bool> protect)
198  {
199  t = new Data(X_t, y_t, Z_t, c, protect);
200  tCreated = true;
201 
202  classification = c;
203  }
204 
205  void DataRef::setTrainingData(Data *d, bool toDelete)
206  {
207  t = d;
208  if(!toDelete)
209  tCreated = false;
210  else
211  tCreated = true;
212  }
213 
214  void DataRef::setValidationData(MatrixXf& X_v, VectorXf& y_v,
215  LongData& Z_v, bool c,
216  vector<bool> protect)
217  {
218  v = new Data(X_v, y_v, Z_v, c, protect);
219  vCreated = true;
220  }
221 
223  {
224  v = d;
225  vCreated = false;
226  }
227 
229  {
230  Eigen::PermutationMatrix<Dynamic,Dynamic> perm(o->X.cols());
231  perm.setIdentity();
232  r.shuffle(perm.indices().data(),
233  perm.indices().data()+perm.indices().size());
234  /* cout << "X before shuffle: \n"; */
235  /* cout << o->X.transpose() << "\n"; */
236  o->X = o->X * perm; // shuffles columns of X
237 
238  /* cout << "X after shuffle: \n"; */
239  /* cout << o->X.transpose() << "\n"; */
240  // shuffle y too
241  o->y = (o->y.transpose() * perm).transpose() ;
242 
243  if(o->Z.size() > 0)
244  {
245  std::vector<int> zidx(o->y.size());
246  // zidx maps the perm_indices values to their indices,
247  // i.e. the inverse transform
248  for (unsigned i = 0; i < perm.indices().size(); ++i)
249  zidx.at(perm.indices()(i)) = i;
250  /* cout << "zidx :\n"; */
251  /* for (const auto& zi : zidx) */
252  /* cout << zi << "," ; */
253  /* cout << "\n"; */
254  for(auto &val : o->Z)
255  {
256  /* cout << "unshuffled " << val.first << ": \n"; */
257  /* for (unsigned i = 0; i < val.second.first.size(); ++i) */
258  /* { */
259  /* cout << val.second.first.at(i).transpose() << "\n"; */
260  /* } */
261  reorder_longitudinal(val.second.first, zidx);
262  reorder_longitudinal(val.second.second, zidx);
263  /* cout << "shuffled " << val.first << ": \n"; */
264  /* for (unsigned i = 0; i < val.second.first.size(); ++i) */
265  /* { */
266  /* cout << val.second.first.at(i).transpose() << "\n"; */
267  /* } */
268  }
269  }
270  }
271 
272  void DataRef::split_stratified(float split)
273  {
274  logger.log("Stratify split called with initial data size as "
275  + to_string(o->X.cols()), 3);
276 
277  std::map<float, vector<int>> label_indices;
278 
279  //getting indices for all labels
280  for(int x = 0; x < o->y.size(); x++)
281  label_indices[o->y(x)].push_back(x);
282 
283  /* for (const auto& li : label_indices){ */
284  /* cout << "label " << li.first << ":\t"; */
285  /* for (const auto& val : li.second){ */
286  /* cout << val << ", "; */
287  /* } */
288  /* cout << endl; */
289  /* } */
290  std::map<float, vector<int>>::iterator it = label_indices.begin();
291 
292  vector<int> t_indices;
293  vector<int> v_indices;
294 
295  int t_size;
296  int x;
297 
298  for(; it != label_indices.end(); it++)
299  {
300  t_size = ceil(it->second.size()*split);
301 
302  for(x = 0; x < t_size; x++)
303  t_indices.push_back(it->second.at(x));
304 
305  for(; x < it->second.size(); x++)
306  v_indices.push_back(it->second.at(x));
307 
308  logger.log("Label is " + to_string(it->first), 3, "\t");
309  logger.log("Total size = " + to_string(it->second.size()),
310  3, "\t");
311  logger.log("training_size = " + to_string(t_size), 3, "\t");
312  logger.log("verification size = "
313  + to_string((it->second.size() - t_size)), 3, "\n");
314 
315  }
316 
317  X_t.resize(o->X.rows(), t_indices.size());
318  X_v.resize(o->X.rows(), v_indices.size());
319  y_t.resize(t_indices.size());
320  y_v.resize(v_indices.size());
321 
322  sort(t_indices.begin(), t_indices.end());
323 
324  for(int x = 0; x < t_indices.size(); x++)
325  {
326  t->X.col(x) = o->X.col(t_indices.at(x));
327  t->y(x) = o->y(t_indices.at(x));
328 
329  if(o->Z.size() > 0)
330  {
331  for(auto const &val : o->Z)
332  {
333  t->Z[val.first].first.push_back(
334  val.second.first[t_indices.at(x)]);
335  t->Z[val.first].second.push_back(
336  val.second.second[t_indices.at(x)]);
337  }
338  }
339  }
340 
341  sort(v_indices.begin(), v_indices.end());
342 
343  for(int x = 0; x < v_indices.size(); x++)
344  {
345  v->X.col(x) = o->X.col(v_indices.at(x));
346  v->y(x) = o->y(v_indices.at(x));
347 
348  if(o->Z.size() > 0)
349  {
350  for(auto const &val : o->Z)
351  {
352  v->Z[val.first].first.push_back(
353  val.second.first[t_indices.at(x)]);
354  v->Z[val.first].second.push_back(
355  val.second.second[t_indices.at(x)]);
356  }
357  }
358  }
359 
360  }
361 
362  void DataRef::train_test_split(bool shuffle, float split)
363  {
364  /* @param X: n_features x n_samples matrix of training data
365  * @param y: n_samples vector of training labels
366  * @param shuffle: whether or not to shuffle X and y
367  * @param[out] X_t, X_v, y_t, y_v: training and validation matrices
368  */
369 
370  if (shuffle) // generate shuffle index for the split
371  shuffle_data();
372 
373  if(classification)
374  split_stratified(split);
375  else
376  {
377  int train_size = min(int(o->X.cols()*split),
378  int(o->X.cols()-1));
379  int val_size = max(int(o->X.cols()*(1-split)), 1);
380  // resize training and test sets
381  X_t.resize(o->X.rows(),train_size);
382  X_v.resize(o->X.rows(),val_size);
383  y_t.resize(train_size);
384  y_v.resize(val_size);
385 
386  // map training and test sets
387  t->X = MatrixXf::Map(o->X.data(),t->X.rows(),
388  t->X.cols());
389  v->X = MatrixXf::Map(o->X.data()+t->X.rows()*t->X.cols(),
390  v->X.rows(),v->X.cols());
391 
392  t->y = VectorXf::Map(o->y.data(),t->y.size());
393  v->y = VectorXf::Map(o->y.data()+t->y.size(),v->y.size());
394  if(o->Z.size() > 0)
395  split_longitudinal(o->Z, t->Z, v->Z, split);
396  }
399  }
400 
402  LongData &Z_v, float split)
403  {
404 
405  int size;
406  for ( const auto val: Z )
407  {
408  size = Z.at(val.first).first.size();
409  break;
410  }
411 
412  int testSize = int(size*split);
413  int validateSize = int(size*(1-split));
414 
415  for ( const auto &val: Z )
416  {
417  vector<ArrayXf> _Z_t_v, _Z_t_t, _Z_v_v, _Z_v_t;
418  _Z_t_v.assign(Z[val.first].first.begin(),
419  Z[val.first].first.begin()+testSize);
420  _Z_t_t.assign(Z[val.first].second.begin(),
421  Z[val.first].second.begin()+testSize);
422  _Z_v_v.assign(Z[val.first].first.begin()+testSize,
423  Z[val.first].first.begin()+testSize+validateSize);
424  _Z_v_t.assign(Z[val.first].second.begin()+testSize,
425  Z[val.first].second.begin()+testSize+validateSize);
426 
427  Z_t[val.first] = make_pair(_Z_t_v, _Z_t_t);
428  Z_v[val.first] = make_pair(_Z_v_v, _Z_v_t);
429  }
430  }
431 
432  void DataRef::reorder_longitudinal(vector<ArrayXf> &v,
433  vector<int> const &order )
434  {
435  for ( int s = 1, d; s < order.size(); ++ s ) {
436  for ( d = order.at(s); d < s; d = order.at(d) ) ;
437  if (d == s)
438  while ( d = order.at(d), d != s )
439  swap( v.at(s), v.at(d));
440  }
441  }
442  }
443 }
void init(MatrixXf &X, VectorXf &y, LongData &Z, bool c=false, vector< bool > protect=vector< bool >())
Definition: data.cc:127
VectorXf y_t
Definition: data.h:82
bool classification
Definition: data.h:87
MatrixXf X_v
Definition: data.h:81
void split_longitudinal(LongData &Z, LongData &Z_t, LongData &Z_v, float split)
Definition: data.cc:401
MatrixXf X_t
Definition: data.h:80
void split_stratified(float split)
split classification data into strata
Definition: data.cc:272
void shuffle_data()
shuffles original data
Definition: data.cc:228
void setOriginalData(MatrixXf &X, VectorXf &y, LongData &Z, bool c=false, vector< bool > protect=vector< bool >())
Definition: data.cc:166
void setTrainingData(MatrixXf &X_t, VectorXf &y_t, LongData &Z_t, bool c=false, vector< bool > protect=vector< bool >())
Definition: data.cc:195
Data * t
Definition: data.h:93
VectorXf y_v
Definition: data.h:83
LongData Z_t
Definition: data.h:84
bool oCreated
Definition: data.h:76
Data * v
Definition: data.h:92
bool tCreated
Definition: data.h:77
bool vCreated
Definition: data.h:78
Data * o
Definition: data.h:91
void train_test_split(bool shuffle, float split)
splits data into training and validation folds.
Definition: data.cc:362
LongData Z_v
Definition: data.h:85
void reorder_longitudinal(vector< ArrayXf > &vec1, const vector< int > &order)
reordering utility for shuffling longitudinal data.
Definition: data.cc:432
void setValidationData(MatrixXf &X_v, VectorXf &y_v, LongData &Z_v, bool c=false, vector< bool > protect=vector< bool >())
Definition: data.cc:214
data holding X, y, and Z data
Definition: data.h:42
vector< int > protected_groups
Definition: data.h:62
vector< ArrayXb > cases
Definition: data.h:64
int group_intersections
Definition: data.h:63
VectorXf & y
Definition: data.h:46
Data(MatrixXf &X, VectorXf &y, LongData &Z, bool c=false, vector< bool > protect=vector< bool >())
Definition: data.cc:19
void get_batch(Data &db, int batch_size) const
select random subset of data for training weights.
Definition: data.cc:79
bool classification
Definition: data.h:48
vector< bool > protect
Definition: data.h:50
map< int, vector< float > > protect_levels
Definition: data.h:61
LongData & Z
Definition: data.h:47
bool validation
Definition: data.h:49
MatrixXf & X
Definition: data.h:45
void set_validation(bool v=true)
Definition: data.cc:77
void set_protected_groups()
Definition: data.cc:29
string log(string m, int v, string sep="\n") const
print message with verbosity control.
Definition: logger.cc:54
void shuffle(RandomAccessIterator first, RandomAccessIterator last)
Definition: rnd.h:53
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition: data.h:21
std::map< string, std::pair< vector< ArrayXf >, vector< ArrayXf > > > LongData
Definition: data.h:23
#define WARN(err)
Definition: error.h:33
static Logger & logger
Definition: logger.h:46
vector< T > unique(vector< T > w)
returns unique elements in vector
Definition: utils.h:336
static Rnd & r
Definition: rnd.h:135
std::string to_string(const T &value)
template function to convert objects to string for logging
Definition: utils.h:422
main Feat namespace
Definition: data.cc:13
int i
Definition: params.cc:552