Feat C++ API
A feature engineering automation tool
utils.h
Go to the documentation of this file.
1 /* FEAT
2 copyright 2017 William La Cava
3 license: GNU/GPL v3
4 */
5 
6 #ifndef UTILS_H
7 #define UTILS_H
8 
#include <algorithm>
#include <chrono>
#include <cmath>
#include <fstream>
#include <map>
#include <numeric>
#include <ostream>
#include <sstream>
#include <vector>

#include <Eigen/Dense>
#include <shogun/lib/common.h>

#include "../init.h"
#include "error.h"
19 //#include "data.h"
20 
21 using namespace Eigen;
22 
23 
24 namespace FT{
29 namespace Util{
30 
31 extern string PBSTR;
32 
33 extern int PBWIDTH;
34 
36 void clean(ArrayXf& x);
37 void clean(VectorXf& x);
38 
39 std::string ltrim(std::string str, const std::string& chars = "\t\n\v\f\r ");
40 
41 std::string rtrim(std::string str, const std::string& chars = "\t\n\v\f\r ");
42 
43 std::string trim(std::string str, const std::string& chars = "\t\n\v\f\r ");
44 
/// Check whether an element is present in a vector.
///
/// The vector is taken by const reference so that a membership test does not
/// copy the whole container.
///
/// @param v  vector to search
/// @param i  value to look for (compared with operator==)
/// @return   true if some element of v equals i, false otherwise
///           (always false for an empty vector)
template<typename T>
bool in(const vector<T>& v, const T& i)
{
    return std::find(v.begin(), v.end(), i) != v.end();
}
51 
53 float median(const ArrayXf& v);
54 
56 float variance(const ArrayXf& v, float mean);
57 
59 float variance(const ArrayXf& v);
60 
62 float skew(const ArrayXf& v);
63 
65 float kurtosis(const ArrayXf& v);
66 
68 float covariance(const ArrayXf& x, const ArrayXf& y);
69 
71 float slope(const ArrayXf& x, const ArrayXf& y);
72 
74 float pearson_correlation(const ArrayXf& x, const ArrayXf& y);
75 
77 float mad(const ArrayXf& x);
78 
/// Return the indices that sort a vector.
///
/// @param v          values to rank (left unmodified)
/// @param ascending  true: smallest value first; false: largest value first
/// @return           a permutation of 0..v.size()-1 ordering v as requested
template <typename T>
vector<size_t> argsort(const vector<T> &v, bool ascending=true)
{
    // start from the identity permutation 0, 1, ..., n-1
    vector<size_t> order(v.size());
    std::iota(order.begin(), order.end(), 0);

    // reorder the indices by comparing the values they refer to
    std::sort(order.begin(), order.end(),
              [&v, ascending](size_t lhs, size_t rhs)
              {
                  return ascending ? v[lhs] < v[rhs] : v[lhs] > v[rhs];
              });

    return order;
}
101 
/// Class for timing things.
///
/// The constructor, Reset() and Elapsed() are only declared here; their
/// definitions live outside this header.
class Timer
{
    using high_resolution_clock = std::chrono::high_resolution_clock;
    using seconds = std::chrono::seconds;

public:
    /// @param run  forwarded to the out-of-line constructor
    ///             (presumably starts the clock when true -- TODO confirm)
    explicit Timer(bool run = false);

    void Reset();

    /// Duration as a float-valued std::chrono::duration
    /// (presumably measured from _start -- TODO confirm).
    std::chrono::duration<float> Elapsed() const;

    /// Stream the tick count of Elapsed() to an output stream.
    template <typename T, typename Traits>
    friend std::basic_ostream<T, Traits>& operator<<(
            std::basic_ostream<T, Traits>& out, const Timer& timer)
    {
        out << timer.Elapsed().count();
        return out;
    }

private:
    high_resolution_clock::time_point _start;
};
127 
/// Return the softmax transformation of a vector.
///
/// Computes exp(w_i - max(w)) / sum_j exp(w_j - max(w)). Subtracting the
/// maximum leaves the result mathematically unchanged but keeps exp() from
/// overflowing to infinity (which would yield NaN) for large weights.
///
/// @param w  input weights
/// @return   vector of the same length as w whose entries sum to 1;
///           empty if w is empty
template <typename T>
vector<T> softmax(const vector<T>& w)
{
    vector<T> w_new;
    if (w.empty())
        return w_new;

    w_new.reserve(w.size());

    // shift by the largest weight so the biggest exponent is exp(0) = 1
    const T w_max = *std::max_element(w.begin(), w.end());

    T sum = 0;
    for (const T& w_i : w)
    {
        // each exponential is computed once and reused for the normalization
        const T e = std::exp(w_i - w_max);
        w_new.push_back(e);
        sum += e;
    }

    for (T& e : w_new)
        e /= sum;

    return w_new;
}
144 
147 {
148  Normalizer(bool sa=true, bool rm_offset=true)
149  : scale_all(sa)
150  , remove_offset(rm_offset)
151  {};
152 
153  vector<float> scale;
154  vector<float> offset;
155  vector<char> dtypes;
156  bool scale_all;
158 
160  template <typename T>
161  void fit(const MatrixBase<T>& X, const vector<char>& dt)
162  {
163  scale.clear();
164  offset.clear();
165  dtypes = dt;
166  for (unsigned int i=0; i<X.rows(); ++i)
167  {
168  /* tmp = X.row(i); */
169  // mean center
170  if (remove_offset)
171  {
172  /* tmp = tmp.array() - tmp.mean(); */
173  offset.push_back(float(X.row(i).mean()));
174  }
175  else
176  offset.push_back(0.0);
177  /* VectorXf tmp; */
178  // scale by the standard deviation
179  scale.push_back(
180  std::sqrt(
181  (X.row(i).array() - offset.at(i))
182  .square()
183  .sum()/(X.row(i).size()-1)
184  )
185  );
186  }
187 
188  }
190  template <typename T>
191  void normalize(MatrixBase<T>& X) const
192  {
193  // normalize features
194  for (unsigned int i=0; i<X.rows(); ++i)
195  {
196  if (std::isinf(scale.at(i)))
197  {
198  /* X.row(i) = Matrix<T, Dynamic, 1>::Zero(X.row(i).size()); */
199  continue;
200  }
201  // scale, potentially skipping binary and categorical rows
202  if (this->scale_all || dtypes.at(i)=='f')
203  {
204  if (remove_offset)
205  X.row(i) = X.row(i).array() - offset.at(i);
206  if (scale.at(i) > NEAR_ZERO)
207  X.row(i) = X.row(i).array()/scale.at(i);
208  }
209  }
210  }
212  // y = B_norm*X_norm.
213  //
214  template <typename T>
215  void adjust_weights(MatrixBase<T>& B) const
216  {
217  // Transform input, Bnorm, into B by dividing by scale.
218  // normalize features
219  for (unsigned int i=0; i<B.rows(); ++i)
220  {
221  if (std::isinf(scale.at(i)))
222  {
223  continue;
224  }
225  // scale, potentially skipping binary and categorical rows
226  if (this->scale_all || dtypes.at(i)=='f')
227  {
228  if (scale.at(i) > NEAR_ZERO)
229  B.row(i) = B.row(i).array()/scale.at(i);
230  }
231  }
232  }
233 
234  template<typename T>
235  void adjust_weights(shogun::SGVector<T>& B) const
236  {
237  auto tmp_map = Map<Eigen::Matrix<T,Dynamic,1>>(B.data(), B.size());
238  this->adjust_weights(tmp_map);
239  }
240 
241  template<typename T>
242  void adjust_weights(vector<T>& B) const
243  {
244  auto tmp_map = Map<Eigen::Matrix<T,Dynamic,1>>(B.data(), B.size());
245  this->adjust_weights(tmp_map);
246  }
247 
248  template <typename T>
249  float adjust_offset(const MatrixBase<T>& Bn, float init_offset) const
250  {
251  // yn = Bn_0 + Bn_1 * xn_1 + ...
252  // = Bn_0 + Bn_1 * (x-offset)/scale) + ...
253  //-> B_0 = Bn_0 - sum(Bn_i*offset_i/scale_i)
254  /* ArrayXf Bn = B.cast <float> (); */
255  float adjustment = 0;
256  // normalize features
257  for (unsigned int i=0; i<Bn.size(); ++i)
258  {
259  if (std::isinf(scale.at(i)))
260  {
261  continue;
262  }
263  float b = Bn(i);
264  // scale, potentially skipping binary and categorical rows
265  if (this->scale_all || dtypes.at(i)=='f')
266  {
267  if (scale.at(i) > NEAR_ZERO)
268  adjustment += b*offset.at(i)/scale.at(i);
269  }
270  }
271  return init_offset - adjustment;
272  }
273  template <typename T>
274  float adjust_offset(const vector<T>& Bn, float init_offset) const
275  {
276  auto w = Map<const Eigen::Matrix<T,Dynamic,1>>(Bn.data(), Bn.size());
277  return this->adjust_offset(w, init_offset);
278 
279  }
280  template <typename T>
281  float adjust_offset(const shogun::SGVector<T>& Bn, float init_offset) const
282  {
283  auto w = Map<const Eigen::Matrix<T,Dynamic,1>>(Bn.data(), Bn.size());
284  return this->adjust_offset(w, init_offset);
285 
286  }
288  template <typename T>
289  void invert(MatrixBase<T>& X) const
290  {
291  cout << "inverting X = " << X << endl;
292  // normalize features
293  for (unsigned int i=0; i<X.rows(); ++i)
294  {
295  if (std::isinf(scale.at(i)))
296  {
297  /* X.row(i) = Matrix<T, Dynamic, 1>::Zero(X.row(i).size()); */
298  continue;
299  }
300  // scale, potentially skipping binary and categorical rows
301  if (this->scale_all || dtypes.at(i)=='f')
302  {
303  cout << "X.row(i) = X.row(i).array()*scale.at(i) : \n\t";
304  cout << " = " << X.row(i).array() << "*" << scale.at(i) << endl;
305  if (scale.at(i) > NEAR_ZERO)
306  X.row(i) = X.row(i).array()*scale.at(i);
307  cout << "X.row(i) = X.row(i).array() + offset.at(i) : \n\t";
308  cout << " = " << X.row(i).array() << " + " << offset.at(i) << endl;
309  X.row(i) = X.row(i).array() + offset.at(i);
310  }
311  }
312  }
314  template <typename T>
315  void fit_normalize(MatrixBase<T>& X,
316  const vector<char>& dtypes)
317  {
318  this->fit(X, dtypes);
319  this->normalize(X);
320  }
321 };
// Generates the nlohmann::json to_json/from_json converters for Normalizer,
// covering the members scale, offset, dtypes and scale_all.
// NOTE(review): remove_offset is not listed, so it is presumably not
// round-tripped through JSON -- confirm whether that is intended.
322 NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Normalizer, scale, offset, dtypes, scale_all)
323 
324 
325 ArrayXb isinf(const ArrayXf& x);
327 
329 ArrayXb isnan(const ArrayXf& x);
330 
332 vector<char> find_dtypes(const MatrixXf &X);
333 
/// Return the unique elements of a vector, sorted in ascending order.
///
/// @param w  input values; taken by value so the caller's vector is untouched
/// @return   sorted vector with duplicates removed
template <typename T>
vector<T> unique(vector<T> w)
{
    std::sort(w.begin(), w.end());
    // std::unique moves the duplicates to the tail; drop that tail
    w.erase(std::unique(w.begin(), w.end()), w.end());
    return w;
}
344 
346 template <typename T>
347 vector<T> unique(Matrix<T, -1, -1> w)
348 {
349  vector<T> wv( w.data(), w.data()+w.size());
350  return unique(wv);
351 }
352 
354 template <typename T>
355 vector<T> unique(Matrix<T, -1, 1> w)
356 {
357  vector<T> wv( w.data(), w.data()+w.size());
358  return unique(wv);
359 }
360 
362 template <typename T>
363 vector<T> unique(Array<T, -1, 1> w)
364 {
365  vector<T> wv( w.data(), w.data()+w.rows()*w.cols());
366  return unique(wv);
367 }
368 
370 float condition_number(const MatrixXf& X);
371 
373 MatrixXf corrcoef(const MatrixXf& X);
374 
375 // returns the mean of the pairwise correlations of a matrix.
376 float mean_square_corrcoef(const MatrixXf& X);
377 
379 int argmiddle(vector<float>& v);
380 
/// Run statistics, stored as parallel vectors
/// (presumably one entry per logged generation -- TODO confirm against
/// the out-of-line definition of update()).
struct Log_Stats
{
    vector<int>      generation;
    vector<float>    time;
    vector<float>    min_loss;
    vector<float>    min_loss_v;
    vector<float>    med_loss;
    vector<float>    med_loss_v;
    vector<unsigned> med_size;
    vector<unsigned> med_complexity;
    vector<unsigned> med_num_params;
    vector<unsigned> med_dim;

    /// Record one set of statistics; defined outside this header.
    void update(int index,
                float timer_count,
                float bst_score,
                float bst_score_v,
                float md_score,
                float md_loss_v,
                unsigned md_size,
                unsigned md_complexity,
                unsigned md_num_params,
                unsigned md_dim);
};

/// Alternate spelling of the same type.
using Log_stats = Log_Stats;
407 
409  generation,
410  time,
411  min_loss,
412  min_loss_v,
413  med_loss,
414  med_loss_v,
415  med_size,
416  med_complexity,
417  med_num_params,
418  med_dim);
419 
/// Convert any streamable value to a string using its operator<<.
///
/// @param value  value to format
/// @return       the text produced by streaming value with default formatting
template <typename T>
std::string to_string(const T& value)
{
    std::ostringstream buffer;
    buffer << value;
    return buffer.str();
}
428 
/// Convert a value to a string in fixed-point notation.
///
/// @param a_value  value to format
/// @param n        stream precision; for floating-point values this is the
///                 number of digits after the decimal point
/// @return         the formatted text
template <typename T>
std::string to_string(const T a_value, const int n)
{
    std::ostringstream formatted;
    // same effect as streaming std::fixed
    formatted.setf(std::ios_base::fixed, std::ios_base::floatfield);
    formatted.precision(n);
    formatted << a_value;
    return formatted.str();
}
437 
439 std::string ravel(const vector<string>& v, string sep=",");
440 
441 } // Util
442 
443 } // FT
444 #endif
class for timing things.
Definition: utils.h:104
friend std::basic_ostream< T, Traits > & operator<<(std::basic_ostream< T, Traits > &out, const Timer &timer)
Definition: utils.h:117
std::chrono::duration< float > Elapsed() const
Definition: utils.cc:211
high_resolution_clock::time_point _start
Definition: utils.h:124
std::chrono::high_resolution_clock high_resolution_clock
Definition: utils.h:105
std::chrono::seconds seconds
Definition: utils.h:107
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition: data.h:21
float condition_number(const MatrixXf &X)
returns the condition number of a matrix.
Definition: utils.cc:236
std::string ltrim(std::string str, const std::string &chars)
Definition: utils.cc:31
float skew(const ArrayXf &v)
calculate skew
Definition: utils.cc:141
float mad(const ArrayXf &x)
median absolute deviation
Definition: utils.cc:189
ArrayXb isinf(const ArrayXf &x)
returns true for elements of x that are infinite
Definition: utils.cc:217
float slope(const ArrayXf &x, const ArrayXf &y)
slope of x/y
Definition: utils.cc:177
ArrayXb isnan(const ArrayXf &x)
returns true for elements of x that are NaN
Definition: utils.cc:226
vector< T > softmax(const vector< T > &w)
return the softmax transformation of a vector.
Definition: utils.h:130
std::string ravel(const vector< string > &v, string sep)
takes a vector string and returns it as a delimited string.
Definition: utils.cc:297
std::string trim(std::string str, const std::string &chars)
Definition: utils.cc:43
float mean_square_corrcoef(const MatrixXf &X)
Definition: utils.cc:266
float covariance(const ArrayXf &x, const ArrayXf &y)
covariance of x and y
Definition: utils.cc:164
int argmiddle(vector< float > &v)
returns the (first) index of the element with the middlest value in v
Definition: utils.cc:109
MatrixXf corrcoef(const MatrixXf &X)
returns the pearson correlation coefficients of matrix.
Definition: utils.cc:254
string PBSTR
Definition: utils.cc:14
float pearson_correlation(const ArrayXf &x, const ArrayXf &y)
the normalized covariance of x and y
Definition: utils.cc:184
vector< char > find_dtypes(const MatrixXf &X)
determines data types of columns of matrix X.
Definition: utils.cc:49
vector< T > unique(Array< T, -1, 1 > w)
returns unique elements in 1d Eigen array
Definition: utils.h:363
float kurtosis(const ArrayXf &v)
calculate kurtosis
Definition: utils.cc:153
int PBWIDTH
Definition: utils.cc:15
float median(const ArrayXf &v)
calculate median
Definition: utils.cc:89
std::string to_string(const T a_value, const int n)
Definition: utils.h:430
vector< size_t > argsort(const vector< T > &v, bool ascending=true)
return indices that sort a vector
Definition: utils.h:81
bool in(const vector< T > v, const T &i)
check if element is in vector.
Definition: utils.h:47
std::string rtrim(std::string str, const std::string &chars)
Definition: utils.cc:37
float variance(const ArrayXf &v, float mean)
calculate variance when mean provided
Definition: utils.cc:127
void clean(ArrayXf &x)
limits node output to be between MIN_FLT and MAX_FLT
Definition: utils.cc:18
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Log_Stats, generation, time, min_loss, min_loss_v, med_loss, med_loss_v, med_size, med_complexity, med_num_params, med_dim)
main Feat namespace
Definition: data.cc:13
int i
Definition: params.cc:552
static float NEAR_ZERO
Definition: init.h:46
vector< unsigned > med_size
Definition: utils.h:389
vector< float > min_loss_v
Definition: utils.h:386
vector< float > med_loss_v
Definition: utils.h:388
vector< float > time
Definition: utils.h:384
vector< unsigned > med_num_params
Definition: utils.h:391
vector< unsigned > med_dim
Definition: utils.h:392
vector< float > min_loss
Definition: utils.h:385
vector< unsigned > med_complexity
Definition: utils.h:390
vector< int > generation
Definition: utils.h:383
vector< float > med_loss
Definition: utils.h:387
normalizes a matrix to unit variance, 0 mean centered.
Definition: utils.h:147
vector< char > dtypes
Definition: utils.h:155
void fit_normalize(MatrixBase< T > &X, const vector< char > &dtypes)
fit then normalize
Definition: utils.h:315
void adjust_weights(shogun::SGVector< T > &B) const
Definition: utils.h:235
vector< float > scale
Definition: utils.h:151
void adjust_weights(vector< T > &B) const
Definition: utils.h:242
float adjust_offset(const shogun::SGVector< T > &Bn, float init_offset) const
Definition: utils.h:281
float adjust_offset(const MatrixBase< T > &Bn, float init_offset) const
Definition: utils.h:249
Normalizer(bool sa=true, bool rm_offset=true)
Definition: utils.h:148
float adjust_offset(const vector< T > &Bn, float init_offset) const
Definition: utils.h:274
void fit(const MatrixBase< T > &X, const vector< char > &dt)
fit the scale and offset of data.
Definition: utils.h:161
void adjust_weights(MatrixBase< T > &B) const
return weights of a linear model, y = B*X, given weights of the normalized model, y = B_norm*X_norm.
Definition: utils.h:215
void invert(MatrixBase< T > &X) const
inverse normalize a matrix.
Definition: utils.h:289
void normalize(MatrixBase< T > &X) const
normalize matrix.
Definition: utils.h:191
vector< float > offset
Definition: utils.h:154