Feat C++ API
A feature engineering automation tool
utils.cc
Go to the documentation of this file.
1 /* FEAT
2 copyright 2017 William La Cava
3 license: GNU/GPL v3
4 */
5 
#include "utils.h"
#include "rnd.h"
#include <algorithm>
#include <limits>
#include <unordered_set>
9 
10 namespace FT{
11 
12 namespace Util{
13 
14 string PBSTR = "====================";
15 int PBWIDTH = 20;
16 
18 void clean(ArrayXf& x)
19 {
20  x = (x < MIN_FLT).select(MIN_FLT,x);
21  x = (isinf(x)).select(MAX_FLT,x);
22  x = (isnan(x)).select(0,x);
23 };
24 void clean(VectorXf& x)
25 {
26  ArrayXf y = ArrayXf(x);
27  clean(y);
28  x = VectorXf(y);
29 }
30 
31 std::string ltrim(std::string str, const std::string& chars)
32 {
33  str.erase(0, str.find_first_not_of(chars));
34  return str;
35 }
36 
37 std::string rtrim(std::string str, const std::string& chars)
38 {
39  str.erase(str.find_last_not_of(chars) + 1);
40  return str;
41 }
42 
43 std::string trim(std::string str, const std::string& chars)
44 {
45  return ltrim(rtrim(str, chars), chars);
46 }
47 
49 vector<char> find_dtypes(const MatrixXf &X)
50 {
51  vector<char> dtypes;
52 
53  // get feature types (binary or continuous/categorical)
54  int i, j;
55  bool isBinary;
56  bool isCategorical;
57  std::map<float, bool> uniqueMap;
58  for(i = 0; i < X.rows(); i++)
59  {
60  isBinary = true;
61  isCategorical = true;
62  uniqueMap.clear();
63 
64  for(j = 0; j < X.cols(); j++)
65  {
66  if(X(i, j) != 0 && X(i, j) != 1)
67  isBinary = false;
68  if(X(i,j) != floor(X(i, j)) && X(i,j) != ceil(X(i,j)))
69  isCategorical = false;
70  else
71  uniqueMap[X(i, j)] = true;
72  }
73 
74  if(isBinary)
75  dtypes.push_back('b');
76  else
77  {
78  if(isCategorical && uniqueMap.size() < 10)
79  dtypes.push_back('c');
80  else
81  dtypes.push_back('f');
82  }
83  }
84  return dtypes;
85 
86 }
87 
89 float median(const ArrayXf& v)
90 {
91  // instantiate a vector
92  vector<float> x(v.size());
93  x.assign(v.data(),v.data()+v.size());
94  // middle element
95  size_t n = x.size()/2;
96  // sort nth element of array
97  nth_element(x.begin(),x.begin()+n,x.end());
98  // if evenly sized, return average of middle two elements
99  if (x.size() % 2 == 0) {
100  nth_element(x.begin(),x.begin()+n-1,x.end());
101  return (x.at(n) + x.at(n-1)) / 2;
102  }
103  // otherwise return middle element
104  else
105  return x.at(n);
106 }
107 
109 int argmiddle(vector<float>& v)
110 {
111  // instantiate a vector
112  vector<float> x = v;
113  // middle iterator
114  std::vector<float>::iterator middle = x.begin() + x.size()/2;
115  // sort nth element of array
116  nth_element(x.begin(), middle, x.end());
117  // find position of middle value in original array
118  std::vector<float>::iterator it = std::find(v.begin(), v.end(), *middle);
119 
120  std::vector<float>::size_type pos = std::distance(v.begin(), it);
121  /* cout << "middle index: " << pos << "\n"; */
122  /* cout << "middle value: " << *it << "\n"; */
123  return pos;
124 }
125 
127 float variance(const ArrayXf& v, float mean)
128 {
129  ArrayXf tmp = mean*ArrayXf::Ones(v.size());
130  return pow((v - tmp), 2).mean();
131 }
132 
134 float variance(const ArrayXf& v)
135 {
136  float mean = v.mean();
137  return variance(v, mean);
138 }
139 
141 float skew(const ArrayXf& v)
142 {
143  float mean = v.mean();
144  ArrayXf tmp = mean*ArrayXf::Ones(v.size());
145 
146  float thirdMoment = pow((v - tmp), 3).mean();
147  float variance = pow((v - tmp), 2).mean();
148 
149  return thirdMoment/sqrt(pow(variance, 3));
150 }
151 
153 float kurtosis(const ArrayXf& v)
154 {
155  float mean = v.mean();
156  ArrayXf tmp = mean*ArrayXf::Ones(v.size());
157 
158  float fourthMoment = pow((v - tmp), 4).mean();
159  float variance = pow((v - tmp), 2).mean();
160 
161  return fourthMoment/pow(variance, 2);
162 }
163 
164 float covariance(const ArrayXf& x, const ArrayXf& y)
165 {
166  float meanX = x.mean();
167  float meanY = y.mean();
168  //float count = x.size();
169 
170  ArrayXf tmp1 = meanX*ArrayXf::Ones(x.size());
171  ArrayXf tmp2 = meanY*ArrayXf::Ones(y.size());
172 
173  return ((x - tmp1)*(y - tmp2)).mean();
174 
175 }
176 
177 float slope(const ArrayXf& x, const ArrayXf& y)
178  // y: rise dimension, x: run dimension. slope = rise/run
179 {
180  return covariance(x, y)/variance(x);
181 }
182 
183 // Pearson correlation
184 float pearson_correlation(const ArrayXf& x, const ArrayXf& y)
185 {
186  return pow(covariance(x,y),2) / (variance(x) * variance(y));
187 }
189 float mad(const ArrayXf& x)
190 {
191  // returns median absolute deviation (MAD)
192  // get median of x
193  float x_median = median(x);
194  //calculate absolute deviation from median
195  ArrayXf dev(x.size());
196  for (int i =0; i < x.size(); ++i)
197  dev(i) = fabs(x(i) - x_median);
198  // return median of the absolute deviation
199  return median(dev);
200 }
201 
202 Timer::Timer(bool run)
203 {
204  if (run)
205  Reset();
206 }
208 {
209  _start = high_resolution_clock::now();
210 }
211 std::chrono::duration<float> Timer::Elapsed() const
212 {
213  return high_resolution_clock::now() - _start;
214 }
215 
217 ArrayXb isinf(const ArrayXf& x)
218 {
219  ArrayXb infs(x.size());
220  for (unsigned i =0; i < infs.size(); ++i)
221  infs(i) = std::isinf(x(i));
222  return infs;
223 }
224 
226 ArrayXb isnan(const ArrayXf& x)
227 {
228  ArrayXb nans(x.size());
229  for (unsigned i =0; i < nans.size(); ++i)
230  nans(i) = std::isnan(x(i));
231  return nans;
232 
233 }
234 
236 float condition_number(const MatrixXf& X)
237 {
238  BDCSVD<MatrixXf> svd(X);
239  float cond=MAX_FLT;
240  ArrayXf svals = svd.singularValues();
241  if (svals.size()>0)
242  {
243  cond= svals(0) / svals(svals.size()-1);
244  }
245 
246  if (std::isnan(cond) || std::isinf(cond))
247  return MAX_FLT;
248 
249  return cond;
250 
251 }
252 
254 MatrixXf corrcoef(const MatrixXf& X)
255 {
256  MatrixXf centered = X.colwise() - X.rowwise().mean();
257 
258  MatrixXf cov = ( centered * centered.adjoint()) / float(X.cols() - 1);
259  VectorXf tmp = 1/cov.diagonal().array().sqrt();
260  auto d = tmp.asDiagonal();
261  MatrixXf corrcoef = d * cov * d;
262  return corrcoef;
263 }
264 
265 // returns the mean of the pairwise correlations of a matrix.
266 float mean_square_corrcoef(const MatrixXf& X)
267 {
268  MatrixXf tmp = corrcoef(X).triangularView<StrictlyUpper>();
269  float N = tmp.rows()*(tmp.rows()-1)/2;
270  /* cout << "triangular strictly upper view: " << tmp << "\n"; */
271  return tmp.array().square().sum()/N;
272 }
273 
274 void Log_Stats::update(int index,
275  float timer_count,
276  float bst_score,
277  float bst_score_v,
278  float md_score,
279  float md_loss_v,
280  unsigned md_size,
281  unsigned md_complexity,
282  unsigned md_num_params,
283  unsigned md_dim)
284 {
285  generation.push_back(index+1);
286  time.push_back(timer_count);
287  min_loss.push_back(bst_score);
288  min_loss_v.push_back(bst_score_v);
289  med_loss.push_back(md_score);
290  med_loss_v.push_back(md_loss_v);
291  med_size.push_back(md_size);
292  med_complexity.push_back(md_complexity);
293  med_num_params.push_back(md_num_params);
294  med_dim.push_back(md_dim);
295 }
296 
297 std::string ravel(const vector<string>& v, string sep)
298 {
299  string out = "";
300  for (int i = 0; i < v.size(); ++i)
301  {
302  out += v.at(i);
303  if (i < v.size() - 1)
304  out += sep;
305  }
306  return out;
307 }
308 
309 }
310 
311 }
std::chrono::duration< float > Elapsed() const
Definition: utils.cc:211
high_resolution_clock::time_point _start
Definition: utils.h:124
void Reset()
Definition: utils.cc:207
Timer(bool run=false)
Definition: utils.cc:202
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition: data.h:21
float condition_number(const MatrixXf &X)
returns the condition number of a matrix.
Definition: utils.cc:236
std::string ltrim(std::string str, const std::string &chars)
Definition: utils.cc:31
float skew(const ArrayXf &v)
calculate skew
Definition: utils.cc:141
float mad(const ArrayXf &x)
median absolute deviation
Definition: utils.cc:189
ArrayXb isinf(const ArrayXf &x)
returns true for elements of x that are infinite
Definition: utils.cc:217
float slope(const ArrayXf &x, const ArrayXf &y)
slope of y with respect to x, i.e. covariance(x, y) / variance(x)
Definition: utils.cc:177
ArrayXb isnan(const ArrayXf &x)
returns true for elements of x that are NaN
Definition: utils.cc:226
std::string ravel(const vector< string > &v, string sep)
takes a vector of strings and returns them joined into a single delimited string.
Definition: utils.cc:297
std::string trim(std::string str, const std::string &chars)
Definition: utils.cc:43
float mean_square_corrcoef(const MatrixXf &X)
Definition: utils.cc:266
float covariance(const ArrayXf &x, const ArrayXf &y)
covariance of x and y
Definition: utils.cc:164
int argmiddle(vector< float > &v)
returns the (first) index of the element of v whose value is the middle order statistic (the value at position size/2 when sorted)
Definition: utils.cc:109
MatrixXf corrcoef(const MatrixXf &X)
returns the pearson correlation coefficients of matrix.
Definition: utils.cc:254
string PBSTR
Definition: utils.cc:14
float pearson_correlation(const ArrayXf &x, const ArrayXf &y)
the squared normalized covariance of x and y, i.e. covariance(x, y)^2 / (variance(x) * variance(y))
Definition: utils.cc:184
vector< char > find_dtypes(const MatrixXf &X)
determines the data type ('b' binary, 'c' categorical, 'f' float) of each row of matrix X.
Definition: utils.cc:49
float kurtosis(const ArrayXf &v)
calculate kurtosis
Definition: utils.cc:153
int PBWIDTH
Definition: utils.cc:15
float median(const ArrayXf &v)
calculate median
Definition: utils.cc:89
std::string rtrim(std::string str, const std::string &chars)
Definition: utils.cc:37
float variance(const ArrayXf &v, float mean)
calculate variance when mean provided
Definition: utils.cc:127
void clean(ArrayXf &x)
limits node output to be between MIN_FLT and MAX_FLT
Definition: utils.cc:18
main Feat namespace
Definition: data.cc:13
int i
Definition: params.cc:552
static float MAX_FLT
Definition: init.h:47
static float MIN_FLT
Definition: init.h:48
vector< unsigned > med_size
Definition: utils.h:389
vector< float > min_loss_v
Definition: utils.h:386
vector< float > med_loss_v
Definition: utils.h:388
vector< float > time
Definition: utils.h:384
vector< unsigned > med_num_params
Definition: utils.h:391
vector< unsigned > med_dim
Definition: utils.h:392
void update(int index, float timer_count, float bst_score, float bst_score_v, float md_score, float md_loss_v, unsigned md_size, unsigned md_complexity, unsigned md_num_params, unsigned md_dim)
Definition: utils.cc:274
vector< float > min_loss
Definition: utils.h:385
vector< unsigned > med_complexity
Definition: utils.h:390
vector< int > generation
Definition: utils.h:383
vector< float > med_loss
Definition: utils.h:387