Brush C++ API
A flexible interpretable machine learning framework
Loading...
Searching...
No Matches
utils.cpp
Go to the documentation of this file.
1/* Brush
2copyright 2017 William La Cava
3license: GNU/GPL v3
4*/
5
6#include "utils.h"
7#include "rnd.h"
8#include <unordered_set>
9
10namespace Brush{
11namespace Util{
12
// NOTE(review): the names suggest a progress-bar fill string and its width;
// confirm against the code that prints them.
std::string PBSTR = "====================";
int PBWIDTH{20};
15
17void clean(ArrayXf& x)
18{
19 x = (x < MIN_FLT).select(MIN_FLT,x);
20 x = (isinf(x)).select(MAX_FLT,x);
21 x = (isnan(x)).select(0,x);
22};
23
/// Removes every leading character of str that appears in chars.
/// @param str   string to trim (taken by value; the trimmed copy is returned).
/// @param chars set of characters to strip.
/// @return str without its leading run of characters from chars; empty if
///         str consists only of such characters.
std::string ltrim(std::string str, const std::string& chars)
{
    // npos (nothing to keep) makes erase() remove the whole string
    const std::string::size_type first_kept = str.find_first_not_of(chars);
    str.erase(0, first_kept);
    return str;
}
29
/// Removes every trailing character of str that appears in chars.
/// @param str   string to trim (taken by value; the trimmed copy is returned).
/// @param chars set of characters to strip.
/// @return str without its trailing run of characters from chars; empty if
///         str consists only of such characters.
std::string rtrim(std::string str, const std::string& chars)
{
    const std::string::size_type last_kept = str.find_last_not_of(chars);
    // when nothing is kept last_kept is npos, and npos + 1 wraps to 0,
    // so the whole string is erased
    str.erase(last_kept + 1);
    return str;
}
35
36std::string trim(std::string str, const std::string& chars)
37{
38 return ltrim(rtrim(str, chars), chars);
39}
40
41vector<type_index> get_dtypes(MatrixXf &X)
42{
43 vector<type_index> dtypes;
44
45 // get feature types (binary or continuous/categorical)
46 int i, j;
47 bool isBinary;
48 bool isCategorical;
49 std::map<float, bool> uniqueMap;
50 for(i = 0; i < X.cols(); i++)
51 {
52 isBinary = true;
53 isCategorical = true;
54 uniqueMap.clear();
55
56 for(j = 0; j < X.cols(); j++)
57 {
58 if(X(i, j) != 0 && X(i, j) != 1)
59 isBinary = false;
60 if(X(i,j) != floor(X(i, j)) && X(i,j) != ceil(X(i,j)))
61 isCategorical = false;
62 else
63 uniqueMap[X(i, j)] = true;
64 }
65
66 if(isBinary)
67 dtypes.push_back(typeid(ArrayXb));
68 else
69 {
70 if(isCategorical && uniqueMap.size() < 10)
71 dtypes.push_back(typeid( ArrayXi ));
72 else
73 dtypes.push_back(typeid(ArrayXf));
74 }
75 }
76 /* cout << "dtypes: " ; */
77 /* for (const auto& dt : dtypes) */
78 /* cout << dt << ", "; */
79 /* cout << "\n"; */
80 return dtypes;
81
82}
83
84
85Timer::Timer(bool run)
86{
87 if (run)
88 Reset();
89}
91{
92 _start = high_resolution_clock::now();
93}
94std::chrono::duration<float> Timer::Elapsed() const
95{
96 return high_resolution_clock::now() - _start;
97}
98
100void Normalizer::fit(MatrixXf& X, const vector<char>& dt)
101{
102 scale.clear();
103 offset.clear();
104 dtypes = dt;
105 for (unsigned int i=0; i<X.cols(); ++i)
106 {
107 // mean center
108 VectorXf tmp = X.col(i).array()-X.col(i).mean();
109 // scale by the standard deviation
110 scale.push_back(std::sqrt((tmp.array()).square().sum()/(tmp.size())));
111 offset.push_back(X.col(i).mean());
112 }
113
114}
115
118{
119 // normalize features
120 for (unsigned int i=0; i<X.cols(); ++i)
121 {
122 if (std::isinf(scale.at(i)))
123 {
124 X.col(i) = VectorXf::Zero(X.col(i).size());
125 continue;
126 }
127 // scale, potentially skipping binary and categorical cols
128 if (this->scale_all || dtypes.at(i)=='f')
129 {
130 X.col(i) = X.col(i).array() - offset.at(i);
131 if (scale.at(i) > NEAR_ZERO)
132 X.col(i) = X.col(i).array()/scale.at(i);
133 }
134 }
135}
136
137void Normalizer::fit_normalize(MatrixXf& X, const vector<char>& dtypes)
138{
139 fit(X, dtypes);
140 normalize(X);
141}
142
144// ArrayXb isinf(const ArrayXf& x)
145// {
146// ArrayXb infs(x.size());
147// for (unsigned i =0; i < infs.size(); ++i)
148// infs(i) = std::isinf(x(i));
149// return infs;
150// }
151
152// /// returns true for elements of x that are NaN
153// ArrayXb isnan(const ArrayXf& x)
154// {
155// ArrayXb nans(x.size());
156// for (unsigned i =0; i < nans.size(); ++i)
157// nans(i) = std::isnan(x(i));
158// return nans;
159
160// }
161
162/* Defined in utils.h
164template <typename T>
165string to_string(const T& value)
166{
167 std::stringstream ss;
168 ss << value;
169 return ss.str();
170}*/
172float condition_number(const MatrixXf& X)
173{
174 /* cout << "X (" << X.cols() << "x" << X.cols() << "): " << X.transpose() << "\n"; */
175 /* MatrixXf Y = X; */
176 /* try */
177 /* { */
178 /* JacobiSVD<MatrixXf> svd(Y); */
180 /* cout << "JacobiSVD declared\n"; */
181 float cond=MAX_FLT;
182 /* cout << "running svals\n"; */
183 ArrayXf svals = svd.singularValues();
184 /* cout << "svals: " << svals.transpose() << "\n"; */
185 if (svals.size()>0)
186 {
187 cond= svals(0) / svals(svals.size()-1);
188 }
189 /* cout << "CN: " + std::to_string(cond) + "\n"; */
190 return cond;
191
192 /* } */
193 /* catch (...) */
194 /* { */
195 return MAX_FLT;
196 /* } */
197}
198
200MatrixXf corrcoef(const MatrixXf& X)
201{
202 MatrixXf centered = X.colwise() - X.rowwise().mean();
203
204 /* std::cout << "centered: " << centered.rows() << "x" << centered.cols() << ": " */
205 /* << centered << "\n\n"; */
206 MatrixXf cov = ( centered * centered.adjoint()) / float(X.cols() - 1);
207 /* std::cout << "cov: " << cov.rows() << "x" << cov.cols() << ": " << cov << "\n\n"; */
208 VectorXf tmp = 1/cov.diagonal().array().sqrt();
209 auto d = tmp.asDiagonal();
210 /* std::cout << "1/sqrt(diag(cov)): " << d.rows() << "x" << d.cols() << ": " */
211 /* << d.diagonal() << "\n"; */
212 MatrixXf corrcoef = d * cov * d;
213 /* std::cout << "cov/d: " << corrcoef.rows() << "x" << corrcoef.cols() << ": " */
214 /* << corrcoef << "\n"; */
215 return corrcoef;
216}
217
218// returns the mean of the pairwise correlations of a matrix.
219float mean_square_corrcoef(const MatrixXf& X)
220{
221 MatrixXf tmp = corrcoef(X).triangularView<StrictlyUpper>();
222 float N = tmp.rows()*(tmp.rows()-1)/2;
223 /* cout << "triangular strictly upper view: " << tmp << "\n"; */
224 return tmp.array().square().sum()/N;
225}
226
228 float timer_count,
229 float bst_score,
230 float bst_score_v,
231 float md_score,
232 float md_score_v,
233 unsigned md_size,
234 unsigned md_complexity,
235 unsigned mx_size,
236 unsigned mx_complexity
237 )
238{
239 generation.push_back(index+1);
240 time.push_back(timer_count);
241
242 best_score.push_back(bst_score);
243 best_score_v.push_back(bst_score_v);
244 med_score.push_back(md_score);
245 med_score_v.push_back(md_score_v);
246
247 med_size.push_back(md_size);
248 med_complexity.push_back(md_complexity);
249
250 max_size.push_back(mx_size);
251 max_complexity.push_back(mx_complexity);
252}
253
254/* array<ArrayXf, 2> split(ArrayXf& v, ArrayXb& mask) */
255/* { */
256/* int size1 = mask.count(); */
257/* int size2 = mask.size() - size1; */
258/* ArrayXf L(size1), R(size2); */
259/* int idx1 = 0, idx2 = 0; */
260
261/* for (int i = 0; i < mask.size(); ++i) */
262/* { */
263/* if (mask(i)) */
264/* { */
265/* L(idx1) = v(i); */
266/* ++idx1; */
267/* } */
268/* else */
269/* { */
270/* R(idx2) = v(i); */
271/* ++idx2; */
272/* } */
273
274/* } */
275
276/* return {L, R}; */
277
278/* } */
279// TYPES = {INT, FLOAT, BOOL, ARRAYXB, ARRAYXI, ARRAYXF, LONG };
280
282 { typeid(int) , "int" },
283 { typeid(float) , "float" },
284 { typeid(bool) , "bool" },
285 { typeid(ArrayXf) , "ArrayXf" },
286 { typeid(ArrayXi) , "ArrayXi" },
287 { typeid(ArrayXb) , "ArrayXb" }
288 };
289// TypeMap<TYPES> type_enum = {
290// { typeid(int) , TYPES::INT },
291// { typeid(float) , TYPES::FLOAT },
292// { typeid(bool) , TYPES::BOOL },
293// { typeid(ArrayXf) , TYPES::ARRAYXF },
294// { typeid(ArrayXi) , TYPES::ARRAYXI },
295// { typeid(ArrayXb) , TYPES::ARRAYXB },
296// { typeid(Longitudinal), TYPES::LONG }
297// };
298
/// Returns the (first) index of the element with the middle value in v.
///
/// The middle value is the element that would sit at position v.size()/2 if
/// v were sorted (the upper median for even sizes). v itself is not
/// reordered; the selection runs on a copy.
///
/// @param v values to inspect; left unmodified.
/// @return index into v of the first element equal to the middle value, or
///         -1 when v is empty (the unguarded code dereferenced end() there).
int argmiddle(std::vector<float>& v)
{
    if (v.empty())
        return -1;

    // partial selection on a scratch copy keeps the caller's order intact
    std::vector<float> scratch(v);
    const auto middle = scratch.begin() + scratch.size() / 2;
    std::nth_element(scratch.begin(), middle, scratch.end());

    // locate that value in the original ordering
    const auto hit = std::find(v.begin(), v.end(), *middle);
    return static_cast<int>(std::distance(v.begin(), hit));
}
317float variance(const ArrayXf& v)
318{
319 return pow((v - v.mean()), 2).mean();
320};
321
323float skew(const ArrayXf& v)
324{
325 float mean = v.mean();
326 ArrayXf tmp = mean*ArrayXf::Ones(v.size());
327
328 float thirdMoment = pow((v - tmp), 3).mean();
329 float variance = pow((v - tmp), 2).mean();
330
331 return thirdMoment/sqrt(pow(variance, 3));
332};
333
335float kurtosis(const ArrayXf& v)
336{
337 float mean = v.mean();
338 ArrayXf tmp = mean*ArrayXf::Ones(v.size());
339
340 float fourthMoment = pow((v - tmp), 4).mean();
341 float variance = pow((v - tmp), 2).mean();
342
343 return fourthMoment/pow(variance, 2);
344};
345
346float covariance(const ArrayXf& x, const ArrayXf& y)
347{
348 float meanX = x.mean();
349 float meanY = y.mean();
350 //float count = x.size();
351
352 ArrayXf tmp1 = meanX*ArrayXf::Ones(x.size());
353 ArrayXf tmp2 = meanY*ArrayXf::Ones(y.size());
354
355 return ((x - tmp1)*(y - tmp2)).mean();
356
357};
358
359float slope(const ArrayXf& x, const ArrayXf& y)
360 // y: rise dimension, x: run dimension. slope = rise/run
361{
362 return covariance(x, y)/variance(x);
363};
364
365// Pearson correlation
366float pearson_correlation(const ArrayXf& x, const ArrayXf& y)
367{
368 return pow(covariance(x,y),2) / (variance(x) * variance(y));
369};
370
371
373float mad(const ArrayXf& x)
374{
375 // returns median absolute deviation (MAD)
376 // get median of x
377 float x_median = median(x);
378 //calculate absolute deviation from median
379 ArrayXf dev(x.size());
380 for (int i =0; i < x.size(); ++i)
381 dev(i) = fabs(x(i) - x_median);
382 // return median of the absolute deviation
383 return median(dev);
384};
385
/// Find and replace: returns a copy of subject with every non-overlapping
/// occurrence of search replaced by replace.
///
/// Text inserted by a replacement is never rescanned, so a replace string
/// that contains search does not recurse.
///
/// @param subject text to operate on (taken by value).
/// @param search  substring to look for; when empty, subject is returned
///                unchanged (an empty needle matches at every position and
///                previously made the loop run forever).
/// @param replace text substituted for each occurrence.
/// @return the resulting string.
std::string ReplaceString(std::string subject, const std::string& search,
                          const std::string& replace)
{
    if (search.empty())
        return subject;

    size_t pos = 0;
    while ((pos = subject.find(search, pos)) != std::string::npos) {
        subject.replace(pos, search.length(), replace);
        // skip past what was just inserted
        pos += replace.length();
    }
    return subject;
}
/// String find and replace in place: every non-overlapping occurrence of
/// search in subject is replaced by replace.
///
/// Text inserted by a replacement is never rescanned, so a replace string
/// that contains search does not recurse.
///
/// @param subject text to modify.
/// @param search  substring to look for; when empty, subject is left
///                unchanged (an empty needle matches at every position and
///                previously made the loop run forever).
/// @param replace text substituted for each occurrence.
void ReplaceStringInPlace(std::string& subject, const std::string& search,
                          const std::string& replace)
{
    if (search.empty())
        return;

    size_t pos = 0;
    while ((pos = subject.find(search, pos)) != std::string::npos) {
        subject.replace(pos, search.length(), replace);
        // skip past what was just inserted
        pos += replace.length();
    }
}
407
409vector<size_t> mask_to_index(const ArrayXb& mask)
410{
411 auto tmp = mask.cast<int>();
412 vector<size_t> idx;
413 for (int i = 0; i < mask.size(); ++i)
414 {
415 if (mask(i))
416 idx.push_back(i);
417 }
418 return idx;
419}
421tuple<vector<size_t>,vector<size_t>> mask_to_indices(const ArrayXb& mask)
422{
423 tuple<vector<size_t>,vector<size_t>> indices({},{});
424 for (int i = 0; i < mask.size(); ++i)
425 {
426 if (mask(i))
427 std::get<0>(indices).push_back(i);
428 else
429 std::get<1>(indices).push_back(i);
430 }
431 return indices;
432}
433
434} // Util
435} // Brush
void bind_engine(py::module &m, string name)
std::chrono::duration< float > Elapsed() const
Definition utils.cpp:94
Timer(bool run=false)
Definition utils.cpp:85
high_resolution_clock::time_point _start
Definition utils.h:290
static float MAX_FLT
Definition init.h:61
static float MIN_FLT
Definition init.h:62
static float NEAR_ZERO
Definition init.h:60
float mean_square_corrcoef(const MatrixXf &X)
Definition utils.cpp:219
std::string ReplaceString(std::string subject, const std::string &search, const std::string &replace)
find and replace string
Definition utils.cpp:387
MatrixXf corrcoef(const MatrixXf &X)
returns the pearson correlation coefficients of matrix.
Definition utils.cpp:200
vector< type_index > get_dtypes(MatrixXf &X)
calculates data types for each column of X
Definition utils.cpp:41
float slope(const ArrayXf &x, const ArrayXf &y)
slope of x/y
Definition utils.cpp:359
float mad(const ArrayXf &x)
median absolute deviation
Definition utils.cpp:373
float condition_number(const MatrixXf &X)
returns the condition number of X: the ratio of its first to its last singular value
Definition utils.cpp:172
std::string ltrim(std::string str, const std::string &chars)
Definition utils.cpp:24
float skew(const ArrayXf &v)
calculate skew
Definition utils.cpp:323
float pearson_correlation(const ArrayXf &x, const ArrayXf &y)
the normalized covariance of x and y
Definition utils.cpp:366
tuple< vector< size_t >, vector< size_t > > mask_to_indices(const ArrayXb &mask)
returns 2 indices: first where mask is true, and second where mask is false.
Definition utils.cpp:421
Scalar median(const T &v)
calculate median
Definition utils.h:202
void clean(ArrayXf &x)
limits node output to be between MIN_FLT and MAX_FLT
Definition utils.cpp:17
float kurtosis(const ArrayXf &v)
calculate kurtosis
Definition utils.cpp:335
TypeMap< std::string > type_names
Definition utils.cpp:281
std::string rtrim(std::string str, const std::string &chars)
Definition utils.cpp:30
void ReplaceStringInPlace(std::string &subject, const std::string &search, const std::string &replace)
string find and replace in place
Definition utils.cpp:398
std::string trim(std::string str, const std::string &chars)
Definition utils.cpp:36
float variance(const ArrayXf &v)
calculate variance
Definition utils.cpp:317
std::map< std::type_index, T > TypeMap
Definition utils.h:175
float covariance(const ArrayXf &x, const ArrayXf &y)
covariance of x and y
Definition utils.cpp:346
string PBSTR
Definition utils.cpp:13
int argmiddle(vector< float > &v)
returns the (first) index of the element with the middlest value in v
Definition utils.cpp:300
vector< size_t > mask_to_index(const ArrayXb &mask)
convert a boolean mask to an index array
Definition utils.cpp:409
int PBWIDTH
Definition utils.cpp:14
< nsga2 selection operator for getting the front
Definition data.cpp:12
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition types.h:39
Eigen::Array< int, Eigen::Dynamic, 1 > ArrayXi
Definition types.h:40
vector< unsigned > max_size
Definition utils.h:409
void update(int index, float timer_count, float bst_score, float bst_score_v, float md_score, float md_score_v, unsigned md_size, unsigned md_complexity, unsigned mx_size, unsigned mx_complexity)
Definition utils.cpp:227
vector< unsigned > max_complexity
Definition utils.h:410
vector< float > med_score_v
Definition utils.h:405
vector< unsigned > med_size
Definition utils.h:407
vector< float > med_score
Definition utils.h:404
vector< float > best_score_v
Definition utils.h:403
vector< unsigned > med_complexity
Definition utils.h:408
vector< float > best_score
Definition utils.h:402
vector< float > time
Definition utils.h:400
vector< int > generation
Definition utils.h:399
vector< float > offset
Definition utils.h:316
void fit(MatrixXf &X, const vector< char > &dt)
fit the scale and offset of data.
Definition utils.cpp:100
vector< char > dtypes
Definition utils.h:317
void fit_normalize(MatrixXf &X, const vector< char > &dtypes)
Definition utils.cpp:137
vector< float > scale
Definition utils.h:315
void normalize(MatrixXf &X)
normalize matrix.
Definition utils.cpp:117