Brush C++ API
A flexible interpretable machine learning framework
Loading...
Searching...
No Matches
data.cpp
Go to the documentation of this file.
1/* Brush
2copyright 2020 William La Cava
3license: GNU/GPL v3
4*/
5
6//internal includes
7#include "data.h"
8
9using namespace Brush::Util;
10using std::min;
11
12namespace Brush{
13
14map<DataType,string> DataTypeName = {
15 {DataType::ArrayB, "ArrayB"},
16 {DataType::ArrayI, "ArrayI"},
17 {DataType::ArrayF, "ArrayF"},
18 {DataType::MatrixB, "MatrixB"},
19 {DataType::MatrixI, "MatrixI"},
20 {DataType::MatrixF, "MatrixF"},
21 {DataType::TimeSeriesB, "TimeSeriesB"},
22 {DataType::TimeSeriesI,"TimeSeriesI"},
23 {DataType::TimeSeriesF, "TimeSeriesF"},
24 {DataType::ArrayBJet, "ArrayBJet"},
25 {DataType::ArrayIJet, "ArrayIJet"},
26 {DataType::ArrayFJet, "ArrayFJet"},
27 {DataType::MatrixBJet, "MatrixBJet"},
28 {DataType::MatrixIJet, "MatrixIJet"},
29 {DataType::MatrixFJet, "MatrixFJet"},
30 {DataType::TimeSeriesBJet, "TimeSeriesBJet"},
31 {DataType::TimeSeriesIJet,"TimeSeriesIJet"},
32 {DataType::TimeSeriesFJet, "TimeSeriesFJet"}
33};
35
36const map<DataType,std::type_index> DataTypeID = {
37 {DataType::ArrayB, typeid(ArrayXb)},
38 {DataType::ArrayI, typeid(ArrayXi)},
39 {DataType::ArrayF, typeid(ArrayXf)},
40 {DataType::MatrixB, typeid(ArrayXXb)},
41 {DataType::MatrixI, typeid(ArrayXXi)},
42 {DataType::MatrixF, typeid(ArrayXXf)},
46};
47map<std::type_index,DataType> DataIDType = Util::reverse_map(DataTypeID);
48
49namespace Data{
50
51// we have 3 basic types (bool, integer, float), specialized into
52// arrays, matrices, and timeseries. Notice that all dataset and operators
53// right now only work with arrays. TODO: implement timeseries and matrices.
65
66// /// returns the type_index held in arg
68{
69 return StateTypes.at(arg.index());
70}
71State check_type(const ArrayXf& x, const string t)
72{
73 State tmp;
74
75 if (!t.empty())
76 {
77 // Use DataNameType to get the statetype given the string representation
78 DataType feature_type = DataNameType.at(t);
79
80 if (feature_type == DataType::ArrayB)
81 tmp = ArrayXb(x.cast<bool>());
82 else if (feature_type == DataType::ArrayI)
83 tmp = ArrayXi(x.cast<int>());
84 else if (feature_type == DataType::ArrayF)
85 tmp = ArrayXf(x.cast<float>());
86 else
88 "Invalid feature type. check_type does not support this type: " + t);
89 }
90 else
91 {
92 // get feature types (binary or continuous/categorical)
93 bool isBinary = true;
94 bool isCategorical = true;
95
96 std::map<float, bool> uniqueMap;
97 for(int i = 0; i < x.size(); i++)
98 {
99 if(x(i) != 0 && x(i) != 1)
100 isBinary = false;
101 if(x(i) != floor(x(i)) && x(i) != ceil(x(i)))
102 isCategorical = false;
103 else
104 uniqueMap[x(i)] = true;
105 }
106
107 if (isBinary)
108 {
109 tmp = ArrayXb(x.cast<bool>());
110 }
111 else
112 {
113 if(isCategorical && uniqueMap.size() <= 10)
114 {
115 tmp = ArrayXi(x.cast<int>());
116 }
117 else
118 {
119 tmp = x;
120 }
121 }
122 }
123
124 return tmp;
125}
126
127template<typename StateRef>
128State cast_type(const ArrayXf& x, const StateRef& x_ref)
129{
130 if (std::holds_alternative<ArrayXi>(x_ref))
131 return ArrayXi(x.cast<int>());
132 else if (std::holds_alternative<ArrayXb>(x_ref))
133 return ArrayXb(x.cast<bool>());
134
135 return x;
136}
137
139Dataset Dataset::operator()(const vector<size_t>& idx) const
140{
141 std::map<std::string, State> new_features;
142 for (auto& [key, value] : this->features)
143 {
144 auto& k = key;
145 std::visit([&](auto&& arg)
146 {
147 using T = std::decay_t<decltype(arg)>;
148 if constexpr ( T::NumDimensions == 1)
149 new_features[k] = T(arg(idx));
150 else if constexpr (T::NumDimensions==2)
151 new_features[k] = T(arg(idx, Eigen::all));
152 else
153 static_assert(always_false_v<T>, "non-exhaustive visitor!");
154 },
155 value
156 );
157 }
158 ArrayXf new_y;
159 if (this->y.size()>0)
160 {
161 new_y = this->y(idx);
162 }
163 // using constructor 1
164 Dataset result(new_features, new_y, this->classification);
165 // Preserve the original feature name and type order from parent
166 // The constructor's init() may have reordered them, so we fix that here
167 result.feature_names = this->feature_names;
168 result.feature_types = this->feature_types;
169 result.feature_name_order_ = this->feature_names;
170
171 // Rebuild features_of_type to match the corrected order
172 result.features_of_type.clear();
173 result.unique_data_types.clear();
174 for (size_t i = 0; i < result.feature_names.size(); ++i) {
175 const auto& name = result.feature_names[i];
176 const auto& ftype = result.feature_types[i];
177 result.features_of_type[ftype].push_back(name);
179 }
180
181 return result;
182}
183
184
185// TODO: i need to improve how get batch works. Maybe a function to update batch indexes, and always using the same dataset?
186// TODO: also, i need to make sure the get batch will sample only from training data and not test
188{
189 // will always return a new dataset, even when use_batch is false (this case, returns itself)
190
191 if (!use_batch)
192 return (*this);
193
194 auto n_samples = int(this->get_n_samples());
195 // guarantee that at least one sample is going to be returned, since
196 // use_batch is true only if batch_size is (0, 1), and ceil will round
197 // up
198 n_samples = int(ceil(n_samples*batch_size));
199
200 return (*this)(r.shuffled_index(n_samples));
201}
202
203array<Dataset, 2> Dataset::split(const ArrayXb& mask) const
204{
205 // TODO: assert that mask is not filled with zeros or ones (would create
206 // one empty partition)
207
208 // split data into two based on mask.
209 auto idx1 = Util::mask_to_index(mask);
210 auto idx2 = Util::mask_to_index((!mask));
211 return std::array<Dataset, 2>{ (*this)(idx1), (*this)(idx2) };
212}
213
216
217vector<string> Dataset::get_feature_types() const {
218 // iterate through each feature name in order, get the data type, and return it. This is
219 // used in the python front-end to save the feature types from the training dataset
220 // when calling predict.
221
222 vector<string> python_feature_types;
223 // Iterate through feature_names to preserve order, not through features map
224 for (const auto& name: this->feature_names)
225 {
226 // fmt::print("name:{}\n",name);
227 const auto& value = this->features.at(name);
228
229 // save feature types
230 auto feature_type = StateType(value);
231
232 if (feature_type == DataType::ArrayB)
233 python_feature_types.push_back("ArrayB");
234 else if (feature_type == DataType::ArrayI)
235 python_feature_types.push_back("ArrayI");
236 else if (feature_type == DataType::ArrayF)
237 python_feature_types.push_back("ArrayF");
238 else
240 "get_feature_type does not support the type of this feature yet: " + name +
241 "as a notice, this function is suposed to be used in the python side, to extract data types inferred by Brush type sniffer.");
242 }
243
244 return python_feature_types;
245}
246
247
251{
252 //TODO: populate feature_names, var_data_types, data_types, features_of_type
253 // n_features = this->features.size();
254 // note this will have to change in unsupervised settings
255 // n_samples = this->y.size();
256
257 if (this->features.size() == 0){
259 fmt::format("Error during the initialization of the dataset. It "
260 "does not contain any data\n")
261 );
262 }
263
264 // fmt::print("Dataset::init()\n");
265 // Use the stored original feature name order if available, otherwise iterate through the map.
266 // IMPORTANT: use a value (not a const-ref) to avoid a dangling reference from binding
267 // a const-ref to the temporary produced by a mixed lvalue/prvalue ternary expression.
268 vector<string> names_to_use;
269 if (this->feature_name_order_.empty()) {
270 for (const auto& [name, value] : this->features) {
271 names_to_use.push_back(name);
272 }
273 } else {
274 names_to_use = this->feature_name_order_;
275 }
276
277 for (const auto& name : names_to_use)
278 {
279 // fmt::print("name:{}\n",name);
280 const auto& value = this->features.at(name);
281
282 // save feature types
283 auto feature_type = StateType(value);
284
286 feature_types.push_back( feature_type );
287
288 // add feature to appropriate map list
289 this->features_of_type[feature_type].push_back(name);
290
291 // populate feature names in the original order
292 this->feature_names.push_back(name);
293 }
294
295 // setting the training and validation data indexes
296 auto n_samples = int(this->get_n_samples());
297
298 training_data_idx.resize(0);
299 validation_data_idx.resize(0);
300
301 if (!use_validation)
302 {
303 vector<size_t> idx(n_samples);
304
305 std::iota(idx.begin(), idx.end(), 0);
306
307 std::transform(idx.begin(), idx.end(),
308 back_inserter(training_data_idx),
309 [&](int element) { return element; });
310
311 std::transform(idx.begin(), idx.end(),
312 back_inserter(validation_data_idx),
313 [&](int element) { return element; });
314 }
315 else if (classification && true) // figuring out training and validation data indexes
316 { // Stratified split for classification problems. TODO: parameters to change stratify behavior? (and set false by default)
317 std::map<float, vector<int>> class_indices; // TODO: I think I can remove many std:: from the code..
318 for (size_t i = 0; i < n_samples; ++i) {
319 class_indices[y[i]].push_back(i);
320 }
321
322 for (auto& class_group : class_indices) {
323 auto& indices = class_group.second;
324
325 int n_class_samples = indices.size();
326
327 vector<size_t> idx(n_class_samples);
328 if (shuffle_split)
329 idx = r.shuffled_index(n_class_samples);
330 else
331 std::iota(idx.begin(), idx.end(), 0);
332
333 auto n_train_samples = int(ceil(n_class_samples*(1.0-validation_size)));
334
335 std::transform(idx.begin(), idx.begin() + n_train_samples,
336 back_inserter(training_data_idx),
337 [&](int element) { return indices[element]; });
338
339 if (n_class_samples - n_train_samples == 0)
340 {
341 // same indices from the training data to the validation data
342 std::transform(idx.begin(), idx.begin() + n_train_samples,
343 back_inserter(validation_data_idx),
344 [&](int element) { return indices[element]; });
345 }
346 else
347 {
348 std::transform(idx.begin() + n_train_samples, idx.end(),
349 back_inserter(validation_data_idx),
350 [&](int element) { return indices[element]; });
351 }
352 }
353 }
354 else { // regression, or classification without stratification
355 // logic for non-classification problems
356 vector<size_t> idx(n_samples);
357
358 if (shuffle_split) // TODO: make sure this works with multiple threads and fixed random state
359 idx = r.shuffled_index(n_samples);
360 else
361 std::iota(idx.begin(), idx.end(), 0);
362
363 // guarantee that at least one sample is going to be returned, since
364 // use_batch is true only if batch_size is (0, 1), and ceil will round
365 // up
366 auto n_train_samples = int(ceil(n_samples*(1-validation_size)));
367
368 std::transform(idx.begin(), idx.begin() + n_train_samples,
369 back_inserter(training_data_idx),
370 [&](int element) { return element; });
371
372 if (n_samples - n_train_samples == 0) { // training_data_idx contains all data
374 }
375 else
376 {
377 std::transform(idx.begin() + n_train_samples, idx.end(),
378 back_inserter(validation_data_idx),
379 [&](int element) { return element; });
380 }
381 }
382}
383
385void Dataset::set_batch_size(float new_size) {
386 batch_size = new_size;
387 use_batch = batch_size > 0.0 && batch_size < 1.0;
388}
389
391map<string, State> Dataset::make_features(const ArrayXXf& X,
392 const map<string,State>& Z,
393 const vector<string>& vn,
394 const vector<string>& ft
395 )
396{
397 // fmt::print("Dataset::make_features()\n");
398 map<string, State> tmp_features;
399
400 // fmt::print("vn: {}\n",vn);
401
402 // check variable names
403 vector<string> tmp_feature_names = {};
404 if (vn.empty())
405 {
406 // fmt::print("vn empty\n");
407 for (int i = 0; i < X.cols(); ++i)
408 {
409 string v = "x_"+to_string(i);
410 tmp_feature_names.push_back(v);
411 }
412 }
413 else
414 {
415 if (vn.size() != X.cols())
417 fmt::format("Variable names and data size mismatch: "
418 "{} variable names and {} features in X",
419 vn.size(), X.cols()) );
420 tmp_feature_names = vn;
421 }
422
423 // check variable types
424 vector<string> var_types;
425 if (ft.empty())
426 {
427 for (int i = 0; i < X.cols(); ++i)
428 {
429 var_types.push_back("");
430 }
431 }
432 else {
433 if (ft.size() != X.cols())
435 fmt::format("Feature type names and data size mismatch: "
436 "{} feature type names and {} features in X",
437 ft.size(), X.cols()) );
438
439 var_types = ft;
440 }
441
442 for (int i = 0; i < X.cols(); ++i)
443 {
444 // fmt::print("X({}): {} \n",i,tmp_feature_names.at(i));
445 State tmp = check_type(X.col(i).array(), var_types.at(i));
446
447 tmp_features[tmp_feature_names.at(i)] = tmp;
448 }
449 // fmt::print("tmp_features insert\n");
450 tmp_features.insert(Z.begin(), Z.end());
451
452 // Store the original feature name order (X features first, then Z features)
453 this->feature_name_order_ = tmp_feature_names;
454 for (const auto& [name, value] : Z) {
455 this->feature_name_order_.push_back(name);
456 }
457
458 return tmp_features;
459};
460
462map<string,State> Dataset::copy_and_make_features(const ArrayXXf& X,
463 const Dataset& ref_dataset,
464 const vector<string>& vn
465 )
466{
467 vector<string> tmp_feature_names = {};
468 if (vn.empty())
469 {
470 for (int i = 0; i < X.cols(); ++i)
471 {
472 string v = "x_"+to_string(i);
473 tmp_feature_names.push_back(v);
474 }
475 }
476 else
477 {
478 if (vn.size() != X.cols())
480 fmt::format("Variable names and data size mismatch: "
481 "{} variable names and {} features in X",
482 vn.size(),
483 X.cols()
484 )
485 );
486 tmp_feature_names = vn;
487 }
488
489 if (ref_dataset.features.size() != tmp_feature_names.size())
491 fmt::format("Reference dataset with incompatible number of variables: "
492 "Reference has {} variable names, but X has {}",
493 ref_dataset.features.size(),
494 tmp_feature_names.size()
495 )
496 );
497
498 map<string, State> tmp_features;
499 for (int i = 0; i < X.cols(); ++i)
500 {
501 State tmp = cast_type(
502 X.col(i).array(),
503 ref_dataset.features.at(tmp_feature_names.at(i))
504 );
505
506 tmp_features[tmp_feature_names.at(i)] = tmp;
507 }
508
509 // Store the original feature name order
510 this->feature_name_order_ = tmp_feature_names;
511
512 return tmp_features;
513};
514
515ostream& operator<<(ostream& os, DataType dt)
516{
517 os << DataTypeName[dt];
518 return os;
519}
520
521} // data
522} // Brush
bool classification
whether this is a classification problem
Definition data.h:85
vector< string > feature_name_order_
stores the original feature name order before map sorting
Definition data.h:60
Dataset get_validation_data() const
Definition data.cpp:215
std::map< string, State > features
dataset features, as key value pairs
Definition data.h:76
int get_n_samples() const
Definition data.h:225
Dataset(std::map< string, State > &d, const Ref< const ArrayXf > &y_=ArrayXf(), bool c=false, float validation_size=0.0, float batch_size=1.0, bool shuffle_split=false)
Definition data.h:117
vector< size_t > training_data_idx
Definition data.h:57
Dataset get_batch() const
select random subset of data for training weights.
Definition data.cpp:187
vector< string > get_feature_types() const
Definition data.cpp:217
std::vector< DataType > feature_types
types of data in the features.
Definition data.h:67
std::unordered_map< DataType, vector< string > > features_of_type
map from data types to features having that type.
Definition data.h:73
float batch_size
percentage of training data size to use in each batch. if 1.0, then all data is used
Definition data.h:94
bool use_validation
Definition data.h:90
std::vector< string > feature_names
names of the feature types as string representations.
Definition data.h:70
std::vector< DataType > unique_data_types
keeps track of the unique data types in the dataset.
Definition data.h:64
map< string, State > copy_and_make_features(const ArrayXXf &X, const Dataset &ref_dataset, const vector< string > &vn={})
turns input into a feature map, with feature types copied from a reference
Definition data.cpp:462
map< string, State > make_features(const ArrayXXf &X, const map< string, State > &Z={}, const vector< string > &vn={}, const vector< string > &ft={})
turns input data into a feature map
Definition data.cpp:391
void init()
call init at the end of constructors to define metafeatures of the data.
Definition data.cpp:250
float validation_size
fraction of the original data held out for validation. if 0.0, then all data is used for both training and validation
Definition data.h:89
vector< size_t > validation_data_idx
Definition data.h:58
ArrayXf y
length N array, the target label
Definition data.h:82
float get_batch_size()
Definition data.cpp:384
std::array< Dataset, 2 > split(const ArrayXb &mask) const
Definition data.cpp:203
Dataset get_training_data() const
Definition data.cpp:214
void set_batch_size(float new_size)
Definition data.cpp:385
Dataset operator()(const vector< size_t > &idx) const
return a slice of the data using indices idx
Definition data.cpp:139
#define HANDLE_ERROR_THROW(err)
Definition error.h:27
namespace containing Data structures used in Brush
Definition data.cpp:49
std::vector< DataType > StateTypes
Definition data.cpp:54
TimeSeries< bool > TimeSeriesb
TimeSeries convenience typedefs.
Definition types.h:110
State cast_type(const ArrayXf &x, const StateRef &x_ref)
Definition data.cpp:128
TimeSeries< float > TimeSeriesf
Definition types.h:112
DataType StateType(const State &arg)
Definition data.cpp:67
State check_type(const ArrayXf &x, const string t)
determines data types of columns of matrix X.
Definition data.cpp:71
ostream & operator<<(ostream &os, DataType dt)
Definition data.cpp:515
std::variant< ArrayXb, ArrayXi, ArrayXf, ArrayXXb, ArrayXXi, ArrayXXf, TimeSeriesb, TimeSeriesi, TimeSeriesf, ArrayXbJet, ArrayXiJet, ArrayXfJet, ArrayXXbJet, ArrayXXiJet, ArrayXXfJet, TimeSeriesbJet, TimeSeriesiJet, TimeSeriesfJet > State
defines the possible types of data flowing thru nodes.
Definition types.h:140
TimeSeries< int > TimeSeriesi
Definition types.h:111
namespace containing various utility functions
Definition error.cpp:11
static map< V, K > reverse_map(const map< K, V > &m)
Given a map from keys to values, creates a new map from values to keys.
Definition utils.h:752
string to_string(const T &value)
template function to convert objects to string for logging
Definition utils.h:369
void unique_insert(Vector &v, const T &t)
unique insertion into a vector. allows a vector to be used like a set. source: http://www....
Definition utils.h:684
static Rnd & r
Definition rnd.h:176
vector< size_t > mask_to_index(const ArrayXb &mask)
convert a boolean mask to an index array
Definition utils.cpp:409
< nsga2 selection operator for getting the front
Definition bandit.cpp:4
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition types.h:39
Eigen::Array< int, Eigen::Dynamic, Eigen::Dynamic > ArrayXXi
Definition types.h:42
DataType
data types.
Definition types.h:143
constexpr bool always_false_v
Definition init.h:67
Eigen::Array< bool, Eigen::Dynamic, Eigen::Dynamic > ArrayXXb
Definition types.h:41
map< DataType, string > DataTypeName
Definition data.cpp:14
const map< DataType, std::type_index > DataTypeID
Definition data.cpp:36
Eigen::Array< int, Eigen::Dynamic, 1 > ArrayXi
Definition types.h:40
map< std::type_index, DataType > DataIDType
Definition data.cpp:47
map< string, DataType > DataNameType
Definition data.cpp:34