Brush C++ API
A flexible interpretable machine learning framework
data.cpp
/* Brush
copyright 2020 William La Cava
license: GNU/GPL v3
*/

// internal includes
#include "data.h"

using namespace Brush::Util;
using std::min;

namespace Brush{

map<DataType,string> DataTypeName = {
    {DataType::ArrayB, "ArrayB"},
    {DataType::ArrayI, "ArrayI"},
    {DataType::ArrayF, "ArrayF"},
    {DataType::MatrixB, "MatrixB"},
    {DataType::MatrixI, "MatrixI"},
    {DataType::MatrixF, "MatrixF"},
    {DataType::TimeSeriesB, "TimeSeriesB"},
    {DataType::TimeSeriesI, "TimeSeriesI"},
    {DataType::TimeSeriesF, "TimeSeriesF"},
    {DataType::ArrayBJet, "ArrayBJet"},
    {DataType::ArrayIJet, "ArrayIJet"},
    {DataType::ArrayFJet, "ArrayFJet"},
    {DataType::MatrixBJet, "MatrixBJet"},
    {DataType::MatrixIJet, "MatrixIJet"},
    {DataType::MatrixFJet, "MatrixFJet"},
    {DataType::TimeSeriesBJet, "TimeSeriesBJet"},
    {DataType::TimeSeriesIJet, "TimeSeriesIJet"},
    {DataType::TimeSeriesFJet, "TimeSeriesFJet"}
};
map<string,DataType> DataNameType = Util::reverse_map(DataTypeName);

const map<DataType,std::type_index> DataTypeID = {
    {DataType::ArrayB, typeid(ArrayXb)},
    {DataType::ArrayI, typeid(ArrayXi)},
    {DataType::ArrayF, typeid(ArrayXf)},
    {DataType::MatrixB, typeid(ArrayXXb)},
    {DataType::MatrixI, typeid(ArrayXXi)},
    {DataType::MatrixF, typeid(ArrayXXf)},
    {DataType::TimeSeriesB, typeid(TimeSeriesb)},
    {DataType::TimeSeriesI, typeid(TimeSeriesi)},
    {DataType::TimeSeriesF, typeid(TimeSeriesf)}
};
map<std::type_index,DataType> DataIDType = Util::reverse_map(DataTypeID);
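// Example (illustrative): the forward and reverse maps above give name <-> enum <-> typeid
// lookups, e.g.
//     DataTypeName.at(DataType::ArrayF)                  // "ArrayF"
//     DataNameType.at("ArrayF")                          // DataType::ArrayF
//     DataIDType.at(std::type_index(typeid(ArrayXf)))    // DataType::ArrayF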

namespace Data{

// we have 3 basic types (bool, integer, float), specialized into
// arrays, matrices, and time series. Note that the dataset and all operators
// currently only work with arrays. TODO: implement time series and matrices.
// one DataType per State alternative, in the same order as the variant declaration
vector<DataType> StateTypes = {
    DataType::ArrayB, DataType::ArrayI, DataType::ArrayF,
    DataType::MatrixB, DataType::MatrixI, DataType::MatrixF,
    DataType::TimeSeriesB, DataType::TimeSeriesI, DataType::TimeSeriesF,
    DataType::ArrayBJet, DataType::ArrayIJet, DataType::ArrayFJet,
    DataType::MatrixBJet, DataType::MatrixIJet, DataType::MatrixFJet,
    DataType::TimeSeriesBJet, DataType::TimeSeriesIJet, DataType::TimeSeriesFJet
};

/// returns the DataType held in arg
DataType StateType(const State& arg)
{
    return StateTypes.at(arg.index());
}
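// Example (illustrative): a State currently holding an ArrayXf has the variant index of
// ArrayXf in the State declaration, so
//     State s = ArrayXf(ArrayXf::Zero(3));
//     StateType(s);    // DataType::ArrayF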
State check_type(const ArrayXf& x, const string t)
{
    State tmp;

    if (!t.empty())
    {
        // use DataNameType to get the DataType given its string representation
        DataType feature_type = DataNameType.at(t);

        if (feature_type == DataType::ArrayB)
            tmp = ArrayXb(x.cast<bool>());
        else if (feature_type == DataType::ArrayI)
            tmp = ArrayXi(x.cast<int>());
        else if (feature_type == DataType::ArrayF)
            tmp = ArrayXf(x.cast<float>());
        else
            HANDLE_ERROR_THROW(
                "Invalid feature type. check_type does not support this type: " + t);
    }
    else
    {
        // infer the feature type (binary, categorical, or continuous)
        bool isBinary = true;
        bool isCategorical = true;

        std::map<float, bool> uniqueMap;
        for(int i = 0; i < x.size(); i++)
        {
            if(x(i) != 0 && x(i) != 1)
                isBinary = false;
            if(x(i) != floor(x(i)) && x(i) != ceil(x(i)))
                isCategorical = false;
            else
                uniqueMap[x(i)] = true;
        }

        if (isBinary)
        {
            tmp = ArrayXb(x.cast<bool>());
        }
        else
        {
            if(isCategorical && uniqueMap.size() <= 10)
            {
                tmp = ArrayXi(x.cast<int>());
            }
            else
            {
                tmp = x;
            }
        }
    }

    return tmp;
}
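// Examples (illustrative) of the inference above when no type string is passed:
//     {0, 1, 1, 0}      -> ArrayB (only 0/1 values)
//     {1, 2, 3, 2}      -> ArrayI (integer-valued with at most 10 unique values)
//     {0.5, 1.7, 2.0}   -> ArrayF (continuous)
// Passing t as "ArrayB", "ArrayI" or "ArrayF" bypasses the inference and casts directly.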

template<typename StateRef>
State cast_type(const ArrayXf& x, const StateRef& x_ref)
{
    if (std::holds_alternative<ArrayXi>(x_ref))
        return ArrayXi(x.cast<int>());
    else if (std::holds_alternative<ArrayXb>(x_ref))
        return ArrayXb(x.cast<bool>());

    return x;
}
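// Note: cast_type is used by copy_and_make_features below so that a new column adopts the
// State alternative of a reference column, e.g. if the reference holds an ArrayXb, a raw
// float column is cast to bool instead of being re-inferred from its values by check_type.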

/// return a slice of the data using indices idx
Dataset Dataset::operator()(const vector<size_t>& idx) const
{
    std::map<std::string, State> new_features;
    for (auto& [key, value] : this->features)
    {
        auto& k = key;
        std::visit([&](auto&& arg)
        {
            using T = std::decay_t<decltype(arg)>;
            if constexpr (T::NumDimensions == 1)
                new_features[k] = T(arg(idx));
            else if constexpr (T::NumDimensions == 2)
                new_features[k] = T(arg(idx, Eigen::all));
            else
                static_assert(always_false_v<T>, "non-exhaustive visitor!");
        },
        value
        );
    }
    ArrayXf new_y;
    if (this->y.size() > 0)
    {
        new_y = this->y(idx);
    }
    // using constructor 1
    return Dataset(new_features, new_y, this->classification);
}
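// Usage sketch (illustrative; `d` is an assumed Dataset built elsewhere):
//     Dataset sub = d({0, 2, 5});   // rows 0, 2 and 5 of every feature, and of y if set
// Each feature keeps its State alternative, since the visitor above rebuilds the same
// array type T from the sliced values.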

// TODO: I need to improve how get_batch works. Maybe a function to update batch indexes, always using the same dataset?
// TODO: also, I need to make sure get_batch samples only from training data and not test data
Dataset Dataset::get_batch() const
{
    // always returns a new dataset, even when use_batch is false (in that case, it returns a copy of itself)

    if (!use_batch)
        return (*this);

    auto n_samples = int(this->get_n_samples());
    // guarantee that at least one sample is returned, since use_batch is true
    // only if batch_size is in (0, 1), and ceil rounds up
    n_samples = int(ceil(n_samples*batch_size));

    return (*this)(r.shuffled_index(n_samples));
}
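// Usage sketch (illustrative; `d` is an assumed Dataset):
//     d.set_batch_size(0.25);        // use_batch becomes true for sizes in (0, 1)
//     Dataset batch = d.get_batch(); // roughly 25% of the samples, drawn at random
// With batch_size == 1.0 (the constructor default), get_batch() returns a copy of the
// whole dataset.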

array<Dataset, 2> Dataset::split(const ArrayXb& mask) const
{
    // TODO: assert that mask is not filled with zeros or ones (would create
    // one empty partition)

    // split data into two based on mask.
    auto idx1 = Util::mask_to_index(mask);
    auto idx2 = Util::mask_to_index((!mask));
    return std::array<Dataset, 2>{ (*this)(idx1), (*this)(idx2) };
}
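// Usage sketch (illustrative; `d` is an assumed Dataset with a binary target):
//     ArrayXb mask = (d.y > 0.5);
//     auto [hits, rest] = d.split(mask);  // hits: rows where mask is true; rest: the others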

Dataset Dataset::get_training_data() const { return (*this)(training_data_idx); }
Dataset Dataset::get_validation_data() const { return (*this)(validation_data_idx); }

/// call init at the end of constructors to define metafeatures of the data
void Dataset::init()
{
    //TODO: populate feature_names, var_data_types, data_types, features_of_type
    // n_features = this->features.size();
    // note this will have to change in unsupervised settings
    // n_samples = this->y.size();

    if (this->features.size() == 0){
        HANDLE_ERROR_THROW(
            fmt::format("Error during the initialization of the dataset. It "
                        "does not contain any data\n")
        );
    }

    // fmt::print("Dataset::init()\n");
    for (const auto& [name, value]: this->features)
    {
        // fmt::print("name:{}\n",name);
        // save feature types
        auto feature_type = StateType(value);

        unique_insert(unique_data_types, feature_type);
        feature_types.push_back( feature_type );

        // add feature to appropriate map list
        this->features_of_type[feature_type].push_back(name);
    }

    // setting the training and validation data indexes
    auto n_samples = int(this->get_n_samples());

    training_data_idx.resize(0);
    validation_data_idx.resize(0);

    if (!use_validation)
    {
        vector<size_t> idx(n_samples);

        std::iota(idx.begin(), idx.end(), 0);

        std::transform(idx.begin(), idx.end(),
            back_inserter(training_data_idx),
            [&](int element) { return element; });

        std::transform(idx.begin(), idx.end(),
            back_inserter(validation_data_idx),
            [&](int element) { return element; });
    }
    else if (classification && true) // figuring out training and validation data indexes
    {   // Stratified split for classification problems. TODO: parameters to change stratify behavior? (and set false by default)
        std::map<float, vector<int>> class_indices; // TODO: I think I can remove many std:: from the code..
        for (size_t i = 0; i < n_samples; ++i) {
            class_indices[y[i]].push_back(i);
        }

        for (auto& class_group : class_indices) {
            auto& indices = class_group.second;

            int n_class_samples = indices.size();

            vector<size_t> idx(n_class_samples);
            if (shuffle_split)
                idx = r.shuffled_index(n_class_samples);
            else
                std::iota(idx.begin(), idx.end(), 0);

            auto n_train_samples = int(ceil(n_class_samples*(1.0-validation_size)));

            std::transform(idx.begin(), idx.begin() + n_train_samples,
                back_inserter(training_data_idx),
                [&](int element) { return indices[element]; });

            if (n_class_samples - n_train_samples == 0)
            {
                // same indices from the training data go to the validation data
                std::transform(idx.begin(), idx.begin() + n_train_samples,
                    back_inserter(validation_data_idx),
                    [&](int element) { return indices[element]; });
            }
            else
            {
                std::transform(idx.begin() + n_train_samples, idx.end(),
                    back_inserter(validation_data_idx),
                    [&](int element) { return indices[element]; });
            }
        }
    }
    else { // regression, or classification without stratification
        // logic for non-classification problems
        vector<size_t> idx(n_samples);

        if (shuffle_split) // TODO: make sure this works with multiple threads and fixed random state
            idx = r.shuffled_index(n_samples);
        else
            std::iota(idx.begin(), idx.end(), 0);

        // guarantee that at least one sample ends up in the training partition,
        // since validation_size is in (0, 1) and ceil rounds up
        auto n_train_samples = int(ceil(n_samples*(1-validation_size)));

        std::transform(idx.begin(), idx.begin() + n_train_samples,
            back_inserter(training_data_idx),
            [&](int element) { return element; });

        if (n_samples - n_train_samples == 0) { // training_data_idx contains all data
            validation_data_idx = training_data_idx;
        }
        else
        {
            std::transform(idx.begin() + n_train_samples, idx.end(),
                back_inserter(validation_data_idx),
                [&](int element) { return element; });
        }
    }
}
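// Example (illustrative) of the split above: with 10 samples, validation_size = 0.2 and
// shuffle_split = false, n_train_samples = ceil(10 * 0.8) = 8, so training_data_idx gets
// indices 0..7 and validation_data_idx gets 8..9. For classification the same rule is
// applied within each class, which keeps class proportions similar in both partitions.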

float Dataset::get_batch_size() { return batch_size; }
void Dataset::set_batch_size(float new_size) {
    batch_size = new_size;
    use_batch = batch_size > 0.0 && batch_size < 1.0;
}

/// turns input data into a feature map
map<string, State> Dataset::make_features(const ArrayXXf& X,
                                           const map<string,State>& Z,
                                           const vector<string>& vn,
                                           const vector<string>& ft
                                           )
{
    // fmt::print("Dataset::make_features()\n");
    map<string, State> tmp_features;

    // fmt::print("vn: {}\n",vn);

    // check variable names
    feature_names.resize(0);
    if (vn.empty())
    {
        // fmt::print("vn empty\n");
        for (int i = 0; i < X.cols(); ++i)
        {
            string v = "x_"+to_string(i);
            feature_names.push_back(v);
        }
    }
    else
    {
        if (vn.size() != X.cols())
            HANDLE_ERROR_THROW(
                fmt::format("Variable names and data size mismatch: "
                            "{} variable names and {} features in X",
                            vn.size(), X.cols()) );
        feature_names = vn;
    }

    // check variable types
    vector<string> var_types;
    if (ft.empty())
    {
        for (int i = 0; i < X.cols(); ++i)
        {
            var_types.push_back("");
        }
    }
    else {
        if (ft.size() != X.cols())
            HANDLE_ERROR_THROW(
                fmt::format("Feature type names and data size mismatch: "
                            "{} feature type names and {} features in X",
                            ft.size(), X.cols()) );
        var_types = ft;
    }

    for (int i = 0; i < X.cols(); ++i)
    {
        // fmt::print("X({}): {} \n",i,feature_names.at(i));
        State tmp = check_type(X.col(i).array(), var_types.at(i));

        tmp_features[feature_names.at(i)] = tmp;
    }
    // fmt::print("tmp_features insert\n");
    tmp_features.insert(Z.begin(), Z.end());

    return tmp_features;
};
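// Usage sketch (illustrative; `d`, the column names and the types are assumptions):
//     auto feats = d.make_features(X, {}, {"sex", "age"}, {"ArrayB", ""});
// forces the "sex" column to be stored as ArrayB, while the empty type string lets
// check_type infer a type for "age"; any entries in Z (e.g. longitudinal features) are
// merged into the returned map unchanged.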

/// turns input into a feature map, with feature types copied from a reference dataset
map<string,State> Dataset::copy_and_make_features(const ArrayXXf& X,
                                                  const Dataset& ref_dataset,
                                                  const vector<string>& vn
                                                  )
{
    feature_names.resize(0);
    if (vn.empty())
    {
        for (int i = 0; i < X.cols(); ++i)
        {
            string v = "x_"+to_string(i);
            feature_names.push_back(v);
        }
    }
    else
    {
        if (vn.size() != X.cols())
            HANDLE_ERROR_THROW(
                fmt::format("Variable names and data size mismatch: "
                            "{} variable names and {} features in X",
                            vn.size(),
                            X.cols()
                )
            );
        feature_names = vn;
    }

    if (ref_dataset.features.size() != feature_names.size())
        HANDLE_ERROR_THROW(
            fmt::format("Reference dataset with incompatible number of variables: "
                        "Reference has {} variable names, but X has {}",
                        ref_dataset.features.size(),
                        feature_names.size()
            )
        );

    map<string, State> tmp_features;
    for (int i = 0; i < X.cols(); ++i)
    {
        State tmp = cast_type(
            X.col(i).array(),
            ref_dataset.features.at(feature_names.at(i))
        );

        tmp_features[feature_names.at(i)] = tmp;
    }

    return tmp_features;
};
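// Usage sketch (illustrative; `d`, `X_new` and `train_data` are assumed to exist):
//     auto feats = d.copy_and_make_features(X_new, train_data, {"sex", "age"});
// Each column of X_new is cast (via cast_type above) to the State alternative that the
// same-named column has in train_data, rather than re-running check_type's inference;
// this keeps feature types consistent between fitting and prediction.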

ostream& operator<<(ostream& os, DataType dt)
{
    os << DataTypeName[dt];
    return os;
}

} // namespace Data
} // namespace Brush