Brush C++ API
A flexible interpretable machine learning framework
Loading...
Searching...
No Matches
data.cpp
Go to the documentation of this file.
1/* Brush
2copyright 2020 William La Cava
3license: GNU/GPL v3
4*/
5
6//internal includes
7#include "data.h"
8
9using namespace Brush::Util;
10using std::min;
11
12namespace Brush{
13
14map<DataType,string> DataTypeName = {
15 {DataType::ArrayB, "ArrayB"},
16 {DataType::ArrayI, "ArrayI"},
17 {DataType::ArrayF, "ArrayF"},
18 {DataType::MatrixB, "MatrixB"},
19 {DataType::MatrixI, "MatrixI"},
20 {DataType::MatrixF, "MatrixF"},
21 {DataType::TimeSeriesB, "TimeSeriesB"},
22 {DataType::TimeSeriesI,"TimeSeriesI"},
23 {DataType::TimeSeriesF, "TimeSeriesF"},
24 {DataType::ArrayBJet, "ArrayBJet"},
25 {DataType::ArrayIJet, "ArrayIJet"},
26 {DataType::ArrayFJet, "ArrayFJet"},
27 {DataType::MatrixBJet, "MatrixBJet"},
28 {DataType::MatrixIJet, "MatrixIJet"},
29 {DataType::MatrixFJet, "MatrixFJet"},
30 {DataType::TimeSeriesBJet, "TimeSeriesBJet"},
31 {DataType::TimeSeriesIJet,"TimeSeriesIJet"},
32 {DataType::TimeSeriesFJet, "TimeSeriesFJet"}
33};
35
36const map<DataType,std::type_index> DataTypeID = {
37 {DataType::ArrayB, typeid(ArrayXb)},
38 {DataType::ArrayI, typeid(ArrayXi)},
39 {DataType::ArrayF, typeid(ArrayXf)},
40 {DataType::MatrixB, typeid(ArrayXXb)},
41 {DataType::MatrixI, typeid(ArrayXXi)},
42 {DataType::MatrixF, typeid(ArrayXXf)},
46};
47map<std::type_index,DataType> DataIDType = Util::reverse_map(DataTypeID);
48
49namespace Data{
50
62
63// /// returns the type_index held in arg
65{
66 return StateTypes.at(arg.index());
67}
68State check_type(const ArrayXf& x)
69{
70 // get feature types (binary or continuous/categorical)
71 bool isBinary = true;
72 bool isCategorical = true;
73 std::map<float, bool> uniqueMap;
74 for(int i = 0; i < x.size(); i++)
75 {
76
77 if(x(i) != 0 && x(i) != 1)
78 isBinary = false;
79 if(x(i) != floor(x(i)) && x(i) != ceil(x(i)))
80 isCategorical = false;
81 else
82 uniqueMap[x(i)] = true;
83 }
84
85 State tmp; // = x;
86
87 if (isBinary)
88 {
89 tmp = ArrayXb(x.cast<bool>());
90 }
91 else
92 {
93 if(isCategorical && uniqueMap.size() <= 10)
94 {
95 tmp = ArrayXi(x.cast<int>());
96 }
97 else
98 {
99 tmp = x;
100 }
101 }
102 return tmp;
103}
104
105template<typename StateRef>
106State cast_type(const ArrayXf& x, const StateRef& x_ref)
107{
108 if (std::holds_alternative<ArrayXi>(x_ref))
109 return ArrayXi(x.cast<int>());
110 else if (std::holds_alternative<ArrayXb>(x_ref))
111 return ArrayXb(x.cast<bool>());
112
113 return x;
114}
115
117Dataset Dataset::operator()(const vector<size_t>& idx) const
118{
119 std::map<std::string, State> new_features;
120 for (auto& [key, value] : this->features)
121 {
122 std::visit([&](auto&& arg)
123 {
124 using T = std::decay_t<decltype(arg)>;
125 if constexpr ( T::NumDimensions == 1)
126 new_features[key] = T(arg(idx));
127 else if constexpr (T::NumDimensions==2)
128 new_features[key] = T(arg(idx, Eigen::all));
129 else
130 static_assert(always_false_v<T>, "non-exhaustive visitor!");
131 },
132 value
133 );
134 }
135 ArrayXf new_y;
136 if (this->y.size()>0)
137 {
138 new_y = this->y(idx);
139 }
141}
142
143
144// TODO: i need to improve how get batch works. Maybe a function to update batch indexes, and always using the same dataset?
145// TODO: also, i need to make sure the get batch will sample only from training data and not test
147{
148 // always returns a new dataset, even when use_batch is false (in that case, a copy of itself)
149
150 if (!use_batch)
151 return (*this);
152
153 auto n_samples = int(this->get_n_samples());
154 // guarantee that at least one sample is returned: use_batch is
155 // true only if batch_size is in (0, 1), and ceil rounds the count
156 // up
158
159 return (*this)(r.shuffled_index(n_samples));
160}
161
162array<Dataset, 2> Dataset::split(const ArrayXb& mask) const
163{
164 // TODO: assert that mask is not filled with zeros or ones (would create
165 // one empty partition)
166
167 // split data into two based on mask.
169 auto idx2 = Util::mask_to_index((!mask));
170 return std::array<Dataset, 2>{ (*this)(idx1), (*this)(idx2) };
171}
172
175
179{
180 //TODO: populate var_names, var_data_types, data_types, features_of_type
181 // n_features = this->features.size();
182 // note this will have to change in unsupervised settings
183 // n_samples = this->y.size();
184
185 if (this->features.size() == 0){
187 fmt::format("Error during the initialization of the dataset. It "
188 "does not contain any data\n")
189 );
190 }
191
192 // fmt::print("Dataset::init()\n");
193 for (const auto& [name, value]: this->features)
194 {
195 // fmt::print("name:{}\n",name);
196 // save feature types
197 auto feature_type = StateType(value);
198
200 feature_types.push_back( feature_type);
201 // add feature to appropriate map list
202 this->features_of_type[feature_type].push_back(name);
203 }
204
205 // setting the training and validation data indexes
206 auto n_samples = int(this->get_n_samples());
208
209 // guarantee that at least one sample is returned: use_batch is
210 // true only if batch_size is in (0, 1), and ceil rounds the count
211 // up
213
214 training_data_idx.resize(0);
215 std::transform(idx.begin(), idx.begin() + n_train_samples,
217 [&](int element) { return element; });
218
219 if ( use_validation && (n_samples - n_train_samples != 0) ) {
220 validation_data_idx.resize(0);
221 std::transform(idx.begin() + n_train_samples, idx.end(),
223 [&](int element) { return element; });
224 }
225 else {
227 }
228}
229
230// TODO: use integer instead of percentage (or even better, have both)
234 use_batch = batch_size > 0.0 && batch_size < 1.0;
235}
236
238map<string, State> Dataset::make_features(const ArrayXXf& X,
239 const map<string,State>& Z,
240 const vector<string>& vn
241 )
242{
243 // fmt::print("Dataset::make_features()\n");
244 map<string, State> tmp_features;
245 vector<string> var_names;
246 // fmt::print("vn: {}\n",vn);
247 // check variable names
248 if (vn.empty())
249 {
250 // fmt::print("vn empty\n");
251 for (int i = 0; i < X.cols(); ++i)
252 {
253 string v = "x_"+to_string(i);
254 var_names.push_back(v);
255 }
256 }
257 else
258 {
259 if (vn.size() != X.cols())
261 fmt::format("Variable names and data size mismatch: "
262 "{} variable names and {} features in X",
263 vn.size(),
264 X.cols()
265 )
266 );
267 var_names = vn;
268 }
269
270 for (int i = 0; i < X.cols(); ++i)
271 {
272 // fmt::print("X({}): {} \n",i,var_names.at(i));
273 State tmp = check_type(X.col(i).array());
274
276 }
277 // fmt::print("tmp_features insert\n");
278 tmp_features.insert(Z.begin(), Z.end());
279 return tmp_features;
280};
281
283map<string,State> Dataset::copy_and_make_features(const ArrayXXf& X,
284 const Dataset& ref_dataset,
285 const vector<string>& vn
286 )
287{
288 vector<string> var_names;
289 if (vn.empty())
290 {
291 for (int i = 0; i < X.cols(); ++i)
292 {
293 string v = "x_"+to_string(i);
294 var_names.push_back(v);
295 }
296 }
297 else
298 {
299 if (vn.size() != X.cols())
301 fmt::format("Variable names and data size mismatch: "
302 "{} variable names and {} features in X",
303 vn.size(),
304 X.cols()
305 )
306 );
307 var_names = vn;
308 }
309
310 if (ref_dataset.features.size() != var_names.size())
312 fmt::format("Reference dataset with incompatible number of variables: "
313 "Reference has {} variable names, but X has {}",
314 ref_dataset.features.size(),
315 var_names.size()
316 )
317 );
318
319 map<string, State> tmp_features;
320 for (int i = 0; i < X.cols(); ++i)
321 {
323 X.col(i).array(),
324 ref_dataset.features.at(var_names.at(i))
325 );
326
328 }
329
330 return tmp_features;
331};
332
333ostream& operator<<(ostream& os, DataType dt)
334{
335 os << DataTypeName[dt];
336 return os;
337}
338
339} // data
340} // Brush
void bind_engine(py::module &m, string name)
holds variable type data.
Definition data.h:51
bool classification
whether this is a classification problem
Definition data.h:79
Dataset get_validation_data() const
Definition data.cpp:174
std::map< string, State > features
dataset features, as key value pairs
Definition data.h:71
int get_n_samples() const
Definition data.h:209
vector< size_t > training_data_idx
Definition data.h:57
Dataset get_batch() const
select random subset of data for training weights.
Definition data.cpp:146
std::vector< DataType > feature_types
types of data in the features.
Definition data.h:65
std::unordered_map< DataType, vector< string > > features_of_type
map from data types to features having that type.
Definition data.h:68
float batch_size
percentage of training data size to use in each batch. if 1.0, then all data is used
Definition data.h:87
bool use_validation
Definition data.h:84
std::vector< DataType > unique_data_types
keeps track of the unique data types in the dataset.
Definition data.h:62
map< string, State > copy_and_make_features(const ArrayXXf &X, const Dataset &ref_dataset, const vector< string > &vn={})
turns input into a feature map, with feature types copied from a reference
Definition data.cpp:283
Dataset(std::map< string, State > &d, const Ref< const ArrayXf > &y_=ArrayXf(), bool c=false, float validation_size=0.0, float batch_size=1.0)
Definition data.h:110
void init()
call init at the end of constructors to define metafeatures of the data.
Definition data.cpp:178
float validation_size
percentage of original data used for train. if 0.0, then all data is used for train and validation
Definition data.h:83
map< string, State > make_features(const ArrayXXf &X, const map< string, State > &Z={}, const vector< string > &vn={})
turns input data into a feature map
Definition data.cpp:238
vector< size_t > validation_data_idx
Definition data.h:58
ArrayXf y
length N array, the target label
Definition data.h:76
float get_batch_size()
Definition data.cpp:231
std::array< Dataset, 2 > split(const ArrayXb &mask) const
Definition data.cpp:162
Dataset get_training_data() const
Definition data.cpp:173
void set_batch_size(float new_size)
Definition data.cpp:232
Dataset operator()(const vector< size_t > &idx) const
return a slice of the data using indices idx
Definition data.cpp:117
vector< size_t > shuffled_index(size_t n)
returns a shuffled index vector of length n
Definition rnd.cpp:121
#define HANDLE_ERROR_THROW(err)
Definition error.h:27
std::vector< DataType > StateTypes
Definition data.cpp:51
State check_type(const ArrayXf &x)
determines data types of columns of matrix X.
Definition data.cpp:68
TimeSeries< bool > TimeSeriesb
TimeSeries convenience typedefs.
Definition types.h:110
State cast_type(const ArrayXf &x, const StateRef &x_ref)
Definition data.cpp:106
TimeSeries< float > TimeSeriesf
Definition types.h:112
DataType StateType(const State &arg)
Definition data.cpp:64
ostream & operator<<(ostream &os, DataType dt)
Definition data.cpp:333
std::variant< ArrayXb, ArrayXi, ArrayXf, ArrayXXb, ArrayXXi, ArrayXXf, TimeSeriesb, TimeSeriesi, TimeSeriesf, ArrayXbJet, ArrayXiJet, ArrayXfJet, ArrayXXbJet, ArrayXXiJet, ArrayXXfJet, TimeSeriesbJet, TimeSeriesiJet, TimeSeriesfJet > State
defines the possible types of data flowing thru nodes.
Definition types.h:140
TimeSeries< int > TimeSeriesi
Definition types.h:111
namespace containing various utility functions
Definition error.cpp:11
static map< V, K > reverse_map(const map< K, V > &m)
Given a map from keys to values, creates a new map from values to keys.
Definition utils.h:738
string to_string(const T &value)
template function to convert objects to string for logging
Definition utils.h:369
void unique_insert(Vector &v, const T &t)
unique insertion into a vector. allows a vector to be used like a set. source: http://www....
Definition utils.h:670
static Rnd & r
Definition rnd.h:174
vector< size_t > mask_to_index(const ArrayXb &mask)
convert a boolean mask to an index array
Definition utils.cpp:409
< nsga2 selection operator for getting the front
Definition data.cpp:12
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition types.h:39
Eigen::Array< int, Eigen::Dynamic, Eigen::Dynamic > ArrayXXi
Definition types.h:42
DataType
data types.
Definition types.h:143
Eigen::Array< bool, Eigen::Dynamic, Eigen::Dynamic > ArrayXXb
Definition types.h:41
map< DataType, string > DataTypeName
Definition data.cpp:14
const map< DataType, std::type_index > DataTypeID
Definition data.cpp:36
Eigen::Array< int, Eigen::Dynamic, 1 > ArrayXi
Definition types.h:40
map< std::type_index, DataType > DataIDType
Definition data.cpp:47
map< string, DataType > DataNameType
Definition data.cpp:34