Brush C++ API
A flexible interpretable machine learning framework
Loading...
Searching...
No Matches
data.h
Go to the documentation of this file.
1/* Brush
2copyright 2020 William La Cava
3license: GNU/GPL v3
4*/
5
6#ifndef DATA_H
7#define DATA_H
8
9// internal includes
10#include "../init.h"
11#include "../util/utils.h"
12#include "../util/error.h"
13#include "../util/logger.h"
14#include "../util/rnd.h"
15#include "timeseries.h"
16//external includes
17#include <variant>
18#include <optional>
19
20namespace Brush
21{
22
23extern map<DataType,string> DataTypeName;
24extern map<string,DataType> DataNameType;
25ostream& operator<<(ostream& os, DataType n);
26
27
28namespace Data
29{
34
35
37State check_type(const ArrayXf& x, const string t);
38DataType StateType(const State& arg);
39
40template<typename StateRef>
41State cast_type(const ArrayXf& x, const StateRef& x_ref);
42
44
50class Dataset
51{
52 //TODO: make this a json object that allows elements to be fetched by
53 //name
54 //Dataset(ArrayXXf& X, ArrayXf& y, std::map<string,
55 //std::pair<vector<ArrayXf>, vector<ArrayXf>>>& Z): X(X), y(y), Z(Z){}
56 private:
57 vector<size_t> training_data_idx;
58 vector<size_t> validation_data_idx;
59
60 public:
62 std::vector<DataType> unique_data_types;
63
65 std::vector<DataType> feature_types;
66
68 std::vector<string> feature_names; // TODO: remove?
69
71 std::unordered_map<DataType,vector<string>> features_of_type;
72
74 std::map<string, State> features;
75
76 // TODO: this should probably be a more complex type to include feature type
77 // and potentially other info, like arbitrary relations between features
78
80 ArrayXf y;
81
84 std::optional<std::reference_wrapper<const ArrayXXf>> Xref;
85
90
94
95 Dataset operator()(const vector<size_t>& idx) const;
96
99 void init();
100
102 map<string,State> make_features(const ArrayXXf& X,
103 const map<string, State>& Z = {},
104 const vector<string>& vn = {},
105 const vector<string>& ft = {}
106 );
107
109 map<string,State> copy_and_make_features(const ArrayXXf& X,
110 const Dataset& ref_dataset,
111 const vector<string>& vn = {}
112 );
113
115 Dataset(std::map<string, State>& d,
116 const Ref<const ArrayXf>& y_ = ArrayXf(),
117 bool c = false,
118 float validation_size = 0.0,
119 float batch_size = 1.0,
120 bool shuffle_split = false
121 )
122 : features(d)
123 , y(y_)
124 , classification(c)
128 , use_batch(batch_size > 0.0 && batch_size < 1.0)
130 {init();};
131
133 Dataset(const ArrayXXf& X,
134 const Ref<const ArrayXf>& y_ = ArrayXf(),
135 const vector<string>& vn = {},
136 const map<string, State>& Z = {},
137 const vector<string>& ft = {},
138 bool c = false,
139 float validation_size = 0.0,
140 float batch_size = 1.0,
141 bool shuffle_split = false
142 )
143 : features(make_features(X,Z,vn,ft))
144 , y(y_)
145 , classification(c)
149 , use_batch(batch_size > 0.0 && batch_size < 1.0)
151 {
152 init();
153 Xref = optional<reference_wrapper<const ArrayXXf>>{X};
154 }
155
157 Dataset(const ArrayXXf& X, const vector<string>& vn,
158 const vector<string>& ft = {},
159 bool c = false,
160 float validation_size = 0.0,
161 float batch_size = 1.0,
162 bool shuffle_split = false
163 )
164 : classification(c)
165 , features(make_features(X,map<string, State>{},vn,ft))
169 , use_batch(batch_size > 0.0 && batch_size < 1.0)
171 {
172 init();
173 Xref = optional<reference_wrapper<const ArrayXXf>>{X};
174 }
175
179 Dataset(const ArrayXXf& X, const Dataset& ref_dataset,
180 const vector<string>& vn
181 )
182 : classification(ref_dataset.classification)
183 , features(copy_and_make_features(X,ref_dataset,vn))
184 , validation_size(0.0)
185 , use_validation(false)
186 , batch_size(1.0)
187 , use_batch(false)
188 , shuffle_split(false)
189 {
190 init();
191 Xref = optional<reference_wrapper<const ArrayXXf>>{X};
192 }
193
194 void print() const
195 {
196 fmt::print("Dataset contains {} samples and {} features\n",
198 );
199 for (auto& [key, value] : this->features)
200 {
201 if (std::holds_alternative<ArrayXf>(value))
202 fmt::print("{} <ArrayXf>: {}\n", key, std::get<ArrayXf>(value));
203 else if (std::holds_alternative<ArrayXi>(value))
204 fmt::print("{} <ArrayXi>: {}\n", key, std::get<ArrayXi>(value));
205 else if (std::holds_alternative<ArrayXb>(value))
206 fmt::print("{} <ArrayXb>: {}\n", key, std::get<ArrayXb>(value));
207 }
208
209 };
210 auto get_X() const
211 {
212 if (!Xref.has_value())
213 HANDLE_ERROR_THROW("Dataset does not hold a reference to X.");
214 return this->Xref.value().get();
215 }
216
217 // inner partition of original dataset for train and validation.
218 // if split is not set, then training = validation.
221
222 inline int get_n_samples() const {
223 return std::visit(
224 [&](auto&& arg) -> int { return int(arg.size());},
225 features.begin()->second
226 );
227 };
228 inline int get_n_features() const { return this->features.size(); };
230 Dataset get_batch() const;
231
232 float get_batch_size();
233 void set_batch_size(float new_size);
234
235 DataType get_feature_type(const string& name) const
236 {
237 DataType feature_type = DataType::ArrayF;
238
239 bool has_feature = false;
240 for (auto& [ftype, names] : features_of_type){
241 auto it = std::find_if(names.begin(), names.end(),
242 [&](const auto& name){ return name == name; });
243
244 if (it != names.end())
245 {
246 feature_type = ftype;
247 has_feature = true;
248 break;
249 }
250 }
251
252 if (!has_feature)
253 HANDLE_ERROR_THROW(fmt::format("Couldn't find feature {} in data\n",name));
254
255 return feature_type;
256 };
257
258 std::array<Dataset, 2> split(const ArrayXb& mask) const;
259
260 State operator[](std::string name) const
261 {
262 if (this->features.find(name) == features.end())
263 HANDLE_ERROR_THROW(fmt::format("Couldn't find feature {} in data\n",name));
264 return this->features.at(name);
265 };
266
267 /* template<> ArrayXb get<ArrayXb>(std::string name) */
268}; // class data
269
270// TODO: serialize `features` so that the nlohmann macro below can work
271// NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Dataset,
272// features,
273// y,
274// classification,
275// validation_size,
276// use_validation,
277// batch_size,
278// use_batch,
279// shuffle_split
280// );
281
282// // read csv
283// Dataset read_csv(const std::string & path, MatrixXf& X, VectorXf& y,
284// vector<string>& names, vector<char> &dtypes, bool& binary_endpoint, char sep) ;
285
286} // data
287
288extern const map<DataType,std::type_index> DataTypeID;
289extern map<std::type_index,DataType> DataIDType;
290
291} // Brush
292
293// format overload for DataType
294template <> struct fmt::formatter<Brush::DataType>: formatter<string_view> {
295 template <typename FormatContext>
296 auto format(Brush::DataType x, FormatContext& ctx) const {
297 return formatter<string_view>::format(Brush::DataTypeName.at(x), ctx);
298 }
299};
300
301// TODO: fmt overload for Data
302// template <> struct fmt::formatter<Brush::Data::Dataset>: formatter<string_view> {
303// template <typename FormatContext>
304// auto format(Brush::Data::Dataset& x, FormatContext& ctx) const {
305 // return formatter<string_view>::format(Brush::DataTypeName.at(x), ctx);
306// }
307// };
308
309#endif
holds variable type data.
Definition data.h:51
Dataset(const ArrayXXf &X, const vector< string > &vn, const vector< string > &ft={}, bool c=false, float validation_size=0.0, float batch_size=1.0, bool shuffle_split=false)
Definition data.h:157
bool classification
whether this is a classification problem
Definition data.h:83
Dataset get_validation_data() const
Definition data.cpp:198
std::map< string, State > features
dataset features, as key value pairs
Definition data.h:74
DataType get_feature_type(const string &name) const
Definition data.h:235
int get_n_samples() const
Definition data.h:222
Dataset(std::map< string, State > &d, const Ref< const ArrayXf > &y_=ArrayXf(), bool c=false, float validation_size=0.0, float batch_size=1.0, bool shuffle_split=false)
Definition data.h:115
vector< size_t > training_data_idx
Definition data.h:57
auto get_X() const
Definition data.h:210
Dataset get_batch() const
select random subset of data for training weights.
Definition data.cpp:170
std::vector< DataType > feature_types
types of data in the features.
Definition data.h:65
void print() const
Definition data.h:194
std::unordered_map< DataType, vector< string > > features_of_type
map from data types to features having that type.
Definition data.h:71
float batch_size
percentage of training data size to use in each batch. if 1.0, then all data is used
Definition data.h:92
bool use_validation
Definition data.h:88
int get_n_features() const
Definition data.h:228
Dataset(const ArrayXXf &X, const Dataset &ref_dataset, const vector< string > &vn)
Definition data.h:179
State operator[](std::string name) const
Definition data.h:260
std::optional< std::reference_wrapper< const ArrayXXf > > Xref
Definition data.h:84
std::vector< string > feature_names
names of the feature types as string representations.
Definition data.h:68
std::vector< DataType > unique_data_types
keeps track of the unique data types in the dataset.
Definition data.h:62
map< string, State > copy_and_make_features(const ArrayXXf &X, const Dataset &ref_dataset, const vector< string > &vn={})
turns input into a feature map, with feature types copied from a reference
Definition data.cpp:391
map< string, State > make_features(const ArrayXXf &X, const map< string, State > &Z={}, const vector< string > &vn={}, const vector< string > &ft={})
turns input data into a feature map
Definition data.cpp:326
void init()
call init at the end of constructors to define metafeatures of the data.
Definition data.cpp:202
float validation_size
percentage of original data used for train. if 0.0, then all data is used for train and validation
Definition data.h:87
vector< size_t > validation_data_idx
Definition data.h:58
Dataset(const ArrayXXf &X, const Ref< const ArrayXf > &y_=ArrayXf(), const vector< string > &vn={}, const map< string, State > &Z={}, const vector< string > &ft={}, bool c=false, float validation_size=0.0, float batch_size=1.0, bool shuffle_split=false)
Definition data.h:133
ArrayXf y
length N array, the target label
Definition data.h:80
float get_batch_size()
Definition data.cpp:319
std::array< Dataset, 2 > split(const ArrayXb &mask) const
Definition data.cpp:186
Dataset get_training_data() const
Definition data.cpp:197
void set_batch_size(float new_size)
Definition data.cpp:320
Dataset operator()(const vector< size_t > &idx) const
return a slice of the data using indices idx
Definition data.cpp:139
holds variable type data.
Definition data.h:51
#define HANDLE_ERROR_THROW(err)
Definition error.h:27
namespace containing Data structures used in Brush
Definition data.cpp:49
State cast_type(const ArrayXf &x, const StateRef &x_ref)
Definition data.cpp:128
DataType StateType(const State &arg)
Definition data.cpp:67
State check_type(const ArrayXf &x, const string t)
determines data types of columns of matrix X.
Definition data.cpp:71
std::variant< ArrayXb, ArrayXi, ArrayXf, ArrayXXb, ArrayXXi, ArrayXXf, TimeSeriesb, TimeSeriesi, TimeSeriesf, ArrayXbJet, ArrayXiJet, ArrayXfJet, ArrayXXbJet, ArrayXXiJet, ArrayXXfJet, TimeSeriesbJet, TimeSeriesiJet, TimeSeriesfJet > State
defines the possible types of data flowing thru nodes.
Definition types.h:140
< nsga2 selection operator for getting the front
Definition bandit.cpp:4
ostream & operator<<(ostream &os, DataType n)
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition types.h:39
DataType
data types.
Definition types.h:143
map< DataType, string > DataTypeName
Definition data.cpp:14
const map< DataType, std::type_index > DataTypeID
Definition data.cpp:36
map< std::type_index, DataType > DataIDType
Definition data.cpp:47
map< string, DataType > DataNameType
Definition data.cpp:34
DataType
data types.
Definition types.h:143
std::variant< ArrayXb, ArrayXi, ArrayXf, ArrayXXb, ArrayXXi, ArrayXXf, TimeSeriesb, TimeSeriesi, TimeSeriesf, ArrayXbJet, ArrayXiJet, ArrayXfJet, ArrayXXbJet, ArrayXXiJet, ArrayXXfJet, TimeSeriesbJet, TimeSeriesiJet, TimeSeriesfJet > State
defines the possible types of data flowing thru nodes.
Definition types.h:140
auto format(Brush::DataType x, FormatContext &ctx) const
Definition data.h:296