Brush C++ API
A flexible interpretable machine learning framework
Loading...
Searching...
No Matches
data.h
Go to the documentation of this file.
1/* Brush
2copyright 2020 William La Cava
3license: GNU/GPL v3
4*/
5
6#ifndef DATA_H
7#define DATA_H
8
9// internal includes
10#include "../init.h"
11#include "../util/utils.h"
12#include "../util/error.h"
13#include "../util/logger.h"
14#include "../util/rnd.h"
15#include "timeseries.h"
16//external includes
17#include <variant>
18#include <optional>
19
20namespace Brush
21{
22
23extern map<DataType,string> DataTypeName;
24extern map<string,DataType> DataNameType;
25ostream& operator<<(ostream& os, DataType n);
26
27
28namespace Data
29{
34
35
37State check_type(const ArrayXf& x, const string t);
38DataType StateType(const State& arg);
39
40template<typename StateRef>
41State cast_type(const ArrayXf& x, const StateRef& x_ref);
42
44
50class Dataset
51{
52 //TODO: make this a json object that allows elements to be fetched by
53 //name
54 //Dataset(ArrayXXf& X, ArrayXf& y, std::map<string,
55 //std::pair<vector<ArrayXf>, vector<ArrayXf>>>& Z): X(X), y(y), Z(Z){}
56 private:
57 vector<size_t> training_data_idx;
58 vector<size_t> validation_data_idx;
59
60 public:
62 std::vector<DataType> unique_data_types;
63
65 std::vector<DataType> feature_types;
66
68 std::vector<string> feature_names; // TODO: remove?
69
71 std::unordered_map<DataType,vector<string>> features_of_type;
72
74 std::map<string, State> features;
75
76 // TODO: this should probably be a more complex type to include feature type
77 // and potentially other info, like arbitrary relations between features
78
80 ArrayXf y;
81
84 std::optional<std::reference_wrapper<const ArrayXXf>> Xref;
85
90
94
95 Dataset operator()(const vector<size_t>& idx) const;
96
99 void init();
100
102 map<string,State> make_features(const ArrayXXf& X,
103 const map<string, State>& Z = {},
104 const vector<string>& vn = {},
105 const vector<string>& ft = {}
106 );
107
109 map<string,State> copy_and_make_features(const ArrayXXf& X,
110 const Dataset& ref_dataset,
111 const vector<string>& vn = {}
112 );
113
115 Dataset(std::map<string, State>& d,
116 const Ref<const ArrayXf>& y_ = ArrayXf(),
117 bool c = false,
118 float validation_size = 0.0,
119 float batch_size = 1.0,
120 bool shuffle_split = false
121 )
122 : features(d)
123 , y(y_)
124 , classification(c)
128 , use_batch(batch_size > 0.0 && batch_size < 1.0)
130 {init();};
131
133 Dataset(const ArrayXXf& X,
134 const Ref<const ArrayXf>& y_ = ArrayXf(),
135 const vector<string>& vn = {},
136 const map<string, State>& Z = {},
137 const vector<string>& ft = {},
138 bool c = false,
139 float validation_size = 0.0,
140 float batch_size = 1.0,
141 bool shuffle_split = false
142 )
143 : features(make_features(X,Z,vn,ft))
144 , y(y_)
145 , classification(c)
149 , use_batch(batch_size > 0.0 && batch_size < 1.0)
151 {
152 init();
153 Xref = optional<reference_wrapper<const ArrayXXf>>{X};
154 }
155
157 Dataset(const ArrayXXf& X, const vector<string>& vn,
158 const vector<string>& ft = {},
159 bool c = false,
160 float validation_size = 0.0,
161 float batch_size = 1.0,
162 bool shuffle_split = false
163 )
164 : classification(c)
165 , features(make_features(X,map<string, State>{},vn,ft))
169 , use_batch(batch_size > 0.0 && batch_size < 1.0)
171 {
172 init();
173 Xref = optional<reference_wrapper<const ArrayXXf>>{X};
174 }
175
179 Dataset(const ArrayXXf& X, const Dataset& ref_dataset,
180 const vector<string>& vn
181 )
182 : classification(ref_dataset.classification)
183 , features(copy_and_make_features(X,ref_dataset,vn))
184 , validation_size(0.0)
185 , use_validation(false)
186 , batch_size(1.0)
187 , use_batch(false)
188 , shuffle_split(false)
189 {
190 init();
191 Xref = optional<reference_wrapper<const ArrayXXf>>{X};
192 }
193
194 void print() const
195 {
196 fmt::print("Dataset contains {} samples and {} features\n",
198 );
199 for (auto& [key, value] : this->features)
200 {
201 if (std::holds_alternative<ArrayXf>(value))
202 fmt::print("{} <ArrayXf>: {}\n", key, std::get<ArrayXf>(value));
203 else if (std::holds_alternative<ArrayXi>(value))
204 fmt::print("{} <ArrayXi>: {}\n", key, std::get<ArrayXi>(value));
205 else if (std::holds_alternative<ArrayXb>(value))
206 fmt::print("{} <ArrayXb>: {}\n", key, std::get<ArrayXb>(value));
207 }
208
209 };
210 auto get_X() const
211 {
212 if (!Xref.has_value())
213 HANDLE_ERROR_THROW("Dataset does not hold a reference to X.");
214 return this->Xref.value().get();
215 }
216
217 // inner partition of original dataset for train and validation.
218 // if split is not set, then training = validation.
221
222 inline int get_n_samples() const {
223 return std::visit(
224 [&](auto&& arg) -> int { return int(arg.size());},
225 features.begin()->second
226 );
227 };
228 inline int get_n_features() const { return this->features.size(); };
230 Dataset get_batch() const;
231
232 float get_batch_size();
233 void set_batch_size(float new_size);
234
235 std::array<Dataset, 2> split(const ArrayXb& mask) const;
236
237 State operator[](std::string name) const
238 {
239 if (this->features.find(name) == features.end())
240 HANDLE_ERROR_THROW(fmt::format("Couldn't find feature {} in data\n",name));
241 return this->features.at(name);
242 };
243
244 /* template<> ArrayXb get<ArrayXb>(std::string name) */
245}; // class Dataset
246
247// TODO: serialization of features is needed in order for nlohmann to work
248// NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Dataset,
249// features,
250// y,
251// classification,
252// validation_size,
253// use_validation,
254// batch_size,
255// use_batch,
256// shuffle_split
257// );
258
259// // read csv
260// Dataset read_csv(const std::string & path, MatrixXf& X, VectorXf& y,
261// vector<string>& names, vector<char> &dtypes, bool& binary_endpoint, char sep) ;
262
263} // data
264
265extern const map<DataType,std::type_index> DataTypeID;
266extern map<std::type_index,DataType> DataIDType;
267
268} // Brush
269
270// format overload for DataType
271template <> struct fmt::formatter<Brush::DataType>: formatter<string_view> {
272 template <typename FormatContext>
273 auto format(Brush::DataType x, FormatContext& ctx) const {
274 return formatter<string_view>::format(Brush::DataTypeName.at(x), ctx);
275 }
276};
277
278// TODO: fmt overload for Data
279// template <> struct fmt::formatter<Brush::Data::Dataset>: formatter<string_view> {
280// template <typename FormatContext>
281// auto format(Brush::Data::Dataset& x, FormatContext& ctx) const {
282 // return formatter<string_view>::format(Brush::DataTypeName.at(x), ctx);
283// }
284// };
285
286#endif
holds variable type data.
Definition data.h:51
Dataset(const ArrayXXf &X, const vector< string > &vn, const vector< string > &ft={}, bool c=false, float validation_size=0.0, float batch_size=1.0, bool shuffle_split=false)
Definition data.h:157
bool classification
whether this is a classification problem
Definition data.h:83
Dataset get_validation_data() const
Definition data.cpp:199
std::map< string, State > features
dataset features, as key value pairs
Definition data.h:74
int get_n_samples() const
Definition data.h:222
Dataset(std::map< string, State > &d, const Ref< const ArrayXf > &y_=ArrayXf(), bool c=false, float validation_size=0.0, float batch_size=1.0, bool shuffle_split=false)
Definition data.h:115
vector< size_t > training_data_idx
Definition data.h:57
auto get_X() const
Definition data.h:210
Dataset get_batch() const
select random subset of data for training weights.
Definition data.cpp:171
std::vector< DataType > feature_types
types of data in the features.
Definition data.h:65
void print() const
Definition data.h:194
std::unordered_map< DataType, vector< string > > features_of_type
map from data types to features having that type.
Definition data.h:71
float batch_size
percentage of the training data to use in each batch. If 1.0, then all data is used.
Definition data.h:92
bool use_validation
Definition data.h:88
int get_n_features() const
Definition data.h:228
Dataset(const ArrayXXf &X, const Dataset &ref_dataset, const vector< string > &vn)
Definition data.h:179
State operator[](std::string name) const
Definition data.h:237
std::optional< std::reference_wrapper< const ArrayXXf > > Xref
Definition data.h:84
std::vector< string > feature_names
names of the feature types as string representations.
Definition data.h:68
std::vector< DataType > unique_data_types
keeps track of the unique data types in the dataset.
Definition data.h:62
map< string, State > copy_and_make_features(const ArrayXXf &X, const Dataset &ref_dataset, const vector< string > &vn={})
turns input into a feature map, with feature types copied from a reference
Definition data.cpp:391
map< string, State > make_features(const ArrayXXf &X, const map< string, State > &Z={}, const vector< string > &vn={}, const vector< string > &ft={})
turns input data into a feature map
Definition data.cpp:327
void init()
call init at the end of constructors to define metafeatures of the data.
Definition data.cpp:203
float validation_size
percentage of original data used for training. If 0.0, then all data is used for both training and validation.
Definition data.h:87
vector< size_t > validation_data_idx
Definition data.h:58
Dataset(const ArrayXXf &X, const Ref< const ArrayXf > &y_=ArrayXf(), const vector< string > &vn={}, const map< string, State > &Z={}, const vector< string > &ft={}, bool c=false, float validation_size=0.0, float batch_size=1.0, bool shuffle_split=false)
Definition data.h:133
ArrayXf y
length N array, the target label
Definition data.h:80
float get_batch_size()
Definition data.cpp:320
std::array< Dataset, 2 > split(const ArrayXb &mask) const
Definition data.cpp:187
Dataset get_training_data() const
Definition data.cpp:198
void set_batch_size(float new_size)
Definition data.cpp:321
Dataset operator()(const vector< size_t > &idx) const
return a slice of the data using indices idx
Definition data.cpp:140
holds variable type data.
Definition data.h:51
#define HANDLE_ERROR_THROW(err)
Definition error.h:27
namespace containing Data structures used in Brush
Definition data.cpp:49
State cast_type(const ArrayXf &x, const StateRef &x_ref)
Definition data.cpp:129
DataType StateType(const State &arg)
Definition data.cpp:67
State check_type(const ArrayXf &x, const string t)
determines data types of columns of matrix X.
Definition data.cpp:71
std::variant< ArrayXb, ArrayXi, ArrayXf, ArrayXXb, ArrayXXi, ArrayXXf, TimeSeriesb, TimeSeriesi, TimeSeriesf, ArrayXbJet, ArrayXiJet, ArrayXfJet, ArrayXXbJet, ArrayXXiJet, ArrayXXfJet, TimeSeriesbJet, TimeSeriesiJet, TimeSeriesfJet > State
defines the possible types of data flowing thru nodes.
Definition types.h:140
nsga2 selection operator for getting the front
Definition bandit.cpp:4
ostream & operator<<(ostream &os, DataType n)
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition types.h:39
DataType
data types.
Definition types.h:143
map< DataType, string > DataTypeName
Definition data.cpp:14
const map< DataType, std::type_index > DataTypeID
Definition data.cpp:36
map< std::type_index, DataType > DataIDType
Definition data.cpp:47
map< string, DataType > DataNameType
Definition data.cpp:34
DataType
data types.
Definition types.h:143
std::variant< ArrayXb, ArrayXi, ArrayXf, ArrayXXb, ArrayXXi, ArrayXXf, TimeSeriesb, TimeSeriesi, TimeSeriesf, ArrayXbJet, ArrayXiJet, ArrayXfJet, ArrayXXbJet, ArrayXXiJet, ArrayXXfJet, TimeSeriesbJet, TimeSeriesiJet, TimeSeriesfJet > State
defines the possible types of data flowing thru nodes.
Definition types.h:140
auto format(Brush::DataType x, FormatContext &ctx) const
Definition data.h:273