Brush C++ API
A flexible interpretable machine learning framework
Loading...
Searching...
No Matches
data.h
Go to the documentation of this file.
1/* Brush
2copyright 2020 William La Cava
3license: GNU/GPL v3
4*/
5
6#ifndef DATA_H
7#define DATA_H
8
9// internal includes
10#include "../init.h"
11#include "../util/utils.h"
12#include "../util/error.h"
13#include "../util/logger.h"
14#include "../util/rnd.h"
15#include "timeseries.h"
16//external includes
17#include <variant>
18#include <optional>
19
20namespace Brush
21{
22
23extern map<DataType,string> DataTypeName;
24extern map<string,DataType> DataNameType;
25ostream& operator<<(ostream& os, DataType n);
26
27
28namespace Data
29{
34
35
37State check_type(const ArrayXf& x, const string t);
38DataType StateType(const State& arg);
39
40template<typename StateRef>
41State cast_type(const ArrayXf& x, const StateRef& x_ref);
42
44
50class Dataset
51{
52 //TODO: make this a json object that allows elements to be fetched by
53 //name
54 //Dataset(ArrayXXf& X, ArrayXf& y, std::map<string,
55 //std::pair<vector<ArrayXf>, vector<ArrayXf>>>& Z): X(X), y(y), Z(Z){}
56 private:
57 vector<size_t> training_data_idx;
58 vector<size_t> validation_data_idx;
60 vector<string> feature_name_order_;
61
62 public:
64 std::vector<DataType> unique_data_types;
65
67 std::vector<DataType> feature_types;
68
70 std::vector<string> feature_names;
71
73 std::unordered_map<DataType,vector<string>> features_of_type;
74
76 std::map<string, State> features;
77
78 // TODO: this should probably be a more complex type to include feature type
79 // and potentially other info, like arbitrary relations between features
80
82 ArrayXf y;
83
86 std::optional<std::reference_wrapper<const ArrayXXf>> Xref;
87
92
96
97 Dataset operator()(const vector<size_t>& idx) const;
98
101 void init();
102
104 map<string,State> make_features(const ArrayXXf& X,
105 const map<string, State>& Z = {},
106 const vector<string>& vn = {},
107 const vector<string>& ft = {}
108 );
109
111 map<string,State> copy_and_make_features(const ArrayXXf& X,
112 const Dataset& ref_dataset,
113 const vector<string>& vn = {}
114 );
115
117 Dataset(std::map<string, State>& d,
118 const Ref<const ArrayXf>& y_ = ArrayXf(),
119 bool c = false,
120 float validation_size = 0.0,
121 float batch_size = 1.0,
122 bool shuffle_split = false
123 )
124 : features(d)
125 , y(y_)
126 , classification(c)
130 , use_batch(batch_size > 0.0 && batch_size < 1.0)
132 {init();};
133
135 Dataset(const ArrayXXf& X,
136 const Ref<const ArrayXf>& y_ = ArrayXf(),
137 const vector<string>& vn = {},
138 const map<string, State>& Z = {},
139 const vector<string>& ft = {},
140 bool c = false,
141 float validation_size = 0.0,
142 float batch_size = 1.0,
143 bool shuffle_split = false
144 )
145 : features(make_features(X,Z,vn,ft))
146 , y(y_)
147 , classification(c)
151 , use_batch(batch_size > 0.0 && batch_size < 1.0)
153 {
154 init();
155 Xref = optional<reference_wrapper<const ArrayXXf>>{X};
156 }
157
159 Dataset(const ArrayXXf& X, const vector<string>& vn,
160 const vector<string>& ft = {},
161 bool c = false,
162 float validation_size = 0.0,
163 float batch_size = 1.0,
164 bool shuffle_split = false
165 )
166 : classification(c)
167 , features(make_features(X,map<string, State>{},vn,ft))
171 , use_batch(batch_size > 0.0 && batch_size < 1.0)
173 {
174 init();
175 Xref = optional<reference_wrapper<const ArrayXXf>>{X};
176 }
177
181 Dataset(const ArrayXXf& X, const Dataset& ref_dataset,
182 const vector<string>& vn
183 )
184 : classification(ref_dataset.classification)
185 , features(copy_and_make_features(X,ref_dataset,vn))
186 , validation_size(0.0)
187 , use_validation(false)
188 , batch_size(1.0)
189 , use_batch(false)
190 , shuffle_split(false)
191 {
192 init();
193 Xref = optional<reference_wrapper<const ArrayXXf>>{X};
194 }
195
196 void print() const
197 {
198 fmt::print("Dataset contains {} samples and {} features\n",
200 );
201 for (auto& [key, value] : this->features)
202 {
203 if (std::holds_alternative<ArrayXf>(value))
204 fmt::print("{} <ArrayXf>: {}\n", key, std::get<ArrayXf>(value));
205 else if (std::holds_alternative<ArrayXi>(value))
206 fmt::print("{} <ArrayXi>: {}\n", key, std::get<ArrayXi>(value));
207 else if (std::holds_alternative<ArrayXb>(value))
208 fmt::print("{} <ArrayXb>: {}\n", key, std::get<ArrayXb>(value));
209 }
210
211 };
212 auto get_X() const
213 {
214 if (!Xref.has_value())
215 HANDLE_ERROR_THROW("Dataset does not hold a reference to X.");
216 return this->Xref.value().get();
217 }
218
219 // inner partition of original dataset for train and validation.
220 // if split is not set, then training = validation.
223 vector<string> get_feature_types() const;
224
225 inline int get_n_samples() const {
226 return std::visit(
227 [&](auto&& arg) -> int { return int(arg.size());},
228 features.begin()->second
229 );
230 };
231 inline int get_n_features() const { return this->features.size(); };
233 Dataset get_batch() const;
234
235 float get_batch_size();
236 void set_batch_size(float new_size);
237
238 DataType get_feature_type(const string& name) const
239 {
240 DataType feature_type = DataType::ArrayF;
241
242 bool has_feature = false;
243 for (auto& [ftype, names] : features_of_type){
244 auto it = std::find(names.begin(), names.end(), name);
245
246 if (it != names.end())
247 {
248 feature_type = ftype;
249 has_feature = true;
250 break;
251 }
252 }
253
254 if (!has_feature)
255 HANDLE_ERROR_THROW(fmt::format("Couldn't find feature {} in data\n",name));
256
257 return feature_type;
258 };
259
260 std::array<Dataset, 2> split(const ArrayXb& mask) const;
261
262 State operator[](std::string name) const
263 {
264 if (this->features.find(name) == features.end())
265 HANDLE_ERROR_THROW(fmt::format("Couldn't find feature {} in data\n",name));
266 return this->features.at(name);
267 };
268
269 /* template<> ArrayXb get<ArrayXb>(std::string name) */
270}; // class Dataset
271
272// TODO: serialization of features is needed in order for nlohmann to work
273// NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Dataset,
274// features,
275// y,
276// classification,
277// validation_size,
278// use_validation,
279// batch_size,
280// use_batch,
281// shuffle_split
282// );
283
284// // read csv
285// Dataset read_csv(const std::string & path, MatrixXf& X, VectorXf& y,
286// vector<string>& names, vector<char> &dtypes, bool& binary_endpoint, char sep) ;
287
288} // data
289
290extern const map<DataType,std::type_index> DataTypeID;
291extern map<std::type_index,DataType> DataIDType;
292
293} // Brush
294
295// format overload for DataType
296template <> struct fmt::formatter<Brush::DataType>: formatter<string_view> {
297 template <typename FormatContext>
298 auto format(Brush::DataType x, FormatContext& ctx) const {
299 return formatter<string_view>::format(Brush::DataTypeName.at(x), ctx);
300 }
301};
302
303// TODO: fmt overload for Data
304// template <> struct fmt::formatter<Brush::Data::Dataset>: formatter<string_view> {
305// template <typename FormatContext>
306// auto format(Brush::Data::Dataset& x, FormatContext& ctx) const {
307 // return formatter<string_view>::format(Brush::DataTypeName.at(x), ctx);
308// }
309// };
310
311#endif
holds variable type data.
Definition data.h:51
Dataset(const ArrayXXf &X, const vector< string > &vn, const vector< string > &ft={}, bool c=false, float validation_size=0.0, float batch_size=1.0, bool shuffle_split=false)
Definition data.h:159
bool classification
whether this is a classification problem
Definition data.h:85
vector< string > feature_name_order_
stores the original feature name order before map sorting
Definition data.h:60
Dataset get_validation_data() const
Definition data.cpp:215
std::map< string, State > features
dataset features, as key value pairs
Definition data.h:76
DataType get_feature_type(const string &name) const
Definition data.h:238
int get_n_samples() const
Definition data.h:225
Dataset(std::map< string, State > &d, const Ref< const ArrayXf > &y_=ArrayXf(), bool c=false, float validation_size=0.0, float batch_size=1.0, bool shuffle_split=false)
Definition data.h:117
vector< size_t > training_data_idx
Definition data.h:57
auto get_X() const
Definition data.h:212
Dataset get_batch() const
select random subset of data for training weights.
Definition data.cpp:187
vector< string > get_feature_types() const
Definition data.cpp:217
std::vector< DataType > feature_types
types of data in the features.
Definition data.h:67
void print() const
Definition data.h:196
std::unordered_map< DataType, vector< string > > features_of_type
map from data types to features having that type.
Definition data.h:73
float batch_size
fraction of the training data to use in each batch. Batching is only enabled for values strictly between 0.0 and 1.0; if 1.0, then all data is used
Definition data.h:94
bool use_validation
Definition data.h:90
int get_n_features() const
Definition data.h:231
Dataset(const ArrayXXf &X, const Dataset &ref_dataset, const vector< string > &vn)
Definition data.h:181
State operator[](std::string name) const
Definition data.h:262
std::optional< std::reference_wrapper< const ArrayXXf > > Xref
Definition data.h:86
std::vector< string > feature_names
names of the feature types as string representations.
Definition data.h:70
std::vector< DataType > unique_data_types
keeps track of the unique data types in the dataset.
Definition data.h:64
map< string, State > copy_and_make_features(const ArrayXXf &X, const Dataset &ref_dataset, const vector< string > &vn={})
turns input into a feature map, with feature types copied from a reference dataset
Definition data.cpp:462
map< string, State > make_features(const ArrayXXf &X, const map< string, State > &Z={}, const vector< string > &vn={}, const vector< string > &ft={})
turns input data into a feature map
Definition data.cpp:391
void init()
call init at the end of constructors to define metafeatures of the data.
Definition data.cpp:250
float validation_size
percentage of original data used for training. If 0.0, then all data is used for both training and validation
Definition data.h:89
vector< size_t > validation_data_idx
Definition data.h:58
Dataset(const ArrayXXf &X, const Ref< const ArrayXf > &y_=ArrayXf(), const vector< string > &vn={}, const map< string, State > &Z={}, const vector< string > &ft={}, bool c=false, float validation_size=0.0, float batch_size=1.0, bool shuffle_split=false)
Definition data.h:135
ArrayXf y
length N array, the target label
Definition data.h:82
float get_batch_size()
Definition data.cpp:384
std::array< Dataset, 2 > split(const ArrayXb &mask) const
Definition data.cpp:203
Dataset get_training_data() const
Definition data.cpp:214
void set_batch_size(float new_size)
Definition data.cpp:385
Dataset operator()(const vector< size_t > &idx) const
return a slice of the data using indices idx
Definition data.cpp:139
holds variable type data.
Definition data.h:51
#define HANDLE_ERROR_THROW(err)
Definition error.h:27
namespace containing Data structures used in Brush
Definition data.cpp:49
State cast_type(const ArrayXf &x, const StateRef &x_ref)
Definition data.cpp:128
DataType StateType(const State &arg)
Definition data.cpp:67
State check_type(const ArrayXf &x, const string t)
determines data types of columns of matrix X.
Definition data.cpp:71
std::variant< ArrayXb, ArrayXi, ArrayXf, ArrayXXb, ArrayXXi, ArrayXXf, TimeSeriesb, TimeSeriesi, TimeSeriesf, ArrayXbJet, ArrayXiJet, ArrayXfJet, ArrayXXbJet, ArrayXXiJet, ArrayXXfJet, TimeSeriesbJet, TimeSeriesiJet, TimeSeriesfJet > State
defines the possible types of data flowing through nodes.
Definition types.h:140
< nsga2 selection operator for getting the front
Definition bandit.cpp:4
ostream & operator<<(ostream &os, DataType n)
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition types.h:39
DataType
data types.
Definition types.h:143
map< DataType, string > DataTypeName
Definition data.cpp:14
const map< DataType, std::type_index > DataTypeID
Definition data.cpp:36
map< std::type_index, DataType > DataIDType
Definition data.cpp:47
map< string, DataType > DataNameType
Definition data.cpp:34
DataType
data types.
Definition types.h:143
std::variant< ArrayXb, ArrayXi, ArrayXf, ArrayXXb, ArrayXXi, ArrayXXf, TimeSeriesb, TimeSeriesi, TimeSeriesf, ArrayXbJet, ArrayXiJet, ArrayXfJet, ArrayXXbJet, ArrayXXiJet, ArrayXXfJet, TimeSeriesbJet, TimeSeriesiJet, TimeSeriesfJet > State
defines the possible types of data flowing through nodes.
Definition types.h:140
auto format(Brush::DataType x, FormatContext &ctx) const
Definition data.h:298