Brush C++ API
A flexible interpretable machine learning framework
Loading...
Searching...
No Matches
data.h
Go to the documentation of this file.
1/* Brush
2copyright 2020 William La Cava
3license: GNU/GPL v3
4*/
5
6#ifndef DATA_H
7#define DATA_H
8
9// internal includes
10#include "../init.h"
11#include "../util/utils.h"
12#include "../util/error.h"
13#include "../util/logger.h"
14#include "../util/rnd.h"
15#include "timeseries.h"
16//external includes
17#include <variant>
18#include <optional>
19
20namespace Brush
21{
22
23extern map<DataType,string> DataTypeName;
24extern map<string,DataType> DataNameType;
25ostream& operator<<(ostream& os, DataType n);
26
27
28namespace Data
29{
37State check_type(const ArrayXf& x);
38DataType StateType(const State& arg);
39
40template<typename StateRef>
41State cast_type(const ArrayXf& x, const StateRef& x_ref);
42
44
50class Dataset
51{
52 //TODO: make this a json object that allows elements to be fetched by
53 //name
54 //Dataset(ArrayXXf& X, ArrayXf& y, std::map<string,
55 //std::pair<vector<ArrayXf>, vector<ArrayXf>>>& Z): X(X), y(y), Z(Z){}
56 private:
57 vector<size_t> training_data_idx;
58 vector<size_t> validation_data_idx;
59
60 public:
62 std::vector<DataType> unique_data_types;
63
65 std::vector<DataType> feature_types;
66
68 std::unordered_map<DataType,vector<string>> features_of_type;
69
71 std::map<string, State> features;
72 // TODO: this should probably be a more complex type to include feature type
73 // and potentially other info, like arbitrary relations between features
74
76 ArrayXf y;
77
80 std::optional<std::reference_wrapper<const ArrayXXf>> Xref;
81
84 bool use_validation; // TODO: shuffle before validation (this should be a parameter)
85
89
90 Dataset operator()(const vector<size_t>& idx) const;
93 void init();
94
96 map<string,State> make_features(const ArrayXXf& X,
97 const map<string, State>& Z = {},
98 const vector<string>& vn = {}
99 );
100
101 // TODO: let the user specify the datatypes
102
104 map<string,State> copy_and_make_features(const ArrayXXf& X,
105 const Dataset& ref_dataset,
106 const vector<string>& vn = {}
107 );
108
110 Dataset(std::map<string, State>& d,
111 const Ref<const ArrayXf>& y_ = ArrayXf(),
112 bool c = false,
113 float validation_size = 0.0,
114 float batch_size = 1.0
115 )
116 : features(d)
117 , y(y_)
118 , classification(c)
122 , use_batch(batch_size > 0.0 && batch_size < 1.0)
123 {init();};
124
126 Dataset(const ArrayXXf& X,
127 const Ref<const ArrayXf>& y_ = ArrayXf(),
128 const vector<string>& vn = {},
129 const map<string, State>& Z = {},
130 bool c = false,
131 float validation_size = 0.0,
132 float batch_size = 1.0
133 )
135 , y(y_)
140 , use_batch(batch_size > 0.0 && batch_size < 1.0)
141 {
142 init();
144 }
145
147 Dataset(const ArrayXXf& X, const vector<string>& vn,
148 bool c = false,
149 float validation_size = 0.0,
150 float batch_size = 1.0
151 )
153 , features(make_features(X,map<string, State>{},vn))
157 , use_batch(batch_size > 0.0 && batch_size < 1.0)
158 {
159 init();
161 }
162
166 Dataset(const ArrayXXf& X, const Dataset& ref_dataset,
167 const vector<string>& vn,
168 bool c = false
169 )
172 , validation_size(0.0)
174 , batch_size(1.0)
176 {
177 init();
179 }
180
181 void print() const
182 {
183 fmt::print("Dataset contains {} samples and {} features\n",
185 );
186 for (auto& [key, value] : this->features)
187 {
188 if (std::holds_alternative<ArrayXf>(value))
189 fmt::print("{} <ArrayXf>: {}\n", key, std::get<ArrayXf>(value));
190 else if (std::holds_alternative<ArrayXi>(value))
191 fmt::print("{} <ArrayXi>: {}\n", key, std::get<ArrayXi>(value));
192 else if (std::holds_alternative<ArrayXb>(value))
193 fmt::print("{} <ArrayXb>: {}\n", key, std::get<ArrayXb>(value));
194 }
195
196 };
197 auto get_X() const
198 {
199 if (!Xref.has_value())
200 HANDLE_ERROR_THROW("Dataset does not hold a reference to X.");
201 return this->Xref.value().get();
202 }
203
204 // inner partition of original dataset for train and validation.
205 // if split is not set, then training = validation.
208 // TODO: shuffle split
209 inline int get_n_samples() const {
210 return std::visit(
211 [&](auto&& arg) -> int { return int(arg.size());},
212 features.begin()->second
213 );
214 };
215 inline int get_n_features() const { return this->features.size(); };
217 Dataset get_batch() const;
218
219 float get_batch_size();
220 void set_batch_size(float new_size);
221
222 std::array<Dataset, 2> split(const ArrayXb& mask) const;
223
224 State operator[](std::string name) const
225 {
226 if (this->features.find(name) == features.end())
227 HANDLE_ERROR_THROW(fmt::format("Couldn't find feature {} in data\n",name));
228 return this->features.at(name);
229 };
230
231 /* template<> ArrayXb get<ArrayXb>(std::string name) */
232}; // class data
233
234// // read csv
235// Dataset read_csv(const std::string & path, MatrixXf& X, VectorXf& y,
236// vector<string>& names, vector<char> &dtypes, bool& binary_endpoint, char sep) ;
237
238} // data
239
240extern const map<DataType,std::type_index> DataTypeID;
241extern map<std::type_index,DataType> DataIDType;
242
243} // Brush
244
245// format overload for DataType
246template <> struct fmt::formatter<Brush::DataType>: formatter<string_view> {
247 template <typename FormatContext>
249 return formatter<string_view>::format(Brush::DataTypeName.at(x), ctx);
250 }
251};
252
253// TODO: fmt overload for Data
254// template <> struct fmt::formatter<Brush::Data::Dataset>: formatter<string_view> {
255// template <typename FormatContext>
256// auto format(Brush::Data::Dataset& x, FormatContext& ctx) const {
257 // return formatter<string_view>::format(Brush::DataTypeName.at(x), ctx);
258// }
259// };
260
261#endif
void bind_engine(py::module &m, string name)
Holds data of variable types.
Definition data.h:51
bool classification
whether this is a classification problem
Definition data.h:79
Dataset get_validation_data() const
Definition data.cpp:174
std::map< string, State > features
dataset features, as key value pairs
Definition data.h:71
int get_n_samples() const
Definition data.h:209
vector< size_t > training_data_idx
Definition data.h:57
auto get_X() const
Definition data.h:197
Dataset get_batch() const
select random subset of data for training weights.
Definition data.cpp:146
std::vector< DataType > feature_types
types of data in the features.
Definition data.h:65
void print() const
Definition data.h:181
std::unordered_map< DataType, vector< string > > features_of_type
map from data types to features having that type.
Definition data.h:68
float batch_size
Fraction of the training data to use in each batch. If 1.0, all data is used.
Definition data.h:87
bool use_validation
Definition data.h:84
int get_n_features() const
Definition data.h:215
State operator[](std::string name) const
Definition data.h:224
std::optional< std::reference_wrapper< const ArrayXXf > > Xref
Definition data.h:80
std::vector< DataType > unique_data_types
keeps track of the unique data types in the dataset.
Definition data.h:62
map< string, State > copy_and_make_features(const ArrayXXf &X, const Dataset &ref_dataset, const vector< string > &vn={})
Turns input into a feature map, with feature types copied from a reference dataset.
Definition data.cpp:283
Dataset(std::map< string, State > &d, const Ref< const ArrayXf > &y_=ArrayXf(), bool c=false, float validation_size=0.0, float batch_size=1.0)
Definition data.h:110
void init()
call init at the end of constructors to define metafeatures of the data.
Definition data.cpp:178
float validation_size
Percentage of the original data used for training. If 0.0, all data is used for both training and validation.
Definition data.h:83
Dataset(const ArrayXXf &X, const vector< string > &vn, bool c=false, float validation_size=0.0, float batch_size=1.0)
Definition data.h:147
map< string, State > make_features(const ArrayXXf &X, const map< string, State > &Z={}, const vector< string > &vn={})
turns input data into a feature map
Definition data.cpp:238
vector< size_t > validation_data_idx
Definition data.h:58
Dataset(const ArrayXXf &X, const Dataset &ref_dataset, const vector< string > &vn, bool c=false)
Definition data.h:166
Dataset(const ArrayXXf &X, const Ref< const ArrayXf > &y_=ArrayXf(), const vector< string > &vn={}, const map< string, State > &Z={}, bool c=false, float validation_size=0.0, float batch_size=1.0)
Definition data.h:126
ArrayXf y
length N array, the target label
Definition data.h:76
float get_batch_size()
Definition data.cpp:231
std::array< Dataset, 2 > split(const ArrayXb &mask) const
Definition data.cpp:162
Dataset get_training_data() const
Definition data.cpp:173
void set_batch_size(float new_size)
Definition data.cpp:232
Dataset operator()(const vector< size_t > &idx) const
return a slice of the data using indices idx
Definition data.cpp:117
#define HANDLE_ERROR_THROW(err)
Definition error.h:27
State check_type(const ArrayXf &x)
determines data types of columns of matrix X.
Definition data.cpp:68
State cast_type(const ArrayXf &x, const StateRef &x_ref)
Definition data.cpp:106
DataType StateType(const State &arg)
Definition data.cpp:64
std::variant< ArrayXb, ArrayXi, ArrayXf, ArrayXXb, ArrayXXi, ArrayXXf, TimeSeriesb, TimeSeriesi, TimeSeriesf, ArrayXbJet, ArrayXiJet, ArrayXfJet, ArrayXXbJet, ArrayXXiJet, ArrayXXfJet, TimeSeriesbJet, TimeSeriesiJet, TimeSeriesfJet > State
defines the possible types of data flowing thru nodes.
Definition types.h:140
NSGA2 selection operator for getting the front
Definition data.cpp:12
ostream & operator<<(ostream &os, DataType n)
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition types.h:39
DataType
data types.
Definition types.h:143
map< DataType, string > DataTypeName
Definition data.cpp:14
const map< DataType, std::type_index > DataTypeID
Definition data.cpp:36
map< std::type_index, DataType > DataIDType
Definition data.cpp:47
map< string, DataType > DataNameType
Definition data.cpp:34
auto format(Brush::DataType x, FormatContext &ctx) const
Definition data.h:248