85 tmp = ArrayXf(x.cast<
float>());
88 "Invalid feature type. check_type does not support this type: " + t);
94 bool isCategorical =
true;
96 std::map<float, bool> uniqueMap;
97 for(
int i = 0; i < x.size(); i++)
99 if(x(i) != 0 && x(i) != 1)
101 if(x(i) != floor(x(i)) && x(i) != ceil(x(i)))
102 isCategorical =
false;
104 uniqueMap[x(i)] =
true;
113 if(isCategorical && uniqueMap.size() <= 10)
127template<
typename StateRef>
130 if (std::holds_alternative<ArrayXi>(x_ref))
132 else if (std::holds_alternative<ArrayXb>(x_ref))
133 return ArrayXb(x.cast<
bool>());
141 std::map<std::string, State> new_features;
142 for (
auto& [key, value] : this->
features)
145 std::visit([&](
auto&& arg)
147 using T = std::decay_t<
decltype(arg)>;
148 if constexpr ( T::NumDimensions == 1)
149 new_features[k] = T(arg(idx));
150 else if constexpr (T::NumDimensions==2)
151 new_features[k] = T(arg(idx, Eigen::all));
159 if (this->
y.size()>0)
161 new_y = this->
y(idx);
183 return (*
this)(
r.shuffled_index(n_samples));
194 return std::array<Dataset, 2>{ (*this)(idx1), (*
this)(idx2) };
211 fmt::format(
"Error during the initialization of the dataset. It "
212 "does not contain any data\n")
217 for (
const auto& [name, value]: this->
features)
238 vector<size_t> idx(n_samples);
240 std::iota(idx.begin(), idx.end(), 0);
242 std::transform(idx.begin(), idx.end(),
244 [&](
int element) {
return element; });
246 std::transform(idx.begin(), idx.end(),
248 [&](
int element) {
return element; });
252 std::map<float, vector<int>> class_indices;
253 for (
size_t i = 0; i < n_samples; ++i) {
254 class_indices[
y[i]].push_back(i);
257 for (
auto& class_group : class_indices) {
258 auto& indices = class_group.second;
260 int n_class_samples = indices.size();
262 vector<size_t> idx(n_class_samples);
264 idx =
r.shuffled_index(n_class_samples);
266 std::iota(idx.begin(), idx.end(), 0);
268 auto n_train_samples = int(ceil(n_class_samples*(1.0-
validation_size)));
270 std::transform(idx.begin(), idx.begin() + n_train_samples,
272 [&](
int element) {
return indices[element]; });
274 if (n_class_samples - n_train_samples == 0)
277 std::transform(idx.begin(), idx.begin() + n_train_samples,
279 [&](
int element) {
return indices[element]; });
283 std::transform(idx.begin() + n_train_samples, idx.end(),
285 [&](
int element) {
return indices[element]; });
291 vector<size_t> idx(n_samples);
294 idx =
r.shuffled_index(n_samples);
296 std::iota(idx.begin(), idx.end(), 0);
303 std::transform(idx.begin(), idx.begin() + n_train_samples,
305 [&](
int element) {
return element; });
307 if (n_samples - n_train_samples == 0) {
312 std::transform(idx.begin() + n_train_samples, idx.end(),
314 [&](
int element) {
return element; });
327 const map<string,State>& Z,
328 const vector<string>& vn,
329 const vector<string>& ft
333 map<string, State> tmp_features;
342 for (
int i = 0; i < X.cols(); ++i)
350 if (vn.size() != X.cols())
352 fmt::format(
"Variable names and data size mismatch: "
353 "{} variable names and {} features in X",
354 vn.size(), X.cols()) );
359 vector<string> var_types;
362 for (
int i = 0; i < X.cols(); ++i)
364 var_types.push_back(
"");
368 if (ft.size() != X.cols())
370 fmt::format(
"Feature type names and data size mismatch: "
371 "{} feature type names and {} features in X",
372 ft.size(), X.cols()) );
377 for (
int i = 0; i < X.cols(); ++i)
385 tmp_features.insert(Z.begin(), Z.end());
393 const vector<string>& vn
399 for (
int i = 0; i < X.cols(); ++i)
407 if (vn.size() != X.cols())
409 fmt::format(
"Variable names and data size mismatch: "
410 "{} variable names and {} features in X",
420 fmt::format(
"Reference dataset with incompatible number of variables: "
421 "Reference has {} variable names, but X has {}",
427 map<string, State> tmp_features;
428 for (
int i = 0; i < X.cols(); ++i)
bool classification
whether this is a classification problem
Dataset get_validation_data() const
std::map< string, State > features
dataset features, as key value pairs
int get_n_samples() const
Dataset(std::map< string, State > &d, const Ref< const ArrayXf > &y_=ArrayXf(), bool c=false, float validation_size=0.0, float batch_size=1.0, bool shuffle_split=false)
vector< size_t > training_data_idx
Dataset get_batch() const
select random subset of data for training weights.
std::vector< DataType > feature_types
types of data in the features.
std::unordered_map< DataType, vector< string > > features_of_type
map from data types to features having that type.
float batch_size
fraction (0.0 to 1.0) of the training data to use in each batch. If 1.0, then all of the training data is used
std::vector< string > feature_names
names of the feature types as string representations.
std::vector< DataType > unique_data_types
keeps track of the unique data types in the dataset.
map< string, State > copy_and_make_features(const ArrayXXf &X, const Dataset &ref_dataset, const vector< string > &vn={})
turns input into a feature map, with feature types copied from a reference
map< string, State > make_features(const ArrayXXf &X, const map< string, State > &Z={}, const vector< string > &vn={}, const vector< string > &ft={})
turns input data into a feature map
void init()
call init at the end of constructors to define metafeatures of the data.
float validation_size
fraction of the original data held out for validation; the remaining (1.0 - validation_size) is used for training. If 0.0, then all data is used for both training and validation
vector< size_t > validation_data_idx
ArrayXf y
length N array, the target label
std::array< Dataset, 2 > split(const ArrayXb &mask) const
Dataset get_training_data() const
void set_batch_size(float new_size)
Dataset operator()(const vector< size_t > &idx) const
return a slice of the data using indices idx
#define HANDLE_ERROR_THROW(err)
namespace containing Data structures used in Brush
std::vector< DataType > StateTypes
TimeSeries< bool > TimeSeriesb
TimeSeries convenience typedefs.
State cast_type(const ArrayXf &x, const StateRef &x_ref)
TimeSeries< float > TimeSeriesf
DataType StateType(const State &arg)
State check_type(const ArrayXf &x, const string t)
determines data types of columns of matrix X.
ostream & operator<<(ostream &os, DataType dt)
std::variant< ArrayXb, ArrayXi, ArrayXf, ArrayXXb, ArrayXXi, ArrayXXf, TimeSeriesb, TimeSeriesi, TimeSeriesf, ArrayXbJet, ArrayXiJet, ArrayXfJet, ArrayXXbJet, ArrayXXiJet, ArrayXXfJet, TimeSeriesbJet, TimeSeriesiJet, TimeSeriesfJet > State
defines the possible types of data flowing through nodes.
TimeSeries< int > TimeSeriesi
namespace containing various utility functions
static map< V, K > reverse_map(const map< K, V > &m)
Given a map from keys to values, creates a new map from values to keys.
string to_string(const T &value)
template function to convert objects to string for logging
void unique_insert(Vector &v, const T &t)
unique insertion into a vector. allows a vector to be used like a set. source: http://www....
vector< size_t > mask_to_index(const ArrayXb &mask)
convert a boolean mask to an index array
nsga2 selection operator for getting the front
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Eigen::Array< int, Eigen::Dynamic, Eigen::Dynamic > ArrayXXi
constexpr bool always_false_v
Eigen::Array< bool, Eigen::Dynamic, Eigen::Dynamic > ArrayXXb
map< DataType, string > DataTypeName
const map< DataType, std::type_index > DataTypeID
Eigen::Array< int, Eigen::Dynamic, 1 > ArrayXi
map< std::type_index, DataType > DataIDType
map< string, DataType > DataNameType