85 tmp = ArrayXf(x.cast<
float>());
88 "Invalid feature type. check_type does not support this type: " + t);
94 bool isCategorical =
true;
96 std::map<float, bool> uniqueMap;
97 for(
int i = 0; i < x.size(); i++)
99 if(x(i) != 0 && x(i) != 1)
101 if(x(i) != floor(x(i)) && x(i) != ceil(x(i)))
102 isCategorical =
false;
104 uniqueMap[x(i)] =
true;
113 if(isCategorical && uniqueMap.size() <= 10)
127template<
typename StateRef>
130 if (std::holds_alternative<ArrayXi>(x_ref))
132 else if (std::holds_alternative<ArrayXb>(x_ref))
133 return ArrayXb(x.cast<
bool>());
141 std::map<std::string, State> new_features;
142 for (
auto& [key, value] : this->
features)
145 std::visit([&](
auto&& arg)
147 using T = std::decay_t<
decltype(arg)>;
148 if constexpr ( T::NumDimensions == 1)
149 new_features[k] = T(arg(idx));
150 else if constexpr (T::NumDimensions==2)
151 new_features[k] = T(arg(idx, Eigen::all));
159 if (this->
y.size()>0)
161 new_y = this->
y(idx);
200 return (*
this)(
r.shuffled_index(n_samples));
211 return std::array<Dataset, 2>{ (*this)(idx1), (*
this)(idx2) };
222 vector<string> python_feature_types;
227 const auto& value = this->
features.at(name);
233 python_feature_types.push_back(
"ArrayB");
235 python_feature_types.push_back(
"ArrayI");
237 python_feature_types.push_back(
"ArrayF");
240 "get_feature_type does not support the type of this feature yet: " + name +
241 "as a notice, this function is suposed to be used in the python side, to extract data types inferred by Brush type sniffer.");
244 return python_feature_types;
259 fmt::format(
"Error during the initialization of the dataset. It "
260 "does not contain any data\n")
268 vector<string> names_to_use;
270 for (
const auto& [name, value] : this->
features) {
271 names_to_use.push_back(name);
277 for (
const auto& name : names_to_use)
280 const auto& value = this->
features.at(name);
303 vector<size_t> idx(n_samples);
305 std::iota(idx.begin(), idx.end(), 0);
307 std::transform(idx.begin(), idx.end(),
309 [&](
int element) {
return element; });
311 std::transform(idx.begin(), idx.end(),
313 [&](
int element) {
return element; });
317 std::map<float, vector<int>> class_indices;
318 for (
size_t i = 0; i < n_samples; ++i) {
319 class_indices[
y[i]].push_back(i);
322 for (
auto& class_group : class_indices) {
323 auto& indices = class_group.second;
325 int n_class_samples = indices.size();
327 vector<size_t> idx(n_class_samples);
329 idx =
r.shuffled_index(n_class_samples);
331 std::iota(idx.begin(), idx.end(), 0);
333 auto n_train_samples = int(ceil(n_class_samples*(1.0-
validation_size)));
335 std::transform(idx.begin(), idx.begin() + n_train_samples,
337 [&](
int element) {
return indices[element]; });
339 if (n_class_samples - n_train_samples == 0)
342 std::transform(idx.begin(), idx.begin() + n_train_samples,
344 [&](
int element) {
return indices[element]; });
348 std::transform(idx.begin() + n_train_samples, idx.end(),
350 [&](
int element) {
return indices[element]; });
356 vector<size_t> idx(n_samples);
359 idx =
r.shuffled_index(n_samples);
361 std::iota(idx.begin(), idx.end(), 0);
368 std::transform(idx.begin(), idx.begin() + n_train_samples,
370 [&](
int element) {
return element; });
372 if (n_samples - n_train_samples == 0) {
377 std::transform(idx.begin() + n_train_samples, idx.end(),
379 [&](
int element) {
return element; });
392 const map<string,State>& Z,
393 const vector<string>& vn,
394 const vector<string>& ft
398 map<string, State> tmp_features;
403 vector<string> tmp_feature_names = {};
407 for (
int i = 0; i < X.cols(); ++i)
410 tmp_feature_names.push_back(v);
415 if (vn.size() != X.cols())
417 fmt::format(
"Variable names and data size mismatch: "
418 "{} variable names and {} features in X",
419 vn.size(), X.cols()) );
420 tmp_feature_names = vn;
424 vector<string> var_types;
427 for (
int i = 0; i < X.cols(); ++i)
429 var_types.push_back(
"");
433 if (ft.size() != X.cols())
435 fmt::format(
"Feature type names and data size mismatch: "
436 "{} feature type names and {} features in X",
437 ft.size(), X.cols()) );
442 for (
int i = 0; i < X.cols(); ++i)
447 tmp_features[tmp_feature_names.at(i)] = tmp;
450 tmp_features.insert(Z.begin(), Z.end());
454 for (
const auto& [name, value] : Z) {
464 const vector<string>& vn
467 vector<string> tmp_feature_names = {};
470 for (
int i = 0; i < X.cols(); ++i)
473 tmp_feature_names.push_back(v);
478 if (vn.size() != X.cols())
480 fmt::format(
"Variable names and data size mismatch: "
481 "{} variable names and {} features in X",
486 tmp_feature_names = vn;
489 if (ref_dataset.
features.size() != tmp_feature_names.size())
491 fmt::format(
"Reference dataset with incompatible number of variables: "
492 "Reference has {} variable names, but X has {}",
494 tmp_feature_names.size()
498 map<string, State> tmp_features;
499 for (
int i = 0; i < X.cols(); ++i)
503 ref_dataset.
features.at(tmp_feature_names.at(i))
506 tmp_features[tmp_feature_names.at(i)] = tmp;
bool classification
whether this is a classification problem
vector< string > feature_name_order_
stores the original feature name order before map sorting
Dataset get_validation_data() const
std::map< string, State > features
dataset features, as key value pairs
int get_n_samples() const
Dataset(std::map< string, State > &d, const Ref< const ArrayXf > &y_=ArrayXf(), bool c=false, float validation_size=0.0, float batch_size=1.0, bool shuffle_split=false)
vector< size_t > training_data_idx
Dataset get_batch() const
select random subset of data for training weights.
vector< string > get_feature_types() const
std::vector< DataType > feature_types
types of data in the features.
std::unordered_map< DataType, vector< string > > features_of_type
map from data types to features having that type.
float batch_size
percentage of training data size to use in each batch. if 1.0, then all data is used
std::vector< string > feature_names
names of the feature types as string representations.
std::vector< DataType > unique_data_types
keeps track of the unique data types in the dataset.
map< string, State > copy_and_make_features(const ArrayXXf &X, const Dataset &ref_dataset, const vector< string > &vn={})
turns input into a feature map, with feature types copied from a reference
map< string, State > make_features(const ArrayXXf &X, const map< string, State > &Z={}, const vector< string > &vn={}, const vector< string > &ft={})
turns input data into a feature map
void init()
call init at the end of constructors to define metafeatures of the data.
float validation_size
fraction of the original data held out for validation (the rest is used for training). if 0.0, then all data is used for both training and validation
vector< size_t > validation_data_idx
ArrayXf y
length N array, the target label
std::array< Dataset, 2 > split(const ArrayXb &mask) const
Dataset get_training_data() const
void set_batch_size(float new_size)
Dataset operator()(const vector< size_t > &idx) const
return a slice of the data using indices idx
#define HANDLE_ERROR_THROW(err)
namespace containing Data structures used in Brush
std::vector< DataType > StateTypes
TimeSeries< bool > TimeSeriesb
TimeSeries convenience typedefs.
State cast_type(const ArrayXf &x, const StateRef &x_ref)
TimeSeries< float > TimeSeriesf
DataType StateType(const State &arg)
State check_type(const ArrayXf &x, const string t)
determines data types of columns of matrix X.
ostream & operator<<(ostream &os, DataType dt)
std::variant< ArrayXb, ArrayXi, ArrayXf, ArrayXXb, ArrayXXi, ArrayXXf, TimeSeriesb, TimeSeriesi, TimeSeriesf, ArrayXbJet, ArrayXiJet, ArrayXfJet, ArrayXXbJet, ArrayXXiJet, ArrayXXfJet, TimeSeriesbJet, TimeSeriesiJet, TimeSeriesfJet > State
defines the possible types of data flowing through nodes.
TimeSeries< int > TimeSeriesi
namespace containing various utility functions
static map< V, K > reverse_map(const map< K, V > &m)
Given a map from keys to values, creates a new map from values to keys.
string to_string(const T &value)
template function to convert objects to string for logging
void unique_insert(Vector &v, const T &t)
unique insertion into a vector. allows a vector to be used like a set. source: http://www....
vector< size_t > mask_to_index(const ArrayXb &mask)
convert a boolean mask to an index array
nsga2 selection operator for getting the front
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Eigen::Array< int, Eigen::Dynamic, Eigen::Dynamic > ArrayXXi
constexpr bool always_false_v
Eigen::Array< bool, Eigen::Dynamic, Eigen::Dynamic > ArrayXXb
map< DataType, string > DataTypeName
const map< DataType, std::type_index > DataTypeID
Eigen::Array< int, Eigen::Dynamic, 1 > ArrayXi
map< std::type_index, DataType > DataIDType
map< string, DataType > DataNameType