72 bool isCategorical =
true;
73 std::map<float, bool> uniqueMap;
74 for(
int i = 0; i < x.size(); i++)
77 if(x(i) != 0 && x(i) != 1)
79 if(x(i) != floor(x(i)) && x(i) != ceil(x(i)))
80 isCategorical =
false;
82 uniqueMap[x(i)] =
true;
93 if(isCategorical && uniqueMap.size() <= 10)
105template<
typename StateRef>
108 if (std::holds_alternative<ArrayXi>(x_ref))
110 else if (std::holds_alternative<ArrayXb>(x_ref))
111 return ArrayXb(x.cast<
bool>());
119 std::map<std::string, State> new_features;
120 for (
auto& [key, value] : this->
features)
122 std::visit([&](
auto&& arg)
124 using T = std::decay_t<
decltype(arg)>;
125 if constexpr ( T::NumDimensions == 1)
126 new_features[key] = T(arg(idx));
127 else if constexpr (T::NumDimensions==2)
128 new_features[key] = T(arg(idx, Eigen::all));
136 if (this->
y.size()>0)
138 new_y = this->
y(idx);
159 return (*
this)(
r.shuffled_index(n_samples));
170 return std::array<Dataset, 2>{ (*this)(idx1), (*
this)(idx2) };
187 fmt::format(
"Error during the initialization of the dataset. It "
188 "does not contain any data\n")
193 for (
const auto& [name, value]: this->
features)
207 auto idx =
r.shuffled_index(n_samples);
215 std::transform(idx.begin(), idx.begin() + n_train_samples,
217 [&](
int element) {
return element; });
221 std::transform(idx.begin() + n_train_samples, idx.end(),
223 [&](
int element) {
return element; });
239 const map<string,State>& Z,
240 const vector<string>& vn
244 map<string, State> tmp_features;
245 vector<string> var_names;
251 for (
int i = 0; i < X.cols(); ++i)
254 var_names.push_back(v);
259 if (vn.size() != X.cols())
261 fmt::format(
"Variable names and data size mismatch: "
262 "{} variable names and {} features in X",
270 for (
int i = 0; i < X.cols(); ++i)
275 tmp_features[var_names.at(i)] = tmp;
278 tmp_features.insert(Z.begin(), Z.end());
285 const vector<string>& vn
288 vector<string> var_names;
291 for (
int i = 0; i < X.cols(); ++i)
294 var_names.push_back(v);
299 if (vn.size() != X.cols())
301 fmt::format(
"Variable names and data size mismatch: "
302 "{} variable names and {} features in X",
310 if (ref_dataset.
features.size() != var_names.size())
312 fmt::format(
"Reference dataset with incompatible number of variables: "
313 "Reference has {} variable names, but X has {}",
319 map<string, State> tmp_features;
320 for (
int i = 0; i < X.cols(); ++i)
324 ref_dataset.
features.at(var_names.at(i))
327 tmp_features[var_names.at(i)] = tmp;
bool classification
whether this is a classification problem
Dataset get_validation_data() const
std::map< string, State > features
dataset features, as key value pairs
int get_n_samples() const
vector< size_t > training_data_idx
Dataset get_batch() const
select random subset of data for training weights.
std::vector< DataType > feature_types
types of data in the features.
std::unordered_map< DataType, vector< string > > features_of_type
map from data types to features having that type.
float batch_size
fraction of the training data to use in each batch; if 1.0, then all data is used
std::vector< DataType > unique_data_types
keeps track of the unique data types in the dataset.
map< string, State > copy_and_make_features(const ArrayXXf &X, const Dataset &ref_dataset, const vector< string > &vn={})
turns input into a feature map, with feature types copied from a reference
Dataset(std::map< string, State > &d, const Ref< const ArrayXf > &y_=ArrayXf(), bool c=false, float validation_size=0.0, float batch_size=1.0)
void init()
call init at the end of constructors to define metafeatures of the data.
float validation_size
percentage of original data used for training; if 0.0, then all data is used for both training and validation
map< string, State > make_features(const ArrayXXf &X, const map< string, State > &Z={}, const vector< string > &vn={})
turns input data into a feature map
vector< size_t > validation_data_idx
ArrayXf y
length N array, the target label
std::array< Dataset, 2 > split(const ArrayXb &mask) const
Dataset get_training_data() const
void set_batch_size(float new_size)
Dataset operator()(const vector< size_t > &idx) const
return a slice of the data using indices idx
#define HANDLE_ERROR_THROW(err)
namespace containing Data structures used in Brush
std::vector< DataType > StateTypes
State check_type(const ArrayXf &x)
determines data types of columns of matrix X.
TimeSeries< bool > TimeSeriesb
TimeSeries convenience typedefs.
State cast_type(const ArrayXf &x, const StateRef &x_ref)
TimeSeries< float > TimeSeriesf
DataType StateType(const State &arg)
ostream & operator<<(ostream &os, DataType dt)
std::variant< ArrayXb, ArrayXi, ArrayXf, ArrayXXb, ArrayXXi, ArrayXXf, TimeSeriesb, TimeSeriesi, TimeSeriesf, ArrayXbJet, ArrayXiJet, ArrayXfJet, ArrayXXbJet, ArrayXXiJet, ArrayXXfJet, TimeSeriesbJet, TimeSeriesiJet, TimeSeriesfJet > State
defines the possible types of data flowing through nodes.
TimeSeries< int > TimeSeriesi
namespace containing various utility functions
static map< V, K > reverse_map(const map< K, V > &m)
Given a map from keys to values, creates a new map from values to keys.
string to_string(const T &value)
template function to convert objects to string for logging
void unique_insert(Vector &v, const T &t)
unique insertion into a vector. allows a vector to be used like a set. source: http://www....
vector< size_t > mask_to_index(const ArrayXb &mask)
convert a boolean mask to an index array
nsga2 selection operator for getting the front
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Eigen::Array< int, Eigen::Dynamic, Eigen::Dynamic > ArrayXXi
constexpr bool always_false_v
Eigen::Array< bool, Eigen::Dynamic, Eigen::Dynamic > ArrayXXb
map< DataType, string > DataTypeName
const map< DataType, std::type_index > DataTypeID
Eigen::Array< int, Eigen::Dynamic, 1 > ArrayXi
map< std::type_index, DataType > DataIDType
map< string, DataType > DataNameType