11 init_shogun_with_defaults();
71 log.open(
logfile, std::ofstream::app);
78 "/// Feature Engineering Automation Tool "
79 "* \xc2\xa9 La Cava et al 2017 "
86 "/////////////////////////////////////////////////////////////////////\n"
87 "// * Feature Engineering Automation Tool * //\n"
88 "// La Cava et al. 2017 //\n"
89 "// License: GPL v3 //\n"
90 "// https://cavalab.org/feat //\n"
91 "/////////////////////////////////////////////////////////////////////\n"
99 logger.
log(
"turning off batch because X has fewer than "
105 logger.
log(
"using batch with batch_size= "
179 logger.
log(
"Initializing population", 2);
196 logger.
log(
"Evaluating initial population",2);
203 vector<size_t> survivors;
211 unsigned stall_count = 0;
249 logger.
log(
"generation limit reached",2);
255 logger.
log(
"fitting final model to all training data...",2);
286 logger.
log(
"Run Completed. Total time taken is "
290 logger.
log(
"/// ----------------------------------------------------------------- \\\\\\",
483 vector<float> weights = ind.
ml->get_weights();
484 float offset = ind.
ml->get_bias();
492 vector<size_t> order(weights.size());
495 vector<float> aweights(weights.size());
496 for (
int i =0;
i<aweights.size(); ++
i)
497 aweights[
i] = fabs(weights[
i]);
498 order =
argsort(aweights,
false);
501 iota(order.begin(), order.end(), 0);
505 if (weights.size() > 0)
507 if (weights.at(order.at(0)) > 0)
511 for (
const auto& o : order)
515 output += features.at(o);
516 if (
i < order.size()-1)
518 if (weights.at(order.at(
i+1)) > 0)
530 vector<float> weights =
best_ind.
ml->get_weights();
538 vector<size_t> order(weights.size());
541 vector<float> aweights(weights.size());
542 for (
int i =0;
i<aweights.size(); ++
i)
543 aweights[
i] = fabs(weights[
i]);
544 order =
argsort(aweights,
false);
547 iota(order.begin(), order.end(), 0);
550 output +=
"Weight\tFeature\n";
551 output +=
to_string(offset) +
"\toffset" +
"\n";
552 for (
const auto& o : order)
556 output += features.at(o);
583 vector<Individual>* printed_pop = NULL;
608 idx.resize(printed_pop->size());
609 std::iota(idx.begin(), idx.end(), 0);
612 bool includes_best_ind =
false;
614 vector<json> json_archive;
616 for (
int i = 0;
i < idx.size(); ++
i)
624 json_archive.push_back(j);
626 if (
i < idx.size() -1)
630 includes_best_ind =
true;
634 if (!includes_best_ind)
638 json_archive.push_back(j);
652 ArrayXf w = ArrayXf::Map(tmpw.data(), tmpw.size());
657 std::map<string, std::pair<vector<ArrayXf>, vector<ArrayXf>>>
Feat::get_Z(
string s,
658 int * idx,
int idx_size)
661 vector<int> ids(idx,idx+idx_size);
676 vector<size_t> survivors,
680 unsigned& stall_count)
697 logger.
log(
"evaluating offspring...", 2);
706 logger.
log(
"shrinking pop to survivors...",2);
721 #pragma omp parallel for
750 logger.
log(
"finished with generation...",2);
773 shared_ptr<CLabels> yhat;
802 int starting_size = ind.
size();
804 vector<size_t> idx_to_remove;
806 logger.
log(
"\n=========\ndoing pattern pruning...",2);
812 int first_occurence = -2;
815 for (
int i = start ;
i <=
r; ++
i)
819 if (tmp_ind.
program.at(
i)->name.compare(
"not")==0)
821 if (first_occurence ==
i-1)
825 idx_to_remove.push_back(first_occurence);
826 idx_to_remove.push_back(
i);
828 first_occurence = -2;
838 std::reverse(idx_to_remove.begin(), idx_to_remove.end());
839 for (
auto idx: idx_to_remove)
845 int end_size = tmp_ind.
size();
846 logger.
log(
"pattern pruning reduced best model size by "
848 +
" nodes\n=========\n", 2);
859 int iterations = ind.
get_dim();
860 logger.
log(
"\n=========\ndoing correlation deletion mutations...",2);
861 starting_size = ind.
size();
862 VectorXf original_yhat;
866 original_yhat = ind.
yhat;
868 for (
int i = 0;
i < iterations; ++
i)
885 new_yhat = tmp_ind.
yhat;
888 if (((original_yhat - new_yhat).norm()/original_yhat.norm()
890 or perfect_correlation)
892 logger.
log(
"\ndelete dimension mutation success: went from "
896 -new_yhat).norm()/(original_yhat.norm()))
898 if (perfect_correlation)
904 logger.
log(
"\ndelete dimension mutation failure. Output changed by "
906 -new_yhat).norm()/(original_yhat.norm()))
914 end_size = ind.
size();
915 logger.
log(
"correlation pruning reduced best model size by "
917 +
" nodes\n=========\n", 2);
918 if (end_size < starting_size)
925 logger.
log(
"\n=========\ndoing subtree deletion mutations...", 2);
926 starting_size = ind.
size();
927 for (
int i = 0;
i < iterations; ++
i)
940 new_yhat = tmp_ind.
yhat;
942 if ((original_yhat - new_yhat).norm()/original_yhat.norm()
945 logger.
log(
"\ndelete mutation success: went from "
949 -new_yhat).norm()/(original_yhat.norm()))
955 logger.
log(
"\ndelete mutation failure. Output changed by "
957 -new_yhat).norm()/(original_yhat.norm()))
965 end_size = ind.
size();
966 logger.
log(
"subtree deletion reduced best model size by "
974 VectorXf difference = new_yhat - original_yhat;
992 vector<float> univariate_weights(d.
t->
X.rows() + d.
t->
Z.size(),0.0);
993 int N = d.
t->
X.cols();
995 MatrixXf predictor(1,
N);
997 "LR" :
"LinearRidgeRegression";
1003 logger.
log(
"univariate_initial_model",2);
1007 for (
unsigned i =0;
i<d.
t->
X.rows(); ++
i)
1009 predictor.row(0) = d.
t->
X.row(
i);
1013 shared_ptr<CLabels> yhat = ml.
fit(predictor, d.
t->
y, this->params,
1018 univariate_weights.at(
i) = 0;
1020 int j = d.
t->
X.rows();
1021 for (
const auto& val: d.
t->
Z)
1023 for (
int k = 0; k<
N; ++k)
1024 predictor(k) =
median(val.second.second.at(k));
1031 shared_ptr<CLabels> yhat = ml.
fit(predictor, d.
t->
y, this->params,
1034 univariate_weights.at(j) = ml.
get_weights().at(0);
1036 univariate_weights.at(j) = 0;
1040 return univariate_weights;
1052 int n_x = d.
t->
X.rows();
1053 int n_z = d.
t->
Z.size();
1057 bool univariate_initialization =
false;
1059 if (n_feats < (n_x + n_z))
1063 univariate_initialization =
true;
1067 vector<size_t> feature_order =
argsort(univariate_weights,
false);
1068 feature_order.erase(feature_order.begin()+n_feats,
1069 feature_order.end());
1071 for (
const auto& f : feature_order)
1087 for (
unsigned i =0;
i<n_x; ++
i)
1093 for (
unsigned i =0;
i<n_z; ++
i)
1102 shared_ptr<CLabels> yhat;
1105 if (univariate_initialization)
1117 vector<float> w =
best_ind.
ml->get_weights();
1162 VectorXf y = VectorXf();
1170 "before making predictions.");
1175 return ind->
out(d,
true).transpose();
1191 Data d_tmp(X, dummy, Z);
1210 VectorXf predictions(X.cols());
1213 Data tmp_data(X,empty_y,Z);
1238 +
to_string(
id) +
"in archive or population.");
1253 Data tmp_data(X,empty_y,Z);
1275 Data tmp_data(X,empty_y,Z);
1285 Data d_tmp(X, dummy, Z);
1301 vector<Individual>& pop_ref = (
use_arch ?
1304 bool updated =
false;
1306 for (
const auto& ind: pop_ref)
1313 || (f == bs && ind.get_complexity() < this->best_complexity)
1341 VectorXf losses(this->
pop.
size());
1345 losses(
i) = p.fitness;
1349 float min_loss = losses.minCoeff();
1352 float med_loss =
median(losses.array());
1355 ArrayXf Sizes(this->
pop.
size());
1361 Sizes(
i) = p.size();
1364 unsigned med_size =
median(Sizes);
1367 ArrayXf Complexities(this->
pop.
size());
1372 Complexities(
i) = p.get_complexity();
1377 ArrayXf Nparams(this->
pop.
size());
1381 Nparams(
i) = p.get_n_params();
1386 ArrayXf Dims(this->
pop.
size());
1390 Dims(
i) = p.get_dim();
1395 unsigned med_complexity =
median(Complexities);
1396 unsigned med_num_params =
median(Nparams);
1397 unsigned med_dim =
median(Dims);
1403 float med_loss_v =
median(val_fitnesses);
1433 unsigned num_models = std::min(50,this->
pop.
size());
1436 ArrayXf Sizes(this->
pop.
size());
1440 Sizes(
i) = p.size(); ++
i;
1442 unsigned max_size = Sizes.maxCoeff();
1444 string bar, space =
"";
1445 for (
unsigned int i = 0;
i<50; ++
i)
1447 if (
i <= 50*fraction) bar +=
"/";
1450 std::cout.precision(5);
1451 std::cout << std::scientific;
1455 <<
params.
gens <<
" [" + bar + space +
"]\n";
1457 std::cout << std::fixed <<
"Time elapsed "<<
timer
1460 <<
") [" + bar + space +
"]\n";
1462 std::cout << std::fixed <<
"Train Loss (Med): "
1465 <<
"Val Loss (Med): "
1467 <<
"Median Size (Max): "
1469 <<
"Time (s): " <<
timer <<
"\n";
1470 std::cout <<
"Representation Pareto Front--------------------------------------\n";
1471 std::cout <<
"Rank\t";
1474 cout <<
"fitness\tfitness_v\tcomplexity\t";
1475 cout <<
"Representation\n";
1477 std::cout << std::scientific;
1484 for (
unsigned i = 0;
i < num_models; ++
i)
1486 std::string lim_model;
1490 for (
unsigned j = 0; j< std::min(model.size(),
size_t(60)); ++j)
1492 lim_model.push_back(model.at(j));
1494 if (lim_model.size()==60)
1503 cout << lim_model <<
"\n";
1509 vector<size_t> fnew(2,0);
1510 while (f.size() < num_models && fnew.size()>1)
1513 f.insert(f.end(),fnew.begin(),fnew.end());
1516 for (
unsigned j = 0; j < std::min(num_models,
unsigned(f.size())); ++j)
1518 std::string lim_model;
1521 for (
unsigned j = 0; j< std::min(model.size(),
size_t(60)); ++j)
1522 lim_model.push_back(model.at(j));
1523 if (lim_model.size()==60)
1529 cout <<
"\t" << lim_model <<
"\n";
1542 log <<
"generation" << sep
1544 <<
"min_loss" << sep
1545 <<
"min_loss_val" << sep
1546 <<
"med_loss" << sep
1547 <<
"med_loss_val" << sep
1548 <<
"med_size" << sep
1549 <<
"med_complexity" << sep
1550 <<
"med_num_params" << sep
1551 <<
"med_dim" <<
"\n";
1599 std::ifstream indata;
1600 indata.open(filename);
1609 logger.
log(
"Loaded Feat state from " + filename,1);
1617 if (!filename.empty())
1620 out.open(
"Feat.json");
1622 out << this->
save();
1624 logger.
log(
"Saved Feat to file " + filename, 1);
void setTrainingData(MatrixXf &X_t, VectorXf &y_t, LongData &Z_t, bool c=false, vector< bool > protect=vector< bool >())
void train_test_split(bool shuffle, float split)
splits data into training and validation folds.
void setValidationData(MatrixXf &X_v, VectorXf &y_v, LongData &Z_v, bool c=false, vector< bool > protect=vector< bool >())
data holding X, y, and Z data
void get_batch(Data &db, int batch_size) const
select random subset of data for training weights.
void set_protected_groups()
evaluation mixin class for Feat
void validation(vector< Individual > &individuals, const Data &d, const Parameters &params, bool offspring=false)
validation of population.
void fitness(vector< Individual > &individuals, const Data &d, const Parameters &params, bool offspring=false)
fitness of population.
float score(const VectorXf &y_true, const shared_ptr< CLabels > &yhat, VectorXf &loss, const vector< float > &w)
void set_backprop(bool bp)
set constant optimization options
int get_dim()
get dimensionality of best
int get_max_size()
return max size of programs
void calculate_stats(const DataRef &d)
calculate and print stats
void set_selection(string sel)
set selection method
void set_root_xo_rate(float cross_rate)
set root xo rate in variation
ArrayXXf predict_proba(MatrixXf &X, LongData &Z)
predict probabilities of each class.
void load(const json &j)
load Feat state from a json string.
void set_random_state(int random_state)
set the random seed
void set_corr_delete_mutate(bool s)
void update_stall_count(unsigned &stall_count, bool updated)
updates stall count for early stopping
string get_model(bool sort=true)
return best model, in tabular form
void set_gens(int gens)
set size of max generations
void set_split(float sp)
set train fraction of dataset
float score(MatrixXf &X, const VectorXf &y, LongData Z=LongData())
scoring function
void load_best_ind(string filename)
load best_ind from file
void set_dtypes(vector< char > dtypes)
set data types for input parameters
int save_pop
controls whether pop is printed each gen
void set_erc(bool erc)
flag to set whether to use variable or constants for terminals
void save_to_file(string filename)
save Feat state to file.
string starting_pop
file with starting population
float simplify
post-run simplification
void set_classification(bool classification)
set EProblemType for shogun
float get_split()
return fraction of data to use for training
void set_scorer(string s)
set scoring function
int get_max_depth()
return max_depth of programs
void set_fb(float fb)
set feedback
int best_complexity
complexity of the best model
string get_ind_eqn(bool sort, Individual &ind)
return best model as a single line equation
void load_population(string filename, bool justfront=false)
load population from file, optionally just Pareto front
void set_max_time(int time)
set max time in seconds for fit method
float min_loss
current best score
void set_is_fitted(bool f)
set flag indicating whether fit has been called
Population pop
population of programs
json save() const
save and return a json Feat state as string.
bool use_arch
internal control over use of archive
string get_representation()
return best model
ArrayXf get_coefs()
return the coefficients or importance scores of the best model.
vector< float > univariate_initial_model(DataRef &d, int n_feats)
int get_n_params()
get number of parameters in best
vector< char > get_otypes()
return program output type ('f', 'b')
void print_stats(std::ofstream &log, float fraction)
void set_protected_groups(string pg)
set protected groups for fairness
void set_use_batch()
set flag to use batch for training
void set_simplify(float s)
void initial_model(DataRef &d)
method to fit initial ml model
bool get_erc()
return boolean value of erc flag
int get_verbosity()
return current verbosity level set
void run_generation(unsigned int g, vector< size_t > survivors, DataRef &d, std::ofstream &log, float percentage, unsigned &stall_count)
MatrixXf transform(MatrixXf &X)
transform an input matrix using a program.
bool val_from_arch
model selection only uses Pareto front
bool get_classification()
return type of classification flag set
Individual best_ind
best individual
int get_max_stall()
return maximum stall in learning, in generations
VectorXf predict_archive(int id, MatrixXf &X)
predict on unseen data from the whole archive
nl::json get_stats()
return statistics from the run as a json string
void load_from_file(string filename)
load Feat state from file.
int get_complexity()
get complexity of best
bool update_best(const DataRef &d, bool val=false)
updates best score
void final_model(DataRef &d)
fits final model to best transformation
Log_Stats stats
runtime stats
int get_pop_size()
return population size
void set_iters(int iters)
string get_eqn(bool sort=false)
vector< nl::json > get_archive(bool front)
return population as string
void set_pop_size(int pop_size)
set size of population
void set_n_jobs(unsigned t)
set number of threads
Variation variator
variation operators
string survival
stores survival mode
void set_hillclimb(bool hc)
void set_otype(char ot)
set program output type ('f', 'b')
LongData get_Z(string s, int *idx, int idx_size)
get longitudinal data from file s
void set_ml(string ml)
set ML algorithm to use
int get_max_dim()
return max dimensionality of programs
Evaluation evaluator
evaluation code
void set_logfile(string s)
set name for files
void log_stats(std::ofstream &log)
shared_ptr< CLabels > predict_labels(MatrixXf &X, LongData Z=LongData())
predict on unseen data. return CLabels.
Selection selector
selection algorithm
int get_gens()
return size of max generations
void set_shuffle(bool sh)
flag to shuffle the input samples for train/test splits
float min_loss_v
best validation score
int get_n_nodes()
return the number of nodes in the best model
string get_logfile()
get name
string logfile
log filename
Parameters params
hyperparameters of Feat
string get_ml()
return ML algorithm string
void set_max_dim(unsigned int max_dim)
set maximum dimensionality of programs
int get_num_features()
return number of features
Archive archive
pareto front archive
void set_max_depth(unsigned int max_depth)
set max depth of programs
void init()
initialize Feat object for fitting.
void set_verbosity(int verbosity)
set level of debug info
void set_survival(string surv)
set survivability
ArrayXXf predict_proba_archive(int id, MatrixXf &X, LongData &Z)
void set_cross_rate(float cross_rate)
set cross rate in variation
void set_batch_size(int bs)
vector< char > get_dtypes()
return data types for input parameters
Selection survivor
survival algorithm
float get_fb()
get feedback setting
Timer timer
start time of training
bool get_shuffle()
return whether option to shuffle the data is set or not
float get_cross_rate()
return cross rate for variation
VectorXf predict(MatrixXf &X, LongData &Z)
predict on unseen data.
void simplify_model(DataRef &d, Individual &)
simplifies final model to best transformation
Normalizer N
scales training data.
void fit(MatrixXf &X, VectorXf &y)
train a model.
void set_max_stall(int max_stall)
set maximum stall in learning, in generations
class that specifies the machine learning algorithm to pair with Feat.
vector< float > get_weights(bool norm_adjust=true) const
shared_ptr< CLabels > fit(const MatrixXf &X, const VectorXf &y, const Parameters &params, bool &pass, const vector< char > &dtypes=vector< char >())
individual programs in the population
vector< string > get_features()
return vectorized representation of program
int size() const
return size of program
MatrixXf out(const Data &d, bool predict=false)
calculate program output matrix Phi
VectorXf yhat
current output
string get_eqn()
return symbolic representation of program
ArrayXXf predict_proba(const Data &d)
MatrixXf Phi
transformation output of program
int get_n_params()
get number of params in program
float fitness
aggregate fitness score
NodeVector program
executable data structure
shared_ptr< ML > ml
ML model, trained on Phi.
void save(string filename)
save individual as a json object.
shared_ptr< CLabels > predict(const Data &d)
shared_ptr< CLabels > fit(const Data &d, const Parameters &params, bool &pass)
fits an ML model to the data after transformation
unsigned int get_complexity() const
get the program complexity without updating it.
VectorXf predict_vector(const Data &d)
void load(string filename)
load individual from a file.
unsigned int get_dim()
grab sub-tree locations given starting point.
shared_ptr< CLabels > fit_tune(const Data &d, const Parameters &params, bool set_default=false)
fits and tunes an ML model to the data after transformation
string log(string m, int v, string sep="\n") const
print message with verbosity control.
void set_seed(int new_seed)
std::chrono::duration< float > Elapsed() const
void delete_mutate(Individual &child, const Parameters &params)
void vary(Population &pop, const vector< size_t > &parents, const Parameters &params, const Data &d)
method to handle variation of population
bool correlation_delete_mutate(Individual &child, MatrixXf Phi, const Parameters &params, const Data &d)
void set_cross_rate(float cr)
update cross rate
std::map< string, std::pair< vector< ArrayXf >, vector< ArrayXf > > > LongData
#define THROW_RUNTIME_ERROR(err)
#define THROW_INVALID_ARGUMENT(err)
void __attribute__((constructor)) ctor()
#define initialize_cuda()
#define omp_set_num_threads(x)
void my_handler(int s)
handle signals (ctrl-c etc.)
void load_partial_longitudinal(const std::string &path, std::map< string, std::pair< vector< ArrayXf >, vector< ArrayXf > > > &Z, char sep, const vector< int > &idx)
load partial longitudinal csv file into matrix according to idx vector
float median(const ArrayXf &v)
calculate median
vector< size_t > argsort(const vector< T > &v, bool ascending=true)
return indices that sort a vector
void printProgress(float percentage)
outputs a progress bar, filled according to the given percentage
std::string to_string(const T &value)
template function to convert objects to string for logging
void from_json(const nl::json &, Feat &)
void to_json(nl::json &, const Feat &)
bool use_batch
whether to use mini batch for training
bool backprop
turns on backpropagation
void set_terminals(int nf, const LongData &Z)
set the terminals with longitudinal data
void set_sample_weights(VectorXf &y)
sets the weights of each sample (and class weights)
vector< char > dtypes
data types of input parameters
unsigned int max_size
max size of programs (length)
void set_current_gen(int g)
sets current generation
unsigned int max_dim
maximum dimensionality of programs
bool classification
flag to conduct classification rather than regression
int max_time
max time for fit method
int max_stall
maximum stall in learning, in generations
int n_jobs
number of parallel jobs
float cross_rate
cross rate for variation
float feedback
strength of ml feedback on probabilities
vector< float > class_weights
weights for each class
bool hillclimb
turns on parameter hill climbing
unsigned int n_classes
number of classes for classification
string scorer
loss function argument
void init(const MatrixXf &X, const VectorXf &y)
bool erc
whether to include constants for terminals
void set_max_dim(unsigned int max_dim)
set maximum dimensionality of programs
unsigned int max_depth
max depth of programs
string ml
machine learner used with Feat
void set_max_depth(unsigned int max_depth)
set max depth of programs
int pop_size
population size
HC hc
stochastic hill climbing parameters
unsigned num_features
number of features
bool tune_initial
tune initial ML model
int current_gen
holds current generation
bool normalize
whether to normalize the input data
vector< string > objectives
Pareto objectives.
float root_xo_rate
crossover
NodeVector terminals
terminal nodes available in programs vector storing longitudinal data keys
void set_verbosity(int verbosity)
set level of debug info
vector< bool > protected_groups
protected attributes in X
void set_protected_groups(string fn)
vector< char > otypes
program output types ('f', 'b')
float split
fraction of data to use for training
bool shuffle
option to shuffle the data
void set_term_weights(const vector< float > &w)
sets weights for terminals.
void set_scorer(string sc="", bool initialized=false)
sets scorer type
bool tune_final
tune final ML model string of comma-delimited operator names, used to choose functions
string scorer_
actual loss function used, determined by scorer
bool corr_delete_mutate
use correlation delete mutation
int random_state
random seed
void update(const Population &pop, const Parameters &params)
vector< Individual > individuals
individual programs in the archive
void set_objectives(vector< string > objectives)
vector< size_t > roots() const
returns indices of root nodes
size_t subtree(size_t i, char otype='0', string indent="> ") const
Defines a population of programs and functions for constructing them.
void load(string filename)
void update(vector< size_t > survivors)
reduce programs to the indices in survivors.
string print_eqns(bool just_offspring=false, string sep="\n")
return population equations.
int size()
returns population size
vector< size_t > sorted_front(unsigned)
return complexity-sorted Pareto front indices.
void init(const Individual &starting_model, const Parameters &params, bool random=false, string filename="")
initialize population of programs with a starting model and/or from file
vector< Individual > individuals
individual programs
void save(string filename)
interfaces with selection operators.
string get_type()
return type of selection operator
vector< size_t > survive(Population &pop, const Parameters &params, const Data &d)
perform survival
vector< size_t > select(Population &pop, const Parameters &params, const Data &d)
perform selection
vector< unsigned > med_size
vector< float > med_loss_v
vector< unsigned > med_num_params
vector< unsigned > med_dim
void update(int index, float timer_count, float bst_score, float bst_score_v, float md_score, float md_loss_v, unsigned md_size, unsigned md_complexity, unsigned md_num_params, unsigned md_dim)
vector< unsigned > med_complexity
normalizes a matrix to unit variance, 0 mean centered.
void fit_normalize(MatrixBase< T > &X, const vector< char > &dtypes)
fit then normalize
void normalize(MatrixBase< T > &X) const
normalize matrix.