Feat C++ API
A feature engineering automation tool
feat.h
Go to the documentation of this file.
1 /* FEAT
2 copyright 2017 William La Cava
3 license: GNU/GPL v3
4 */
5 #ifndef FEAT_H
6 #define FEAT_H
7 
8 //external includes
9 #include <iostream>
10 #include <vector>
11 #include <memory>
12 #include <shogun/base/init.h>
13 
14 // internal includes
15 #include "init.h"
16 #include "util/rnd.h"
17 #include "util/logger.h"
18 #include "util/utils.h"
19 #include "util/io.h"
20 #include "params.h"
21 #include "pop/population.h"
22 #include "sel/selection.h"
23 #include "eval/evaluation.h"
24 #include "vary/variation.h"
25 #include "model/ml.h"
26 #include "pop/op/node.h"
27 #include "pop/archive.h"
29 
30 #ifdef USE_CUDA
31  #include "pop/cuda-op/cuda_utils.h"
32  #define GPU true
33 #else
34  #define GPU false
35  #define initialize_cuda() 0
36 #endif
37 
38 // stuff being used
39 using Eigen::MatrixXf;
40 using Eigen::ArrayXXf;
41 using Eigen::VectorXf;
42 using Eigen::ArrayXf;
43 typedef Eigen::Array<bool,Eigen::Dynamic,1> ArrayXb;
44 using std::vector;
45 using std::string;
46 using std::unique_ptr;
47 using std::shared_ptr;
48 using std::make_shared;
49 using std::cout;
50 namespace nl = nlohmann;
51 
56 namespace FT{
57 
58 using namespace Eval;
59 using namespace Vary;
60 
62 
72 class Feat
73 {
74  public :
75 
76  // Methods
77 
79 
80  // Feat(int pop_size=100, int gens = 100,
81  // string ml = "LinearRidgeRegression",
82  // bool classification = false, int verbosity = 2,
83  // int max_stall = 0, string sel ="lexicase",
84  // string surv="nsga2", float cross_rate = 0.5,
85  // float root_xo_rate = 0.5, char otype='a',
86  // string functions = "", unsigned int max_depth = 3,
87  // unsigned int max_dim = 10, int random_state=-1,
88  // bool erc = false, string obj="fitness,complexity",
89  // bool shuffle=true, float split=0.75, float fb=0.5,
90  // string scorer="", string feature_names="",
91  // bool backprop=false,int iters=10, float lr=0.1,
92  // int batch_size=0, int n_jobs=0, bool hillclimb=false,
93  // string logfile="", int max_time=-1, bool residual_xo = false,
94  // bool stagewise_xo = false, bool stagewise_tol = true,
95  // bool softmax_norm=false, int save_pop=0, bool normalize=true,
96  // bool val_from_arch=true, bool corr_delete_mutate=false,
97  // float simplify=0.0, string protected_groups="",
98  // bool tune_initial=false, bool tune_final=true,
99  // string starting_pop="");
100 
101  Feat(){}
103  ~Feat(){}
104 
105  void init();
107  inline void set_is_fitted(bool f){is_fitted=f;}
108  inline bool get_is_fitted(){return is_fitted;}
109 
111  void set_pop_size(int pop_size);
113  int get_pop_size();
114 
116  void set_gens(int gens);
118  int get_gens();
119 
121  void set_ml(string ml);
123  string get_ml();
124 
126  void set_classification(bool classification);
128  bool get_classification();
129 
131  void set_verbosity(int verbosity);
133  int get_verbosity();
134 
136  void set_max_stall(int max_stall);
138  int get_max_stall();
139 
141  void set_selection(string sel);
142 
144  void set_survival(string surv);
145 
147  float get_cross_rate();
149  void set_cross_rate(float cross_rate);
150 
152  void set_root_xo_rate(float cross_rate);
153  float get_root_xo_rate(){return this->params.root_xo_rate;};
154 
156  vector<char> get_otypes();
158  char get_otype(){return params.otype;};
160  void set_otype(char ot);
161 
163  void set_functions(const vector<string>& fns){ params.set_functions(fns); };
164  vector<string> get_functions(){return params.get_functions();};
165 
167  int get_max_depth();
169  void set_max_depth(unsigned int max_depth);
170 
172  int get_max_dim();
174  void set_max_dim(unsigned int max_dim);
175 
177  // void set_max_dim(string str);
178 
180  void set_random_state(int random_state);
181  int get_random_state() { return params.random_state; };
183  int get_random_state_() { return r.get_seed(); };
184 
186  bool get_erc();
188  void set_erc(bool erc);
189 
191  bool get_shuffle();
193  void set_shuffle(bool sh);
194 
195 
197  float get_split();
199  void set_split(float sp);
200 
202  vector<char> get_dtypes();
204  void set_dtypes(vector<char> dtypes);
205 
207  float get_fb();
209  void set_fb(float fb);
210 
212  string get_logfile();
214  void set_logfile(string s);
215 
216  // returns the input argument for scorer.
217  string get_scorer();
219  void set_scorer(string s);
220  // returns the actual scorer determined by the input argument.
221  string get_scorer_();
222 
223  void set_feature_names(string s){params.set_feature_names(s); };
224  string get_feature_names(){return params.get_feature_names(); };
225 
227  void set_backprop(bool bp);
228  bool get_backprop(){return params.backprop;};
229 
230  void set_simplify(float s);
231  float get_simplify(){return simplify;};
232 
233  void set_corr_delete_mutate(bool s);
234  bool get_corr_delete_mutate(){return params.corr_delete_mutate;};
235 
236  void set_hillclimb(bool hc);
237  bool get_hillclimb(){return params.hillclimb;};
238 
239  void set_iters(int iters);
240  int get_iters(){return params.bp.iters;};
241 
242  void set_lr(float lr);
243  float get_lr(){return params.bp.learning_rate;};
244 
245  int get_batch_size(){return params.bp.batch_size;};
246  void set_batch_size(int bs);
247 
249  void set_n_jobs(unsigned t);
250  int get_n_jobs(){return omp_get_num_threads();}; // NOTE(review): omp_get_num_threads() reports the size of the current OpenMP team (1 when called outside a parallel region), so this may not reflect the value passed to set_n_jobs(); omp_get_max_threads() is presumably what is intended -- confirm, and check that init.h provides a matching non-OpenMP fallback.
251 
253  void set_max_time(int time);
254  int get_max_time(){return params.max_time;};
255 
257  void set_use_batch();
258 
260  void set_residual_xo(bool res_xo=true){params.residual_xo=res_xo;};
261  bool get_residual_xo(){return params.residual_xo;};
262 
264  void set_stagewise_xo(bool sem_xo=true){ params.stagewise_xo=sem_xo; };
265  bool get_stagewise_xo(){return params.stagewise_xo;};
266 
267  void set_stagewise_xo_tol(int tol){ params.stagewise_xo_tol = tol;};
268  int get_stagewise_xo_tol(){return params.stagewise_xo_tol;};
269 
271  void set_softmax_norm(bool sftmx=true){params.softmax_norm=sftmx;};
272  bool get_softmax_norm(){return params.softmax_norm;};
273 
274  void set_save_pop(int pp){ save_pop=pp; };
275  int get_save_pop(){ return save_pop; };
276 
277  void set_starting_pop(string sp){ starting_pop=sp; };
278  string get_starting_pop(){ return starting_pop; };
279 
280  void set_normalize(bool in){params.normalize = in;};
281  bool get_normalize(){return params.normalize;};
282 
283  string get_sel(){return this->selector.get_type();};
284  void set_sel(string in){this->selector.set_type(in); };
285 
286  string get_surv(){return this->survivor.get_type();};
287  void set_surv(string in){this->survivor.set_type(in); };
288 
289  bool get_tune_initial(){ return this->params.tune_initial;};
290  void set_tune_initial(bool in){ this->params.tune_initial = in;};
291 
292  bool get_tune_final(){ return this->params.tune_final;};
293  void set_tune_final(bool in){ this->params.tune_final = in;};
294 
295 
297  auto get_objectives(){return params.get_objectives(); };
299  void set_objectives(const vector<string>& obj){params.set_objectives(obj);};
300 
301  string get_protected_groups(){ return params.get_protected_groups(); };
303  void set_protected_groups(string pg);
304 
305  bool get_val_from_arch(){return val_from_arch; };
306  void set_val_from_arch(bool in){val_from_arch = in; };
307 
308  /*
309  * solo getters
310  */
312  int get_archive_size(){ return this->archive.individuals.size(); };
314  int get_max_size();
316  int get_num_features();
318  string get_representation();
320  string get_model(bool sort=true);
322  string get_ind_eqn(bool sort, Individual& ind);
323  string get_eqn(bool sort=false);
325  int get_n_params();
327  int get_dim();
329  int get_complexity();
331  vector<nl::json> get_archive(bool front);
333  ArrayXf get_coefs();
335  int get_n_nodes();
337  LongData get_Z(string s,
338  int * idx, int idx_size);
339 
340 
341 
343  void fit(MatrixXf& X, VectorXf& y);
344  void fit(MatrixXf& X, VectorXf& y, LongData& Z);
345 
346  void run_generation(unsigned int g,
347  vector<size_t> survivors,
348  DataRef &d,
349  std::ofstream &log,
350  float percentage,
351  unsigned& stall_count);
352 
354  VectorXf predict(MatrixXf& X, LongData& Z);
355  VectorXf predict(MatrixXf& X);
356 
358  VectorXf predict_archive(int id, MatrixXf& X);
359  VectorXf predict_archive(int id, MatrixXf& X, LongData& Z);
360  ArrayXXf predict_proba_archive(int id, MatrixXf& X, LongData& Z);
361  ArrayXXf predict_proba_archive(int id, MatrixXf& X);
362 
364  shared_ptr<CLabels> predict_labels(MatrixXf& X, LongData Z = LongData());
365 
367  ArrayXXf predict_proba(MatrixXf& X, LongData& Z);
368  ArrayXXf predict_proba(MatrixXf& X);
369 
371  MatrixXf transform(MatrixXf& X);
372  MatrixXf transform(MatrixXf& X, LongData& Z);
373  MatrixXf transform(MatrixXf& X, LongData Z, Individual *ind);
374 
376  float score(MatrixXf& X, const VectorXf& y,
377  LongData Z = LongData());
378 
380  nl::json get_stats();
381 
383  void load_best_ind(string filename);
384 
386  void load_population(string filename, bool justfront=false);
387 
389  // void load(const string& feat_state);
390  void load(const json& j);
392  void load_from_file(string filename);
394  json save() const;
396  void save_to_file(string filename);
397 
398  bool is_fitted;
399  private:
400  // Parameters
402 
404  // subclasses for main steps of the evolutionary routine
411  bool use_arch;
412  string survival;
414  // performance tracking
415  float min_loss;
416  float min_loss_v;
419  string str_dim;
420  string starting_pop;
422  string logfile;
423  int save_pop;
425  float simplify;
427 
428  /* functions */
430  bool update_best(const DataRef& d, bool val=false);
431 
433  void calculate_stats(const DataRef& d);
434  void print_stats(std::ofstream& log,
435  float fraction);
436  void log_stats(std::ofstream& log);
437 
438  // gets weights via univariate initial models
439  vector<float> univariate_initial_model(DataRef &d, int n_feats);
441  void initial_model(DataRef &d);
443  void final_model(DataRef& d);
445  void simplify_model(DataRef& d, Individual&);
447  void update_stall_count(unsigned& stall_count, bool updated);
448 
449  //serialization
451  params,
452  pop,
453  selector,
454  survivor,
455  archive,
456  use_arch,
457  survival,
458  N,
459  min_loss,
460  min_loss_v,
461  best_med_score,
462  best_complexity,
463  str_dim,
464  starting_pop,
465  best_ind,
466  is_fitted
467  );
468 
469 };
470 
471 // forward declarations
472 void to_json(nl::json&, const Feat&);
473 void from_json(const nl::json&, Feat&);
474 
475 // serialization
476 } // FT
477 #endif
evaluation mixin class for Feat
Definition: evaluation.h:34
main class for the Feat learner.
Definition: feat.h:73
void set_functions(const vector< string > &fns)
sets available functions from a vector of function names.
Definition: feat.h:163
void set_starting_pop(string sp)
Definition: feat.h:277
int get_archive_size()
return archive size
Definition: feat.h:312
void set_normalize(bool in)
Definition: feat.h:280
bool get_tune_initial()
Definition: feat.h:289
void set_feature_names(string s)
Definition: feat.h:223
~Feat()
destructor
Definition: feat.h:103
void set_tune_initial(bool in)
Definition: feat.h:290
bool get_normalize()
Definition: feat.h:281
float get_root_xo_rate()
Definition: feat.h:153
void set_tune_final(bool in)
Definition: feat.h:293
int get_n_jobs()
Definition: feat.h:250
void set_surv(string in)
Definition: feat.h:287
int save_pop
controls whether the population is printed each generation
Definition: feat.h:423
string starting_pop
file with starting population
Definition: feat.h:420
float simplify
post-run simplification
Definition: feat.h:425
void set_val_from_arch(bool in)
Definition: feat.h:306
int get_iters()
Definition: feat.h:240
bool get_softmax_norm()
Definition: feat.h:272
int best_complexity
complexity of the best model
Definition: feat.h:418
float min_loss
current best score
Definition: feat.h:415
float get_simplify()
Definition: feat.h:231
void set_is_fitted(bool f)
set flag indicating whether fit has been called
Definition: feat.h:107
Population pop
population of programs
Definition: feat.h:405
bool use_arch
internal control over use of archive
Definition: feat.h:411
bool get_stagewise_xo()
Definition: feat.h:265
int get_random_state_()
returns the actual seed determined by the input argument.
Definition: feat.h:183
auto get_objectives()
get objectives for multi-objective search
Definition: feat.h:297
char get_otype()
return parameter otype, used to set otypes
Definition: feat.h:158
bool val_from_arch
model selection only uses Pareto front
Definition: feat.h:424
Individual best_ind
best individual
Definition: feat.h:421
string get_sel()
Definition: feat.h:283
float get_lr()
Definition: feat.h:243
string get_surv()
Definition: feat.h:286
Log_Stats stats
runtime stats
Definition: feat.h:426
bool get_corr_delete_mutate()
Definition: feat.h:234
string get_protected_groups()
Definition: feat.h:301
void set_residual_xo(bool res_xo=true)
use residual crossover
Definition: feat.h:260
Variation variator
variation operators
Definition: feat.h:408
string str_dim
dimensionality as multiple of number of cols
Definition: feat.h:419
string survival
stores survival mode
Definition: feat.h:412
void set_objectives(const vector< string > &obj)
set objectives for multi-objective search
Definition: feat.h:299
int get_max_time()
Definition: feat.h:254
void set_stagewise_xo_tol(int tol)
Definition: feat.h:267
Evaluation evaluator
evaluation code
Definition: feat.h:407
string get_feature_names()
Definition: feat.h:224
bool get_is_fitted()
Definition: feat.h:108
bool get_residual_xo()
Definition: feat.h:261
Selection selector
selection algorithm
Definition: feat.h:406
float min_loss_v
best validation score
Definition: feat.h:416
bool get_backprop()
Definition: feat.h:228
string logfile
log filename
Definition: feat.h:422
Parameters params
hyperparameters of Feat
Definition: feat.h:401
bool get_hillclimb()
Definition: feat.h:237
bool get_tune_final()
Definition: feat.h:292
int get_stagewise_xo_tol()
Definition: feat.h:268
void set_sel(string in)
Definition: feat.h:284
bool is_fitted
keeps track of whether fit was called.
Definition: feat.h:398
Archive archive
pareto front archive
Definition: feat.h:410
void set_softmax_norm(bool sftmx=true)
use softmax
Definition: feat.h:271
vector< string > get_functions()
Definition: feat.h:164
void set_stagewise_xo(bool sem_xo=true)
use stagewise crossover
Definition: feat.h:264
NLOHMANN_DEFINE_TYPE_INTRUSIVE(Feat, params, pop, selector, survivor, archive, use_arch, survival, N, min_loss, min_loss_v, best_med_score, best_complexity, str_dim, starting_pop, best_ind, is_fitted)
string get_starting_pop()
Definition: feat.h:278
float best_med_score
best median population score
Definition: feat.h:417
int get_save_pop()
Definition: feat.h:275
Feat()
member initializer list constructor
Definition: feat.h:101
Selection survivor
survival algorithm
Definition: feat.h:409
Timer timer
start time of training
Definition: feat.h:403
bool get_val_from_arch()
Definition: feat.h:305
int get_random_state()
Definition: feat.h:181
int get_batch_size()
Definition: feat.h:245
Normalizer N
scales training data.
Definition: feat.h:413
void set_save_pop(int pp)
Definition: feat.h:274
individual programs in the population
Definition: individual.h:31
int get_seed()
Definition: rnd.h:40
class for timing things.
Definition: utils.h:104
std::map< string, std::pair< vector< ArrayXf >, vector< ArrayXf > > > LongData
Definition: data.h:23
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition: feat.h:43
#define omp_get_num_threads()
Definition: init.h:13
T pop(vector< T > *v)
Definition: auto_backprop.h:49
bool in(const vector< T > v, const T &i)
check if element is in vector.
Definition: utils.h:47
static Rnd & r
Definition: rnd.h:135
main Feat namespace
Definition: data.cc:13
void from_json(const nl::json &, Feat &)
void to_json(nl::json &, const Feat &)
holds the hyperparameters for Feat.
Definition: params.h:25
Defines a population of programs and functions for constructing them.
Definition: population.h:28
interfaces with selection operators.
Definition: selection.h:36
normalizes a matrix to zero mean and unit variance.
Definition: utils.h:147