Feat C++ API
A feature engineering automation tool
ml.cc
/* FEAT
copyright 2017 William La Cava
license: GNU/GPL v3
*/

#include "ml.h"

using namespace shogun;

namespace FT{
namespace Model{

// global default ML parameters
map<ML_TYPE, float> C_DEFAULT = {
    {LARS, 0},
    {Ridge, 1e-6},
    {LR, 1.0},
    {L1_LR, 1.0}
};
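// Note: the interpretation of C varies by learner. In init() below, LARS
// uses C as the maximum number of non-zero coefficients (set_max_non_zero),
// Ridge uses it as the regularization strength tau (set_tau), and the
// logistic regression variants pass it to liblinear as the C penalty
// parameter (set_C / set_z).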

ML::ML(string ml, bool norm, bool classification, int n_classes)
{
    ml_hash["Lasso"] = LARS;
    ml_hash["LinearRidgeRegression"] = Ridge;
    ml_hash["Ridge"] = Ridge;
    ml_hash["SVM"] = SVM;
    ml_hash["RandomForest"] = RF;
    ml_hash["CART"] = CART;
    ml_hash["LR"] = LR;
    ml_hash["L2_LR"] = LR;
    ml_hash["L1_LR"] = L1_LR;
    ml_hash["RF"] = RF;

    ml_str = ml;

    if (ml_hash.find(ml) == ml_hash.end())
    {
        // not found
        THROW_INVALID_ARGUMENT("ml type '" + ml + "' not defined");
    }
    else
        this->ml_type = ml_hash.at(ml);

    this->prob_type = PT_REGRESSION;
    max_train_time = 30;
    normalize = norm;
    if (classification)
    {
        if (n_classes == 2)
            this->prob_type = PT_BINARY;
        else
            this->prob_type = PT_MULTICLASS;
    }
    this->C = C_DEFAULT.at(ml_type);
    this->init(true);

    // force normalization to be ON if using a linear model; improves stability
    if (in({LARS, Ridge, SVM, LR, L1_LR}, ml_type) && !normalize)
    {
        this->normalize = true;
        logger.log("Using ML normalization since a linear method was specified",
                3);
    }
}
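// Usage sketch (a hypothetical example; Feat normally constructs ML
// internally): a binary L2-regularized logistic regression learner with
// input normalization enabled:
//
//     FT::Model::ML clf("LR", true, true, 2);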

void ML::init(bool assign_p_est)
{
    if (assign_p_est)
        this->p_est.reset();
    // set up ML based on type
    if (ml_type == LARS)
    {
        if (assign_p_est)
            p_est = make_shared<sh::CLeastAngleRegression>(true);
        dynamic_pointer_cast<sh::CLeastAngleRegression>(
                p_est)->set_max_non_zero(int(this->C));
    }
    else if (ml_type == Ridge)
    {
        if (assign_p_est)
            p_est = make_shared<sh::CLinearRidgeRegression>();
        auto typed_p_est = dynamic_pointer_cast<sh::CLinearRidgeRegression>(
                p_est);
        typed_p_est->set_compute_bias(true);
        typed_p_est->set_tau(this->C);
    }
    else if (ml_type == RF)
    {
        if (assign_p_est)
            p_est = make_shared<sh::CMyRandomForest>();
        auto typed_p_est = dynamic_pointer_cast<sh::CMyRandomForest>(p_est);
        typed_p_est->set_machine_problem_type(this->prob_type);
        typed_p_est->set_num_bags(10);

        if (this->prob_type != PT_REGRESSION)
        {
            auto CR = some<sh::CMajorityVote>();
            typed_p_est->set_combination_rule(CR);
        }
        else
        {
            auto CR = some<sh::CMeanRule>();
            typed_p_est->set_combination_rule(CR);
        }
    }
    else if (ml_type == CART)
    {
        if (assign_p_est)
            p_est = make_shared<sh::CMyCARTree>();
        dynamic_pointer_cast<sh::CMyCARTree>(
                p_est)->set_machine_problem_type(this->prob_type);
        dynamic_pointer_cast<sh::CMyCARTree>(
                p_est)->set_max_depth(6);
    }
    else if (ml_type == SVM)
    {
        if (this->prob_type == PT_BINARY)
        {
            if (assign_p_est)
                p_est = make_shared<sh::CMyLibLinear>(
                        sh::L2R_L2LOSS_SVC_DUAL);
        }
        else if (this->prob_type == PT_MULTICLASS)
        {
            if (assign_p_est)
                p_est = make_shared<CMyMulticlassLibLinear>();
            dynamic_pointer_cast<CMyMulticlassLibLinear>(
                    p_est)->set_prob_heuris(sh::OVA_SOFTMAX);
        }
        else // SVR
        {
            if (assign_p_est)
                p_est = make_shared<sh::CLibLinearRegression>();
        }
    }
    else if (in({LR, L1_LR}, ml_type))
    {
        assert(this->prob_type != PT_REGRESSION
                && "LR only works with classification.");
        if (this->prob_type == PT_BINARY)
        {
            if (this->ml_type == LR)
            {
                if (assign_p_est)
                    p_est = make_shared<sh::CMyLibLinear>(sh::L2R_LR);
            }
            else
            {
                if (assign_p_est)
                    p_est = make_shared<sh::CMyLibLinear>(sh::L1R_LR);
            }

            auto typed_p_est = dynamic_pointer_cast<sh::CMyLibLinear>(p_est);
            // set parameters to match sklearn defaults
            typed_p_est->set_bias_enabled(true);
            typed_p_est->set_epsilon(0.0001);
            typed_p_est->set_max_iterations(100);
            typed_p_est->set_C(this->C, this->C);
        }
        else // multiclass
        {
            if (assign_p_est)
                p_est = make_shared<sh::CMulticlassLogisticRegression>();

            auto typed_p_est =
                dynamic_pointer_cast<sh::CMulticlassLogisticRegression>(p_est);
            typed_p_est->set_prob_heuris(sh::OVA_SOFTMAX);
            typed_p_est->set_z(this->C);
            typed_p_est->set_epsilon(0.0001);
            typed_p_est->set_max_iter(100);
        }
    }
    else
        THROW_INVALID_ARGUMENT("'" + ml_str + "' is not a valid ml choice\n");

    // set maximum training time per model
    p_est->set_max_train_time(max_train_time);
}

ML::~ML()
{
    this->p_est.reset();
}

void ML::set_dtypes(const vector<char>& dtypes)
{
    if (ml_type == CART || ml_type == RF)
    {
        // set attribute types: true if boolean, false if
        // continuous/ordinal
        sh::SGVector<bool> dt(dtypes.size());
        for (unsigned i = 0; i < dtypes.size(); ++i)
            dt[i] = dtypes.at(i) == 'b';
        if (ml_type == CART)
            dynamic_pointer_cast<sh::CMyCARTree>(
                    p_est)->set_feature_types(dt);
        else if (ml_type == RF)
            dynamic_pointer_cast<sh::CMyRandomForest>(
                    p_est)->set_feature_types(dt);
    }
    this->dtypes = dtypes;
}
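// Example (hypothetical, assuming 'f' marks continuous columns as in
// find_dtypes): for columns [continuous, boolean, continuous], dtypes
// would be {'f', 'b', 'f'}, so only the middle column is flagged as a
// boolean feature for CART/RF above.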

vector<float> ML::get_weights(bool norm_adjust) const
{
    vector<double> w;

    if (in({LARS, Ridge, SVM, LR, L1_LR}, ml_type))
    {
        /* For multiclass, return the average weight magnitude over
         * the OVR models. These weights are normalized.
         */
        if (this->prob_type == PT_MULTICLASS
                && in({LR, L1_LR, SVM}, ml_type))
        {
            vector<SGVector<double>> weights;

            if (in({LR, L1_LR}, ml_type))
                weights = dynamic_pointer_cast<
                    sh::CMulticlassLogisticRegression>(p_est)->get_w();
            else // SVM
                weights =
                    dynamic_pointer_cast<sh::CMyMulticlassLibLinear>(
                            p_est)->get_w();

            if (weights.empty())
                return vector<float>();

            w = vector<double>(weights.at(0).size());

            // we have to average the absolute weights across
            // estimators in order to return one weight for each feature
            for (int i = 0; i < weights.size(); ++i)
            {
                for (int j = 0; j < weights.at(i).size(); ++j)
                {
                    w.at(j) += fabs(weights.at(i)[j]);
                }
            }
            // normalize by the number of classes
            for (int i = 0; i < w.size(); ++i)
                w.at(i) = w.at(i)/weights.size();

            return vector<float>(w.begin(), w.end());
        }
        /* For linear regression and binary classification
         * models, return the true weights. */
        else
        {
            auto tmp = dynamic_pointer_cast<sh::CLinearMachine>(
                    p_est)->get_w().clone();

            if (this->normalize && norm_adjust)
            {
                this->N.adjust_weights(tmp);
            }
            w.assign(tmp.data(), tmp.data() + tmp.size());
        }
    }
    /* For decision trees, return the feature importance scores. */
    else if (ml_type == CART)
        w = dynamic_pointer_cast<sh::CMyCARTree>(
                p_est)->feature_importances();
    else
        w = dynamic_pointer_cast<sh::CMyRandomForest>(
                p_est)->feature_importances();

    return vector<float>(w.begin(), w.end());
}

shared_ptr<CLabels> ML::fit(const MatrixXf& X, const VectorXf& y,
        const Parameters& params, bool& pass,
        const vector<char>& dtypes)
{
    init(true);

    MatrixXd _X = X.cast<double>();
    VectorXd _y = y.cast<double>();

    if (ml_type == RF)
    {
        int max_feats = std::sqrt(X.rows());
        dynamic_pointer_cast<sh::CMyRandomForest>(
                p_est)->set_num_random_features(max_feats);
    }

    // for tree-based methods we need to specify data types
    if (ml_type == RF || ml_type == CART)
    {
        if (dtypes.empty())
            set_dtypes(params.dtypes);
        else
            set_dtypes(dtypes);
    }

    if (normalize)
    {
        if (dtypes.empty())
            N.fit_normalize(_X, find_dtypes(X));
        else
            N.fit_normalize(_X, dtypes);
    }

    if (_X.isZero(0.0001))
    {
        logger.log("Setting labels to zero since features are zero\n",
                3);

        shared_ptr<CLabels> labels;

        switch (this->prob_type)
        {
            case PT_REGRESSION:
                labels = std::shared_ptr<CLabels>(
                        new CRegressionLabels(_y.size()));
                break;
            case PT_BINARY:
                labels = std::shared_ptr<CLabels>(
                        new CBinaryLabels(_y.size()));
                break;
            case PT_MULTICLASS:
                labels = std::shared_ptr<CLabels>(
                        new CMulticlassLabels(_y.size()));
                break;
        }

        pass = false;
        return labels;
    }

    auto features = some<CDenseFeatures<float64_t>>(
            SGMatrix<float64_t>(_X));

    // for liblinear L1, we have to transpose the features during training
    if (ml_type == L1_LR && this->prob_type == PT_BINARY)
        features = features->get_transposed();

    if (this->prob_type == PT_BINARY && in({LR, L1_LR, SVM}, ml_type))
    {
        // binary classification
        p_est->set_labels(
                some<CBinaryLabels>(SGVector<float64_t>(_y), 0.5));
    }
    else if (this->prob_type == PT_MULTICLASS)
        // multiclass classification
        p_est->set_labels(some<CMulticlassLabels>(
                    SGVector<float64_t>(_y)));
    else
        // regression
        p_est->set_labels(some<CRegressionLabels>(
                    SGVector<float64_t>(_y)));

    // train ml
    logger.log("ML training on thread "
            + std::to_string(omp_get_thread_num()) + "...", 3, " ");
    // *** Train the model ***
    try
    {
        p_est->train(features);
    }
    catch (...)
    {
        logger.log("Shogun failed to train", 3);
    }

    logger.log("done!", 3);

    // transpose features back
    if (ml_type == L1_LR && this->prob_type == PT_BINARY)
        features = features->get_transposed();

    logger.log("exiting ml::fit", 3);
    auto y_pred = this->retrieve_labels(features, true, pass);
    features->free_features();
    return y_pred;
}
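// Note: Shogun's CDenseFeatures treats each matrix column as one sample,
// so X is features-by-samples here (predict() likewise sizes its labels
// by _X.cols()). A hypothetical call sketch:
//
//     bool pass = true;
//     auto labels = clf.fit(X, y, params, pass, params.dtypes);
//     if (!pass)
//         ; // training produced empty, NaN, or infinite predictions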

VectorXf ML::fit_vector(const MatrixXf& X, const VectorXf& y,
        const Parameters& params, bool& pass,
        const vector<char>& dtypes)
{
    shared_ptr<CLabels> labels = fit(X, y, params, pass, dtypes);

    return labels_to_vector(labels);
}

shared_ptr<CLabels> ML::predict(const MatrixXf& X, bool print)
{
    logger.log("ML::predict...", 3);
    shared_ptr<CLabels> labels;
    logger.log("X size: " + to_string(X.rows()) + "x" + to_string(X.cols()), 3);
    MatrixXd _X = X.template cast<double>();
    logger.log("cast X to double", 3);

    /* Make sure the model's fit() method passed by
     * checking for empty weights.
     * If the weights are empty, assign dummy labels. */
    if (get_weights().empty())
    {
        logger.log("weights empty; returning zeros", 3);
        if (this->prob_type == PT_BINARY)
        {
            labels = std::shared_ptr<CLabels>(
                    new CBinaryLabels(_X.cols()));
            for (unsigned i = 0; i < _X.cols(); ++i)
            {
                dynamic_pointer_cast<CBinaryLabels>(
                        labels)->set_value(0, i);
                dynamic_pointer_cast<CBinaryLabels>(
                        labels)->set_label(i, 0);
            }
            return labels;
        }
        else if (this->prob_type == PT_MULTICLASS)
        {
            labels = std::shared_ptr<CLabels>(
                    new CMulticlassLabels(_X.cols()));
            for (unsigned i = 0; i < _X.cols(); ++i)
            {
                dynamic_pointer_cast<CMulticlassLabels>(
                        labels)->set_value(0, i);
                dynamic_pointer_cast<CMulticlassLabels>(
                        labels)->set_label(i, 0);
            }
            return labels;
        }
        else
        {
            labels = std::shared_ptr<CLabels>(
                    new CRegressionLabels(_X.cols()));
            for (unsigned i = 0; i < _X.cols(); ++i)
            {
                dynamic_pointer_cast<CRegressionLabels>(
                        labels)->set_value(0, i);
                dynamic_pointer_cast<CRegressionLabels>(
                        labels)->set_label(i, 0);
            }
            return labels;
        }
    }

    /* Otherwise, apply normalization and retrieve labels
     * from the model. */
    if (normalize)
        N.normalize(_X);

    auto features = some<CDenseFeatures<float64_t>>(
            SGMatrix<float64_t>(_X));

    bool pass = true;
    auto y_pred = this->retrieve_labels(features, true, pass);
    features->free_features();
    return y_pred;
}

VectorXf ML::predict_vector(const MatrixXf& X)
{
    shared_ptr<CLabels> labels = predict(X);
    return labels_to_vector(labels);
}

ArrayXXf ML::predict_proba(const MatrixXf& X)
{
    shared_ptr<CLabels> labels = shared_ptr<CLabels>(predict(X));

    if (this->prob_type == PT_BINARY
            && in({SVM, LR, L1_LR, CART, RF}, ml_type))
    {
        shared_ptr<CBinaryLabels> BLabels =
            dynamic_pointer_cast<CBinaryLabels>(labels);
        SGVector<double> tmp = BLabels->get_values();
        ArrayXXd confidences(1, tmp.size());
        confidences.row(0) = Map<ArrayXd>(tmp.data(), tmp.size());
        return confidences.template cast<float>();
    }
    else if (this->prob_type == PT_MULTICLASS)
    {
        shared_ptr<CMulticlassLabels> MLabels =
            dynamic_pointer_cast<CMulticlassLabels>(labels);

        /* AFAIK, the only reliable way to get the number of classes
         * is to measure the output */
        int n_classes = MLabels->get_multiclass_confidences(0).size();
        MatrixXd confidences(n_classes,
                MLabels->get_num_labels());
        for (int i = 0; i < confidences.cols(); ++i)
        {
            SGVector<double> tmp =
                MLabels->get_multiclass_confidences(i);
            confidences.col(i) = Map<ArrayXd>(tmp.data(), tmp.size());
        }

        return confidences.template cast<float>();
    }
    else
        THROW_INVALID_ARGUMENT("Error: predict_proba not defined for "
                "problem type or ML method");
    return ArrayXXf();
}
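// Shape note: for binary problems, predict_proba returns a
// 1 x n_samples array of confidences; for multiclass, an
// n_classes x n_samples array with one column per sample.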

VectorXf ML::labels_to_vector(const shared_ptr<CLabels>& labels)
{
    SGVector<double> y_pred;
    if (this->prob_type == PT_BINARY
            && in({SVM, LR, L1_LR, CART, RF}, ml_type))
        y_pred = dynamic_pointer_cast<sh::CBinaryLabels>(
                labels)->get_labels();
    else if (this->prob_type != PT_REGRESSION)
        y_pred = dynamic_pointer_cast<sh::CMulticlassLabels>(
                labels)->get_labels();
    else
        y_pred = dynamic_pointer_cast<sh::CRegressionLabels>(
                labels)->get_labels();

    Map<VectorXd> yhat(y_pred.data(), y_pred.size());

    if (this->prob_type == PT_BINARY
            && in({SVM, LR, L1_LR, CART, RF}, ml_type))
        // convert Shogun's -1/+1 binary labels to 0/1
        yhat = (yhat.cast<int>().array() == -1).select(0, yhat);

    VectorXf yhatf = yhat.template cast<float>();
    clean(yhatf);
    return yhatf;
}

float ML::get_bias(bool norm_adjust) const
{
    // get the bias weight; only works with linear machines
    if (in({L1_LR, LR, LARS, Ridge}, ml_type))
    {
        // in the multiclass case, return the average bias
        if (this->prob_type == sh::PT_MULTICLASS)
        {
            auto biases =
                dynamic_pointer_cast<sh::CMulticlassLogisticRegression>(p_est)
                ->get_bias();
            return accumulate(biases.begin(), biases.end(), 0.0)/biases.size();
        }
        else
        {
            if (this->normalize && norm_adjust)
            {
                float b = dynamic_pointer_cast<sh::CLinearMachine>(
                        p_est)->get_bias();
                auto tmp = dynamic_pointer_cast<sh::CLinearMachine>(
                        p_est)->get_w().clone();
                return this->N.adjust_offset(tmp, b);
            }
            else
                return dynamic_pointer_cast<sh::CLinearMachine>(p_est)->get_bias();
        }
    }
    else
        return 0;
}

void ML::set_bias(float b)
{
    // set the bias weight; only works with linear machines
    if (in({L1_LR, LR, LARS, Ridge}, ml_type)
            && this->prob_type != sh::PT_MULTICLASS)
    {
        dynamic_pointer_cast<sh::CLinearMachine>(
                p_est)->set_bias(b);
    }
    else
        THROW_RUNTIME_ERROR("WARNING: Couldn't set bias, "
                "not a binary linear machine");
}

shared_ptr<CLabels> ML::retrieve_labels(CDenseFeatures<float64_t>* features,
        bool proba, bool& pass)
{
    logger.log("ML::retrieve_labels", 3);
    shared_ptr<CLabels> labels;
    SGVector<double> y_pred;

    if (this->prob_type == PT_BINARY &&
            in({LR, L1_LR, SVM, CART, RF}, ml_type))
    {
        labels = shared_ptr<CLabels>(
                p_est->apply_binary(features));

        if (proba)
        {
            if (ml_type == CART)
            {
                dynamic_pointer_cast<sh::CMyCARTree>(p_est)->
                    set_probabilities(labels.get(), features);
            }
            else if (ml_type == RF)
            {
                dynamic_pointer_cast<sh::CMyRandomForest>(p_est)->
                    set_probabilities(labels.get(), features);
            }
            else
            {
                dynamic_pointer_cast<sh::CMyLibLinear>(p_est)->
                    set_probabilities(labels.get(), features);
            }
        }
        y_pred = dynamic_pointer_cast<sh::CBinaryLabels>(
                labels)->get_labels();
    }
    else if (this->prob_type != PT_REGRESSION)
    {
        // multiclass classification
        labels = shared_ptr<CLabels>(
                p_est->apply_multiclass(features));
        y_pred = dynamic_pointer_cast<sh::CMulticlassLabels>(
                labels)->get_labels();
    }
    else
    {
        // regression
        labels = shared_ptr<CLabels>(
                p_est->apply_regression(features));
        y_pred = dynamic_pointer_cast<sh::CRegressionLabels>(
                labels)->get_labels();
    }
    // map to an Eigen vector to check the output
    Map<VectorXd> yhat(y_pred.data(), y_pred.size());

    if (isinf(yhat.array()).any() || isnan(yhat.array()).any()
            || yhat.size() == 0)
        pass = false;

    return labels;
}

shared_ptr<CLabels> ML::fit_tune(MatrixXf& X, VectorXf& y,
        const Parameters& params, bool& pass, const vector<char>& dtypes,
        bool set_default)
{
    logger.log("tuning C...", 2);
    LongData Z;
    DataRef d_cv(X, y, Z, params.classification,
            params.protected_groups);
    FT::Eval::Scorer S(params.scorer_);
    // for linear models, tune the regularization strength
    if (in({LARS, Ridge, L1_LR, LR}, this->ml_type))
    {
        vector<float> Cs;
        switch (this->ml_type)
        {
            case LARS:
                // in this case C is the max number of non-zero coefficients
                Cs.resize(X.rows() - 1);
                iota(Cs.begin(), Cs.end(), 1);
                break;
            case Ridge:
                Cs = {1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2};
                break;
            default:
                Cs = {1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3};
        }

        int n_splits = 10;
        MatrixXf losses(Cs.size(), n_splits);
        VectorXf dummy;

        for (int i = 0; i < n_splits; ++i)
        {
            logger.log("split " + to_string(i) + "...", 3);
            d_cv.train_test_split(true, 0.8);

            for (int j = 0; j < Cs.size(); ++j)
            {
                this->C = Cs.at(j);
                this->fit(d_cv.t->X, d_cv.t->y,
                        params, pass, this->dtypes);

                losses(j, i) = S.score(d_cv.v->y,
                        this->predict(d_cv.v->X),
                        dummy, params.class_weights);
            }
        }
        // get the mean loss for each C
        VectorXf mean_loss = losses.rowwise().mean();
        string cv_report = "mean_loss (" + to_string(mean_loss.size())
            + "): \n";
        for (int i = 0; i < Cs.size(); ++i)
            cv_report += "C = " + to_string(Cs.at(i)) + ", mean_loss = "
                + to_string(mean_loss(i)) + "\n";
        VectorXf::Index min_index;
        float min_loss = mean_loss.minCoeff(&min_index);
        float best_C = Cs.at(min_index);
        cv_report += "best C: " + to_string(best_C) + "\n";
        logger.log(cv_report, 2);
        // set the best C and fit a final model to all the data with it
        this->C = best_C;
        if (set_default)
        {
            C_DEFAULT.at(this->ml_type) = best_C;
            logger.log("changing C_DEFAULT: "
                    + to_string(C_DEFAULT[ml_type]), 2);
        }
        return this->fit(X, y, params, pass, dtypes);
    }
    return shared_ptr<CLabels>();
}
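// Tuning sketch: fit_tune draws 10 random 80/20 train/validation splits,
// scores every candidate C on each validation fold, keeps the C with the
// lowest mean loss, and refits on all the data. A hypothetical call:
//
//     bool pass = true;
//     auto labels = clf.fit_tune(X, y, params, pass, params.dtypes);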

void to_json(json& j, const ML& ml)
{
    j["ml_type"] = ml.ml_type;
    j["ml_str"] = ml.ml_str;
    j["prob_type"] = ml.prob_type;
    j["N"] = ml.N;
    j["max_train_time"] = ml.max_train_time;
    j["normalize"] = ml.normalize;
    j["C"] = ml.C;
    // if ml is a linear model, store the weights and bias so it can be
    // reproduced. multiclass is handled first.
    if (in({LARS, Ridge, LR, L1_LR, SVM}, ml.ml_type))
    {
        if (ml.prob_type == PT_MULTICLASS
                && in({LR, L1_LR, SVM}, ml.ml_type))
        {
            vector<SGVector<double>> shogun_weights;
            vector<double> shogun_biases;

            if (in({LR, L1_LR}, ml.ml_type))
            {
                shogun_weights = dynamic_pointer_cast<
                    sh::CMulticlassLogisticRegression>(ml.p_est)->get_w();
                shogun_biases = dynamic_pointer_cast<
                    sh::CMulticlassLogisticRegression>(ml.p_est)->get_bias();
            }
            else // SVM
                shogun_weights =
                    dynamic_pointer_cast<sh::CMyMulticlassLibLinear>(
                            ml.p_est)->get_w();

            vector<VectorXd> weights;
            for (int i = 0; i < shogun_weights.size(); ++i)
            {
                //TODO: fix this, grab shogun data from underlying array
                weights.push_back(VectorXd());
                weights.at(i) = Map<VectorXd>(shogun_weights.at(i).data(),
                        shogun_weights.at(i).size());
            }
            j["w"] = weights;
            j["bias"] = shogun_biases;
        }
        else
        {
            vector<double> weights;
            auto tmp = dynamic_pointer_cast<sh::CLinearMachine>(
                    ml.p_est)->get_w().clone();

            weights.assign(tmp.data(), tmp.data() + tmp.size());
            j["w"] = weights;
            j["bias"] = ml.get_bias(false);
        }
    }
    else
        WARN("this is not a linear model; at the moment,"
             " it will need to be refit to be used after loading.");
}

void from_json(const json& j, ML& model)
{
    j.at("ml_type").get_to(model.ml_type);
    j.at("ml_str").get_to(model.ml_str);
    j.at("prob_type").get_to(model.prob_type);
    j.at("N").get_to(model.N);
    j.at("max_train_time").get_to(model.max_train_time);
    j.at("normalize").get_to(model.normalize);
    j.at("C").get_to(model.C);

    // initialize the underlying shogun ML model
    model.init(true);

    // if the model is a linear model, set the weights and bias
    // so it can be reproduced
    if (in({LARS, Ridge, LR, L1_LR, SVM}, model.ml_type))
    {
        if (model.prob_type == PT_MULTICLASS
                && in({LR, L1_LR, SVM}, model.ml_type))
        {
            // TODO: set biases
            vector<VectorXd> multi_weights = j.at("w");
            if (in({LR, L1_LR}, model.ml_type))
                dynamic_pointer_cast<
                    sh::CMulticlassLogisticRegression>(model.p_est)->set_w(
                        multi_weights);
            else // SVM
                dynamic_pointer_cast<sh::CMyMulticlassLibLinear>(
                        model.p_est)->set_w(multi_weights);
        }
        else
        {
            VectorXd weights;
            j.at("w").get_to(weights);

            dynamic_pointer_cast<sh::CLinearMachine>(
                    model.p_est)->set_w(sh::SGVector<double>(weights).clone());
            dynamic_pointer_cast<sh::CLinearMachine>(
                    model.p_est)->set_bias(j.at("bias").get<float>());
        }
    }
}

void to_json(json& j, const shared_ptr<ML>& model)
{
    to_json(j, *model);
}

void from_json(const json& j, shared_ptr<ML>& model)
{
    if (model == nullptr)
        model = shared_ptr<ML>(new ML());
    from_json(j, *model);
}

} // namespace Model
} // namespace FT