// Constructor: resolves the user-supplied method name `ml` to an ML_TYPE via
// the ml_hash lookup table and records the problem type and normalization flag.
// NOTE(review): this listing omits several source lines (see the gaps in the
// embedded line numbers), so the conditions guarding some of the assignments
// below are not visible here.
21 ML::ML(
string ml,
bool norm,
bool classification,
int n_classes)
// Name -> ML_TYPE table. "LinearRidgeRegression" and "Ridge" both map to Ridge.
26 ml_hash[
"Lasso"] =
LARS;
27 ml_hash[
"LinearRidgeRegression"] =
Ridge;
28 ml_hash[
"Ridge"] =
Ridge;
30 ml_hash[
"RandomForest"] =
RF;
31 ml_hash[
"CART"] =
CART;
33 ml_hash[
"L2_LR"] =
LR;
34 ml_hash[
"L1_LR"] =
L1_LR;
// Detects an unknown method name; the handling of that case is not visible here.
39 if ( ml_hash.find(ml) == ml_hash.end() )
45 this->ml_type = ml_hash.at(ml);
// prob_type ends up as one of regression / binary / multiclass; presumably
// selected from `classification` and `n_classes` -- TODO confirm against the
// full source.
47 this->prob_type = PT_REGRESSION;
53 this->prob_type = PT_BINARY;
55 this->prob_type = PT_MULTICLASS;
// Normalization is switched on here, with a log message stating that a linear
// method was specified.
65 this->normalize =
true;
66 logger.
log(
"Using ML normalization since a linear method was specified",
// Builds the Shogun estimator stored in p_est according to ml_type and
// prob_type, and applies the hyperparameters visible below (this->C, fixed
// iteration/epsilon limits, max_train_time).
// NOTE(review): several branch headers and `else` lines are missing from this
// listing; how `assign_p_est` is used is not visible here.
71 void ML::init(
bool assign_p_est)
// Least-angle regression estimator; C (truncated to int) is used as the
// maximum number of non-zero coefficients.
79 p_est = make_shared<sh::CLeastAngleRegression>(
true);
80 dynamic_pointer_cast<sh::CLeastAngleRegression>(
81 p_est)->set_max_non_zero(
int(this->C));
83 else if (ml_type ==
Ridge)
86 p_est = make_shared<sh::CLinearRidgeRegression>();
88 auto typed_p_est = dynamic_pointer_cast<sh::CLinearRidgeRegression>(
90 typed_p_est->set_compute_bias(
true);
// For ridge regression, C is passed as tau.
91 typed_p_est->set_tau(this->C);
93 else if (ml_type ==
RF)
96 p_est = make_shared<sh::CMyRandomForest>();
97 auto typed_p_est = dynamic_pointer_cast<sh::CMyRandomForest>(p_est);
98 typed_p_est->set_machine_problem_type(this->prob_type);
99 typed_p_est->set_num_bags(10);
// Combination rule: majority vote for non-regression problems; the CMeanRule
// below is presumably the regression branch (the `else` line is not visible).
101 if (this->prob_type != PT_REGRESSION)
103 auto CR = some<sh::CMajorityVote>();
104 typed_p_est->set_combination_rule(CR);
108 auto CR = some<sh::CMeanRule>();
109 typed_p_est->set_combination_rule(CR);
113 else if (ml_type ==
CART)
116 p_est = make_shared<sh::CMyCARTree>();
117 dynamic_pointer_cast<sh::CMyCARTree>(
118 p_est)->set_machine_problem_type(this->prob_type);
// Tree depth is fixed at 6.
119 dynamic_pointer_cast<sh::CMyCARTree>(
120 p_est)->set_max_depth(6);
123 else if (ml_type ==
SVM)
// SVM: a different liblinear-based estimator per problem type; the
// CLibLinearRegression below is presumably the regression fallback.
125 if(this->prob_type==PT_BINARY)
127 p_est = make_shared<sh::CMyLibLinear>(
128 sh::L2R_L2LOSS_SVC_DUAL);
129 else if (this->prob_type==PT_MULTICLASS){
131 p_est = make_shared<CMyMulticlassLibLinear>();
132 dynamic_pointer_cast<CMyMulticlassLibLinear>(
133 p_est)->set_prob_heuris(sh::OVA_SOFTMAX);
138 p_est = make_shared<sh::CLibLinearRegression>();
// Logistic regression branch (its header line is not visible): classification only.
143 assert(this->prob_type!=PT_REGRESSION
144 &&
"LR only works with classification.");
145 if (this->prob_type == PT_BINARY){
146 if (this->ml_type ==
LR)
149 p_est = make_shared<sh::CMyLibLinear>(
155 p_est = make_shared<sh::CMyLibLinear>(sh::L1R_LR);
158 auto typed_p_est = dynamic_pointer_cast<sh::CMyLibLinear>(p_est);
160 typed_p_est->set_bias_enabled(
true);
161 typed_p_est->set_epsilon(0.0001);
162 typed_p_est->set_max_iterations(100);
// The same C value is passed for both arguments of set_C.
163 typed_p_est->set_C(this->C,this->C);
// Multiclass logistic regression; C is passed to set_z.
168 p_est = make_shared<sh::CMulticlassLogisticRegression>();
171 dynamic_pointer_cast<sh::CMulticlassLogisticRegression>(p_est);
172 typed_p_est->set_prob_heuris(sh::OVA_SOFTMAX);
173 typed_p_est->set_z(this->C);
174 typed_p_est->set_epsilon(0.0001);
175 typed_p_est->set_max_iter(100);
// Applies the training-time limit to the estimator.
184 p_est->set_max_train_time(max_train_time);
// Stores the feature data types and, for tree-based estimators (CART or RF),
// forwards them to the estimator as a boolean vector.
// @param dtypes  one char per feature; only the value 'b' is distinguished here.
192 void ML::set_dtypes(
const vector<char>& dtypes)
194 if (ml_type ==
CART || ml_type ==
RF)
// dt[i] is true exactly when dtypes[i] == 'b'.
198 sh::SGVector<bool> dt(dtypes.size());
199 for (
unsigned i = 0;
i< dtypes.size(); ++
i)
200 dt[
i] = dtypes.at(
i) ==
'b';
// The condition selecting the CART call below is not visible in this listing.
202 dynamic_pointer_cast<sh::CMyCARTree>(
203 p_est)->set_feature_types(dt);
204 else if (ml_type ==
RF)
205 dynamic_pointer_cast<sh::CMyRandomForest>(
206 p_est)->set_feature_types(dt);
// Keeps a copy of the dtypes on the ML object.
208 this->dtypes = dtypes;
// Returns per-feature weights of the fitted estimator as floats.
//  - multiclass case: element-wise mean of the absolute values of the
//    per-class weight vectors;
//  - linear machines: a copy of get_w(), passed through N.adjust_weights when
//    both `normalize` and `norm_adjust` are set;
//  - CART / random forest: feature_importances().
// NOTE(review): parts of the branch conditions are missing from this listing.
211 vector<float> ML::get_weights(
bool norm_adjust)
const
223 if(this->prob_type == PT_MULTICLASS
226 vector<SGVector<double>> weights;
229 weights = dynamic_pointer_cast<
233 dynamic_pointer_cast<sh::CMyMulticlassLibLinear>(
// Returns an empty vector early; the guarding condition is not visible here
// (presumably no weights are available -- TODO confirm).
237 return vector<float>();
239 w = vector<double>(weights.at(0).size());
// Accumulate |weight| per feature over all weight vectors ...
243 for(
int i = 0 ;
i < weights.size(); ++
i )
245 for(
int j = 0;j<weights.at(
i).size(); ++j)
247 w.at(j) += fabs(weights.at(
i)[j]);
// ... then divide by the number of weight vectors to obtain the mean.
251 for(
int i = 0;
i < w.size() ;
i++)
252 w.at(
i) = w.at(
i)/weights.size();
254 return vector<float>(w.begin(), w.end());
// Linear machine: work on a clone so the estimator's own weights are untouched.
260 auto tmp = dynamic_pointer_cast<sh::CLinearMachine>(
261 p_est)->get_w().clone();
263 if (this->normalize && norm_adjust)
265 this->N.adjust_weights(tmp);
268 w.assign(tmp.data(), tmp.data()+tmp.size());
272 else if (ml_type ==
CART)
273 w = dynamic_pointer_cast<sh::CMyCARTree>(
274 p_est)->feature_importances();
276 w = dynamic_pointer_cast<sh::CMyRandomForest>(
277 p_est)->feature_importances();
279 return vector<float>(w.begin(), w.end());
// Trains the estimator on (X, y) and returns the predicted labels for the
// training data.
// Visible steps: cast inputs to double, configure tree estimators, normalize
// features, short-circuit when the features are all (near) zero, wrap the data
// in Shogun features/labels, train, then call retrieve_labels.
// NOTE(review): one signature line (presumably `params` and `pass`, both used
// below) and several conditions are missing from this listing.
282 shared_ptr<CLabels> ML::fit(
const MatrixXf& X,
const VectorXf& y,
284 const vector<char>& dtypes)
301 MatrixXd _X = X.cast<
double>();
302 VectorXd _y = y.cast<
double>();
// Random forest: number of random features is the integer part of sqrt(X.rows()).
306 int max_feats = std::sqrt(X.rows());
307 dynamic_pointer_cast<sh::CMyRandomForest>(
308 p_est)->set_num_random_features(max_feats);
312 if (ml_type ==
RF || ml_type ==
CART)
// Note that the dtypes come from `params` here, not from the `dtypes` argument.
316 set_dtypes(params.
dtypes);
327 N.fit_normalize(_X, dtypes);
// Degenerate input: when every entry of _X is zero within 1e-4, labels of the
// matching type are created with _y.size() entries instead of training.
335 if(_X.isZero(0.0001))
338 logger.
log(
"Setting labels to zero since features are zero\n",
341 shared_ptr<CLabels> labels;
343 switch (this->prob_type)
346 labels = std::shared_ptr<CLabels>(
347 new CRegressionLabels(_y.size()));
350 labels = std::shared_ptr<CLabels>(
351 new CBinaryLabels(_y.size()));
354 labels = std::shared_ptr<CLabels>(
355 new CMulticlassLabels(_y.size()));
364 auto features = some<CDenseFeatures<float64_t>>(
365 SGMatrix<float64_t>(_X));
// Binary L1_LR trains on transposed features (transposed back after training).
368 if (ml_type ==
L1_LR && this->prob_type==PT_BINARY)
369 features = features->get_transposed();
// Attach labels of the type matching the problem; binary labels for
// LR / L1_LR / SVM are built with 0.5 as the second constructor argument.
373 if(this->prob_type==PT_BINARY &&
in({
LR,
L1_LR,
SVM}, ml_type))
377 some<CBinaryLabels>(SGVector<float64_t>(_y), 0.5));
379 else if (this->prob_type==PT_MULTICLASS)
381 p_est->set_labels(some<CMulticlassLabels>(
382 SGVector<float64_t>(_y)));
385 p_est->set_labels(some<CRegressionLabels>(
386 SGVector<float64_t>(_y)));
394 p_est->train(features);
// Undo the transpose applied before training.
404 if (ml_type ==
L1_LR && this->prob_type==PT_BINARY)
405 features = features->get_transposed();
408 auto y_pred = this->retrieve_labels(features,
true, pass);
409 features->free_features();
// Convenience wrapper: calls fit() and converts the returned labels to an
// Eigen float vector via labels_to_vector().
// NOTE(review): one signature line is missing from this listing; `params` and
// `pass` are forwarded to fit() below.
413 VectorXf ML::fit_vector(
const MatrixXf& X,
const VectorXf& y,
415 const vector<char>& dtypes)
417 shared_ptr<CLabels> labels = fit(X, y, params, pass, dtypes);
419 return labels_to_vector(labels);
// Predicts labels for X with the fitted estimator.
// When get_weights() is empty, logs a message and builds labels of the type
// matching prob_type with _X.cols() entries, each set to 0; otherwise wraps X
// in Shogun dense features and delegates to retrieve_labels().
// NOTE(review): the use of `print`, the declaration of `pass`, and the return
// statements are not visible in this listing.
423 shared_ptr<CLabels> ML::predict(
const MatrixXf& X,
bool print)
426 shared_ptr<CLabels> labels;
428 MatrixXd _X = X.template cast<double>();
434 if (get_weights().empty())
436 logger.
log(
"weight empty; returning zeros",3);
// One zero-valued label per column of _X, in each of the three branches below.
437 if (this->prob_type==PT_BINARY)
439 labels = std::shared_ptr<CLabels>(
440 new CBinaryLabels(_X.cols()));
441 for (
unsigned i = 0;
i < _X.cols() ; ++
i)
443 dynamic_pointer_cast<CBinaryLabels>(
444 labels)->set_value(0,
i);
445 dynamic_pointer_cast<CBinaryLabels>(
446 labels)->set_label(
i,0);
450 else if (this->prob_type == PT_MULTICLASS)
452 labels = std::shared_ptr<CLabels>(
453 new CMulticlassLabels(_X.cols()));
454 for (
unsigned i = 0;
i < _X.cols() ; ++
i)
456 dynamic_pointer_cast<CMulticlassLabels>(
457 labels)->set_value(0,
i);
458 dynamic_pointer_cast<CMulticlassLabels>(
459 labels)->set_label(
i,0);
// Remaining case builds regression labels.
465 labels = std::shared_ptr<CLabels>(
466 new CRegressionLabels(_X.cols()));
467 for (
unsigned i = 0;
i < _X.cols() ; ++
i)
469 dynamic_pointer_cast<CRegressionLabels>(
470 labels)->set_value(0,
i);
471 dynamic_pointer_cast<CRegressionLabels>(
472 labels)->set_label(
i,0);
// Normal path: predict through the estimator.
483 auto features = some<CDenseFeatures<float64_t>>(
484 SGMatrix<float64_t>(_X));
487 auto y_pred = this->retrieve_labels(features,
true, pass);
488 features->free_features();
// Convenience wrapper: calls predict(X) and converts the resulting labels to
// an Eigen float vector via labels_to_vector().
492 VectorXf ML::predict_vector(
const MatrixXf& X)
494 shared_ptr<CLabels> labels = predict(X);
495 return labels_to_vector(labels);
// Returns prediction confidences for X as a float array.
//  - binary case: a 1 x n array holding the values of the binary labels;
//  - multiclass case: an n_classes x n_labels matrix, one column of class
//    confidences per sample.
// NOTE(review): the rest of each condition and the statement that carries the
// final error message are not visible in this listing.
499 ArrayXXf ML::predict_proba(
const MatrixXf& X)
501 shared_ptr<CLabels> labels = shared_ptr<CLabels>(predict(X));
503 if (this->prob_type==PT_BINARY
506 shared_ptr<CBinaryLabels> BLabels = \
507 dynamic_pointer_cast<CBinaryLabels>(labels);
509 SGVector<double> tmp= BLabels->get_values();
510 ArrayXXd confidences(1,tmp.size());
511 confidences.row(0) = Map<ArrayXd>(tmp.data(),tmp.size());
512 return confidences.template cast<float>();
514 else if (this->prob_type == PT_MULTICLASS)
516 shared_ptr<CMulticlassLabels> MLabels = \
517 dynamic_pointer_cast<CMulticlassLabels>(labels);
// The number of classes is read from the confidence vector of the first label.
521 int n_classes = MLabels->get_multiclass_confidences(0).size();
522 MatrixXd confidences(n_classes,
523 MLabels->get_num_labels());
524 for (
int i =0;
i<confidences.cols(); ++
i)
526 SGVector<double> tmp = \
527 MLabels->get_multiclass_confidences(
i);
528 confidences.col(
i) = Map<ArrayXd>(tmp.data(),tmp.size());
531 return confidences.template cast<float>();;
// Tail of an error message for unsupported combinations (presumably thrown --
// TODO confirm against the full source).
535 "problem type or ML method");
// Converts Shogun labels to an Eigen float vector.
// The labels are downcast according to prob_type (binary, other
// non-regression, or regression) and read through get_labels(). In the binary
// case, entries equal to -1 are replaced with 0 before the cast to float.
// NOTE(review): the rest of the binary conditions and the return statement are
// not visible in this listing.
539 VectorXf ML::labels_to_vector(
const shared_ptr<CLabels>& labels)
541 SGVector<double> y_pred;
542 if (this->prob_type==PT_BINARY
544 y_pred = dynamic_pointer_cast<sh::CBinaryLabels>(
545 labels)->get_labels();
546 else if (this->prob_type != PT_REGRESSION)
547 y_pred = dynamic_pointer_cast<sh::CMulticlassLabels>(
548 labels)->get_labels();
550 y_pred = dynamic_pointer_cast<sh::CRegressionLabels>(
551 labels)->get_labels();
// yhat is a view over y_pred's buffer, not a copy.
553 Map<VectorXd> yhat(y_pred.data(),y_pred.size());
555 if (this->prob_type==PT_BINARY
// Map -1 to 0; all other entries are kept.
558 yhat = (yhat.cast<
int>().array() == -1).select(0,yhat);
560 VectorXf yhatf = yhat.template cast<float>();
// Returns the bias (offset) of the fitted model.
//  - multiclass case: the arithmetic mean of the per-class biases;
//  - otherwise, when `normalize` and `norm_adjust` are both set: the linear
//    machine's bias passed through N.adjust_offset together with a copy of
//    its weights;
//  - otherwise: the linear machine's bias as-is.
// NOTE(review): how `biases` is obtained is only partly visible in this listing.
565 float ML::get_bias(
bool norm_adjust)
const
571 if (this->prob_type == sh::PT_MULTICLASS)
574 dynamic_pointer_cast<sh::CMulticlassLogisticRegression>(p_est)
576 return accumulate(biases.begin(), biases.end(), 0.0)/biases.size();
580 if (this->normalize && norm_adjust)
582 float b = dynamic_pointer_cast<sh::CLinearMachine>(p_est)->get_bias();
583 auto tmp = dynamic_pointer_cast<sh::CLinearMachine>(
584 p_est)->get_w().clone();
588 return this->N.adjust_offset(tmp, b);
592 return dynamic_pointer_cast<sh::CLinearMachine>(p_est)->get_bias();
// Sets the bias of the underlying linear machine to `b`.
// The visible condition requires a non-multiclass problem; the other branch
// carries an error message ending in "not a binary linear machine".
// NOTE(review): the first part of the condition and the actual call made on
// the CLinearMachine are not visible in this listing (presumably set_bias(b)).
600 void ML::set_bias(
float b)
604 && this->prob_type != sh::PT_MULTICLASS)
606 return dynamic_pointer_cast<sh::CLinearMachine>(
611 "not a binary linear machine");
// Applies the fitted estimator to `features` and returns the resulting labels.
//  - binary case: apply_binary, then set_probabilities on the concrete
//    estimator (CART tree, random forest or liblinear);
//  - other non-regression case: apply_multiclass;
//  - regression: apply_regression.
// The predictions are finally scanned for inf / NaN values.
// NOTE(review): the use of `proba`, what happens on an inf/NaN hit (presumably
// `pass` is cleared -- TODO confirm) and the return statement are not visible.
614 shared_ptr<CLabels> ML::retrieve_labels(CDenseFeatures<float64_t>* features,
615 bool proba,
bool& pass)
618 shared_ptr<CLabels> labels;
619 SGVector<double> y_pred;
622 if (this->prob_type==PT_BINARY &&
625 labels = shared_ptr<CLabels>(
626 p_est->apply_binary(features));
632 dynamic_pointer_cast<sh::CMyCARTree>(p_est)->
633 set_probabilities(labels.get(), features);
635 else if(ml_type ==
RF)
637 dynamic_pointer_cast<sh::CMyRandomForest>(p_est)->
638 set_probabilities(labels.get(), features);
642 dynamic_pointer_cast<sh::CMyLibLinear>(p_est)->
643 set_probabilities(labels.get(), features);
646 y_pred = dynamic_pointer_cast<sh::CBinaryLabels>(
647 labels)->get_labels();
650 else if (this->prob_type != PT_REGRESSION)
653 labels = shared_ptr<CLabels>(
654 p_est->apply_multiclass(features));
655 y_pred = dynamic_pointer_cast<sh::CMulticlassLabels>(
656 labels)->get_labels();
661 labels = shared_ptr<CLabels>(
662 p_est->apply_regression(features));
663 y_pred = dynamic_pointer_cast<sh::CRegressionLabels>(
664 labels)->get_labels();
// View over the prediction buffer, used for the validity check below.
667 Map<VectorXd> yhat(y_pred.data(),y_pred.size());
669 if (
isinf(yhat.array()).any() ||
isnan(yhat.array()).any()
// Tunes the hyperparameter C by cross-validation, then refits on all data.
// A candidate grid Cs is chosen per ml_type; for each of n_splits splits and
// each candidate, the model is fit on the training part (d_cv.t) and
// evaluated on the validation part (d_cv.v). The candidate with the lowest
// mean loss is reported as "best C" before the final fit on (X, y).
// NOTE(review): the end of the signature, the case labels, the split/scoring
// calls and the assignment of C are not visible in this listing.
676 shared_ptr<CLabels> ML::fit_tune(MatrixXf& X, VectorXf& y,
677 const Parameters& params,
bool& pass,
const vector<char>& dtypes,
689 switch (this->ml_type)
// Integer grid 1, 2, ..., X.rows()-1.
693 Cs.resize(X.rows()-1);
694 iota(Cs.begin(),Cs.end(),1);
// Two fixed, roughly logarithmic grids for other method types.
697 Cs = {1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2};
700 Cs = {1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3};
// losses(j, i): one row per candidate C, one column per split.
704 MatrixXf losses(Cs.size(),
int(n_splits));
707 for (
int i = 0;
i < n_splits; ++
i)
712 for (
int j = 0; j< Cs.size(); ++j)
715 this->fit(d_cv.
t->
X, d_cv.
t->
y,
716 params, pass, this->
dtypes);
719 this->predict(d_cv.
v->
X),
// Average over splits, giving one mean loss per candidate.
724 VectorXf mean_loss = losses.rowwise().mean();
725 string cv_report =
"mean_loss (" +
to_string(mean_loss.size())
727 for (
int i = 0;
i < Cs.size(); ++
i)
728 cv_report +=
"C = " +
to_string(Cs.at(
i)) +
", mean_loss = "
730 VectorXf::Index min_index;
731 float min_loss = mean_loss.minCoeff(&min_index);
732 float best_C = Cs.at(min_index);
733 cv_report +=
"best C: " +
to_string(best_C) +
"\n" ;
// Final fit on the full data set.
743 return this->fit(X, y, params, pass, dtypes);
// Alternative exit returning an empty pointer; its condition is not visible here.
745 return shared_ptr<CLabels>();
// NOTE(review): fragment of a JSON serialization routine whose signature is
// not visible in this listing (presumably to_json for an ML object `ml`
// writing into `j` -- TODO confirm).
// Visible behaviour: per-class weights and biases are copied out of the
// estimator for the multi-weight case, a single weight vector is copied for a
// linear machine, and a warning is issued for non-linear models.
765 vector<SGVector<double>> shogun_weights;
766 vector<double> shogun_biases;
770 shogun_weights = dynamic_pointer_cast<
772 shogun_biases = dynamic_pointer_cast<
777 dynamic_pointer_cast<sh::CMyMulticlassLibLinear>(
// Convert each Shogun weight vector into an Eigen VectorXd.
780 vector<VectorXd> weights;
781 for (
int i = 0;
i < shogun_weights.size(); ++
i)
784 weights.push_back(VectorXd());
785 weights.at(
i) = Map<VectorXd>(shogun_weights.at(
i).data(),
786 shogun_weights.at(
i).size());
789 j[
"bias"] = shogun_biases;
// Single linear machine: copy its weights out of a clone of get_w().
793 vector<double> weights;
794 auto tmp = dynamic_pointer_cast<sh::CLinearMachine>(
795 ml.
p_est)->get_w().clone();
797 weights.assign(tmp.data(), tmp.data()+tmp.size());
805 WARN(
"this is not a linear model; at the moment,"
806 " it will need to be refit to be used after loading.");
// NOTE(review): fragment of a JSON deserialization routine whose signature is
// not visible in this listing (presumably from_json filling an ML object
// `model` from `j` -- TODO confirm).
// Visible behaviour: scalar fields are read from the keys "ml_type", "ml_str",
// "prob_type", "N", "normalize" and "C"; weights are then restored from "w",
// and for a linear machine the bias is restored from "bias".
811 j.at(
"ml_type").get_to(model.
ml_type);
812 j.at(
"ml_str").get_to(model.
ml_str);
813 j.at(
"prob_type").get_to(model.
prob_type);
814 j.at(
"N").get_to(model.
N);
816 j.at(
"normalize").get_to(model.
normalize);
817 j.at(
"C").get_to(model.
C);
// Multi-weight case: "w" holds one weight vector per entry.
831 vector<VectorXd> multi_weights = j.at(
"w");
833 dynamic_pointer_cast<
837 dynamic_pointer_cast<sh::CMyMulticlassLibLinear>(
838 model.
p_est)->set_w(multi_weights);
// Single linear machine: "w" holds one weight vector, "bias" a scalar.
843 j.at(
"w").get_to(weights);
845 dynamic_pointer_cast<sh::CLinearMachine>(
846 model.
p_est)->set_w(sh::SGVector<double>(weights).clone());
847 dynamic_pointer_cast<sh::CLinearMachine>(
848 model.
p_est)->set_bias(j.at(
"bias").get<
float>());
// JSON overloads for shared_ptr<ML>.
// NOTE(review): only the to_json signature and one later statement are visible
// in this listing; the statement below allocates a fresh ML object for `model`
// and presumably belongs to the matching from_json overload -- TODO confirm.
854 void to_json(json& j,
const shared_ptr<ML>& model)
862 model = shared_ptr<ML>(
new ML());
void train_test_split(bool shuffle, float split)
splits data into training and validation folds.
float score(const VectorXf &y_true, const shared_ptr< CLabels > &yhat, VectorXf &loss, const vector< float > &w)
class that specifies the machine learning algorithm to pair with Feat.
void init(bool assign_p_est=true)
int max_train_time
max seconds allowed for training
Normalizer N
normalization
float get_bias(bool norm_adjust=true) const
returns bias for linear machines
shared_ptr< sh::CMachine > p_est
pointer to the ML object
sh::EProblemType prob_type
type of learning problem; binary, multiclass or regression
string ml_str
user specified ML type (string)
bool normalize
control whether ML normalizes its input before training
ML_TYPE ml_type
user specified ML type
string log(string m, int v, string sep="\n") const
print message with verbosity control.
multiclass logistic regression
std::map< string, std::pair< vector< ArrayXf >, vector< ArrayXf > > > LongData
#define THROW_RUNTIME_ERROR(err)
#define THROW_INVALID_ARGUMENT(err)
#define omp_get_thread_num()
map< ML_TYPE, float > C_DEFAULT
void from_json(const json &j, shared_ptr< ML > &model)
void to_json(json &j, const shared_ptr< ML > &model)
ArrayXb isinf(const ArrayXf &x)
returns true for elements of x that are infinite
ArrayXb isnan(const ArrayXf &x)
returns true for elements of x that are NaN
vector< char > find_dtypes(const MatrixXf &X)
determines data types of columns of matrix X.
bool in(const vector< T > v, const T &i)
check if element is in vector.
std::string to_string(const T &value)
template function to convert objects to string for logging
void clean(ArrayXf &x)
limits node output to be between MIN_FLT and MAX_FLT
holds the hyperparameters for Feat.
vector< char > dtypes
data types of input parameters
bool classification
flag to conduct classification rather than regression
vector< float > class_weights
weights for each class
vector< bool > protected_groups
protected attributes in X
string scorer_
actual loss function used, determined by scorer