Feat C++ API
A feature engineering automation tool
MyCARTree.cc
1 /* Edited by William La Cava 2018 (WGL)
2  *
3  * Copyright (c) The Shogun Machine Learning Toolbox
4  * Written (w) 2014 Parijat Mazumdar
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice, this
11  * list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  * this list of conditions and the following disclaimer in the documentation
14  * and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  * The views and conclusions contained in the software and documentation are those
28  * of the authors and should not be interpreted as representing official policies,
29  * either expressed or implied, of the Shogun Development Team.
30  */
31 
32 #include "MyCARTree.h"
33 #include <iostream>
34 #include <shogun/base/some.h>
35 using namespace Eigen;
36 using namespace shogun;
37 using std::vector;
38 using std::cout;
39 
40 const char* CMyCARTree::get_name() const { return "MyCARTree"; }
41 
42 EProblemType CMyCARTree::get_machine_problem_type() const { return m_mode; }
43 
44 
45 void CMyCARTree::set_cv_pruning(bool cv_pruning)
46 {
47  m_apply_cv_pruning = cv_pruning;
48 }
49 
50 float64_t CMyCARTree::get_label_epsilon() { return m_label_epsilon; }
51 
52 const float64_t CMyCARTree::MISSING = CMath::MAX_REAL_NUMBER;
53 const float64_t CMyCARTree::EQ_DELTA = 1e-7;
54 const float64_t CMyCARTree::MIN_SPLIT_GAIN = 1e-7;
55 
56 CMyCARTree::CMyCARTree()
57 : CTreeMachine<MyCARTreeNodeData>()
58 {
59  init();
60 }
61 
62 CMyCARTree::CMyCARTree(SGVector<bool> attribute_types, EProblemType prob_type)
63 : CTreeMachine<MyCARTreeNodeData>()
64 {
65  init();
66  set_feature_types(attribute_types);
67  set_machine_problem_type(prob_type);
68 }
69 
70 CMyCARTree::CMyCARTree(SGVector<bool> attribute_types, EProblemType prob_type, int32_t num_folds, bool cv_prune)
71 : CTreeMachine<MyCARTreeNodeData>()
72 {
73  init();
74  set_feature_types(attribute_types);
75  set_machine_problem_type(prob_type);
76  set_num_folds(num_folds);
77  if (cv_prune)
78  set_cv_pruning(cv_prune);
79 }
80 
81 CMyCARTree::~CMyCARTree()
82 {
83  SG_UNREF(m_alphas);
84 }
85 
86 void CMyCARTree::set_labels(CLabels* lab)
87 {
88  if (lab->get_label_type()==LT_MULTICLASS)
89  set_machine_problem_type(PT_MULTICLASS);
90  else if (lab->get_label_type()==LT_REGRESSION)
91  set_machine_problem_type(PT_REGRESSION);
92  else
93  SG_ERROR("label type supplied is not supported\n")
94 
95  SG_REF(lab);
96  SG_UNREF(m_labels);
97  m_labels=lab;
98 }
99 
100 void CMyCARTree::set_machine_problem_type(EProblemType mode)
101 {
102  m_mode=mode;
103 }
104 
105 bool CMyCARTree::is_label_valid(CLabels* lab) const
106 {
107  if (m_mode==PT_MULTICLASS && lab->get_label_type()==LT_MULTICLASS)
108  return true;
109  else if (m_mode==PT_REGRESSION && lab->get_label_type()==LT_REGRESSION)
110  return true;
111  else
112  return false;
113 }
114 
115 CBinaryLabels* CMyCARTree::apply_binary(CFeatures* data)
116 {
117  REQUIRE(data, "Data required for classification in apply_binary\n")
118 
119  // apply binary classification starting from root
120  bnode_t* current=dynamic_cast<bnode_t*>(get_root());
121 
122  REQUIRE(current, "Tree machine not yet trained.\n");
123  CLabels* ret=apply_from_current_node(dynamic_cast<CDenseFeatures<float64_t>*>(data), current, true);
124 
125  SGVector<double> tmp = dynamic_cast<CDenseLabels*>(ret)->get_labels();
126  /* std::cout << "cast to CBinaryLabels()\n"; */
127  CBinaryLabels* retbc = new CBinaryLabels(tmp,0.5);
128  /* cout << "retbc: " << retbc << "\n"; */
129  /* auto tmpb = retbc->get_labels(); */
130  /* cout << "apply_binary::retbc: "; */
131  /* tmpb.display_vector(); */
132  SG_UNREF(ret);
133  SG_UNREF(current);
134  return retbc;
135 }
136 CMulticlassLabels* CMyCARTree::apply_multiclass(CFeatures* data)
137 {
138  REQUIRE(data, "Data required for classification in apply_multiclass\n")
139 
140  // apply multiclass starting from root
141  bnode_t* current=dynamic_cast<bnode_t*>(get_root());
142 
143  REQUIRE(current, "Tree machine not yet trained.\n");
144  CLabels* ret=apply_from_current_node(dynamic_cast<CDenseFeatures<float64_t>*>(data), current);
145 
146  SG_UNREF(current);
147  return dynamic_cast<CMulticlassLabels*>(ret);
148 }
149 
150 CRegressionLabels* CMyCARTree::apply_regression(CFeatures* data)
151 {
152  REQUIRE(data, "Data required for regression in apply_regression\n")
153 
154  // apply regression starting from root
155  bnode_t* current=dynamic_cast<bnode_t*>(get_root());
156  CLabels* ret=apply_from_current_node(dynamic_cast<CDenseFeatures<float64_t>*>(data), current);
157 
158  SG_UNREF(current);
159  return dynamic_cast<CRegressionLabels*>(ret);
160 }
161 
162 void CMyCARTree::prune_using_test_dataset(CDenseFeatures<float64_t>* feats, CLabels* gnd_truth, SGVector<float64_t> weights)
163 {
164  if (weights.vlen==0)
165  {
166  weights=SGVector<float64_t>(feats->get_num_vectors());
167  weights.fill_vector(weights.vector,weights.vlen,1);
168  }
169 
170  CDynamicObjectArray* pruned_trees=prune_tree(this);
171 
172  int32_t min_index=0;
173  float64_t min_error=CMath::MAX_REAL_NUMBER;
174  for (int32_t i=0;i<m_alphas->get_num_elements();i++)
175  {
176  CSGObject* element=pruned_trees->get_element(i);
177  bnode_t* root=NULL;
178  if (element!=NULL)
179  root=dynamic_cast<bnode_t*>(element);
180  else
181  SG_ERROR("%d element is NULL\n",i);
182 
183  CLabels* labels=apply_from_current_node(feats, root);
184  float64_t error=compute_error(labels,gnd_truth,weights);
185  if (error<min_error)
186  {
187  min_index=i;
188  min_error=error;
189  }
190 
191  SG_UNREF(labels);
192  SG_UNREF(element);
193  }
194 
195  CSGObject* element=pruned_trees->get_element(min_index);
196  bnode_t* root=NULL;
197  if (element!=NULL)
198  root=dynamic_cast<bnode_t*>(element);
199  else
200  SG_ERROR("%d element is NULL\n",min_index);
201 
202  this->set_root(root);
203 
204  SG_UNREF(pruned_trees);
205  SG_UNREF(element);
206 }
207 
208 void CMyCARTree::set_weights(SGVector<float64_t> w)
209 {
210  m_weights=w;
211  m_weights_set=true;
212 }
213 
214 SGVector<float64_t> CMyCARTree::get_weights() const
215 {
216  return m_weights;
217 }
218 
219 void CMyCARTree::clear_weights()
220 {
221  m_weights=SGVector<float64_t>();
222  m_weights_set=false;
223 }
224 
225 void CMyCARTree::set_feature_types(SGVector<bool> ft)
226 {
227  m_nominal=ft;
228  m_types_set=true;
229 }
230 
231 SGVector<bool> CMyCARTree::get_feature_types() const
232 {
233  return m_nominal;
234 }
235 
236 void CMyCARTree::clear_feature_types()
237 {
238  m_nominal=SGVector<bool>();
239  m_types_set=false;
240 }
241 
242 int32_t CMyCARTree::get_num_folds() const
243 {
244  return m_folds;
245 }
246 
247 void CMyCARTree::set_num_folds(int32_t folds)
248 {
249  REQUIRE(folds>1,"Number of folds is expected to be greater than 1. Supplied value is %d\n",folds)
250  m_folds=folds;
251 }
252 
253 int32_t CMyCARTree::get_max_depth() const
254 {
255  return m_max_depth;
256 }
257 
258 void CMyCARTree::set_max_depth(int32_t depth)
259 {
260  REQUIRE(depth>0,"Max allowed tree depth should be greater than 0. Supplied value is %d\n",depth)
261  m_max_depth=depth;
262 }
263 
264 int32_t CMyCARTree::get_min_node_size() const
265 {
266  return m_min_node_size;
267 }
268 
269 void CMyCARTree::set_min_node_size(int32_t nsize)
270 {
271  REQUIRE(nsize>0,"Min allowed node size should be greater than 0. Supplied value is %d\n",nsize)
272  m_min_node_size=nsize;
273 }
274 
275 void CMyCARTree::set_label_epsilon(float64_t ep)
276 {
277  REQUIRE(ep>=0,"Input epsilon value is expected to be greater than or equal to 0\n")
278  m_label_epsilon=ep;
279 }
280 
281 bool CMyCARTree::train_machine(CFeatures* data)
282 {
283  REQUIRE(data,"Data required for training\n")
284  REQUIRE(data->get_feature_class()==C_DENSE,"Dense data required for training\n")
285 
286  int32_t num_features=(dynamic_cast<CDenseFeatures<float64_t>*>(data))->get_num_features();
287  int32_t num_vectors=(dynamic_cast<CDenseFeatures<float64_t>*>(data))->get_num_vectors();
288 
289  if (m_weights_set)
290  {
291  REQUIRE(m_weights.vlen==num_vectors,"Length of weights vector (currently %d) should be same as"
292  " number of vectors in data (presently %d)",m_weights.vlen,num_vectors)
293  }
294  else
295  {
296  // all weights are equal to 1
297  m_weights=SGVector<float64_t>(num_vectors);
298  m_weights.fill_vector(m_weights.vector,m_weights.vlen,1.0);
299  }
300 
301  if (m_types_set)
302  {
303  REQUIRE(m_nominal.vlen==num_features,"Length of m_nominal vector (currently %d) should "
304  "be same as number of features in data (presently %d)",m_nominal.vlen,num_features)
305  }
306  else
307  {
308  SG_WARNING("Feature types are not specified. All features are considered continuous during training")
309  m_nominal=SGVector<bool>(num_features);
310  m_nominal.fill_vector(m_nominal.vector,m_nominal.vlen,false);
311  }
312 
313  set_root(CARTtrain(data,m_weights,m_labels,0));
314 
315  if (m_apply_cv_pruning)
316  {
317  CDenseFeatures<float64_t>* feats=dynamic_cast<CDenseFeatures<float64_t>*>(data);
318  prune_by_cross_validation(feats,m_folds);
319  }
320 
321  return true;
322 }
323 
324 void CMyCARTree::set_sorted_features(SGMatrix<float64_t>& sorted_feats, SGMatrix<index_t>& sorted_indices)
325 {
326  m_pre_sort=true;
327  m_sorted_features=sorted_feats;
328  m_sorted_indices=sorted_indices;
329 }
330 
331 void CMyCARTree::pre_sort_features(CFeatures* data, SGMatrix<float64_t>& sorted_feats, SGMatrix<index_t>& sorted_indices)
332 {
333  SGMatrix<float64_t> mat=(dynamic_cast<CDenseFeatures<float64_t>*>(data))->get_feature_matrix();
334  sorted_feats = SGMatrix<float64_t>(mat.num_cols, mat.num_rows);
335  sorted_indices = SGMatrix<index_t>(mat.num_cols, mat.num_rows);
336  for(int32_t i=0; i<sorted_indices.num_cols; i++)
337  for(int32_t j=0; j<sorted_indices.num_rows; j++)
338  sorted_indices(j,i)=j;
339 
340  Map<MatrixXd> map_sorted_feats(sorted_feats.matrix, mat.num_cols, mat.num_rows);
341  Map<MatrixXd> map_data(mat.matrix, mat.num_rows, mat.num_cols);
342 
343  map_sorted_feats=map_data.transpose();
344 
345  #pragma omp parallel for
346  for(int32_t i=0; i<sorted_feats.num_cols; i++)
347  CMath::qsort_index(sorted_feats.get_column_vector(i), sorted_indices.get_column_vector(i), sorted_feats.num_rows);
348 
349 }
350 
351 CBinaryTreeMachineNode<MyCARTreeNodeData>* CMyCARTree::CARTtrain(CFeatures* data, SGVector<float64_t> weights, CLabels* labels, int32_t level)
352 {
353  REQUIRE(labels,"labels have to be supplied\n");
354  REQUIRE(data,"data matrix has to be supplied\n");
355 
356  bnode_t* node=new bnode_t();
357  SGVector<float64_t> labels_vec=(dynamic_cast<CDenseLabels*>(labels))->get_labels();
358 
359  SGMatrix<float64_t> mat=(dynamic_cast<CDenseFeatures<float64_t>*>(data))->get_feature_matrix();
360 
361  int32_t num_feats=mat.num_rows;
362  int32_t num_vecs=mat.num_cols;
363 
364  // calculate node label
365  switch(m_mode)
366  {
367  case PT_REGRESSION:
368  {
369  float64_t sum=0;
370  for (int32_t i=0;i<labels_vec.vlen;i++)
371  sum+=labels_vec[i]*weights[i];
372 
373  // lsd*total_weight=sum_of_squared_deviation
374  float64_t tot=0;
375  node->data.weight_minus_node=tot*least_squares_deviation(labels_vec,weights,tot);
376  node->data.node_label=sum/tot;
377  node->data.total_weight=tot;
378 
379  break;
380  }
381  case PT_MULTICLASS:
382  {
383  SGVector<float64_t> lab=labels_vec.clone();
384  CMath::qsort(lab);
385  // stores max total weight for a single label
386  int32_t max=weights[0];
387  // stores one of the indices having max total weight
388  int32_t maxi=0;
389  int32_t c=weights[0];
390  for (int32_t i=1;i<lab.vlen;i++)
391  {
392  if (lab[i]==lab[i-1])
393  {
394  c+=weights[i];
395  }
396  else if (c>max)
397  {
398  max=c;
399  maxi=i-1;
400  c=weights[i];
401  }
402  else
403  {
404  c=weights[i];
405  }
406  }
407 
408  if (c>max)
409  {
410  max=c;
411  maxi=lab.vlen-1;
412  }
413 
414  node->data.node_label=lab[maxi];
415 
416  // resubstitution error calculation
417  node->data.total_weight=weights.sum(weights);
418  node->data.weight_minus_node=node->data.total_weight-max;
419  break;
420  }
421  default :
422  SG_ERROR("mode should be either PT_MULTICLASS or PT_REGRESSION\n");
423  }
424 
425  // check stopping rules
426  // case 1 : max tree depth reached if max_depth set
427  if ((m_max_depth>0) && (level==m_max_depth))
428  {
429  node->data.num_leaves=1;
430  node->data.weight_minus_branch=node->data.weight_minus_node;
431  return node;
432  }
433 
434  // case 2 : min node size violated if min_node_size specified
435  if ((m_min_node_size>1) && (labels_vec.vlen<=m_min_node_size))
436  {
437  node->data.num_leaves=1;
438  node->data.weight_minus_branch=node->data.weight_minus_node;
439  return node;
440  }
441 
442  // choose best attribute
443  // transit_into_values for left child
444  SGVector<float64_t> left(num_feats);
445  // transit_into_values for right child
446  SGVector<float64_t> right(num_feats);
447  // final data distribution among children
448  SGVector<bool> left_final(num_vecs);
449  int32_t num_missing_final=0;
450  int32_t c_left=-1;
451  int32_t c_right=-1;
452  int32_t best_attribute;
453  // WGL: information gain (impurity reduction) of the chosen split, used for feature importances
454  float64_t IG;
455  SGVector<index_t> indices(num_vecs);
456  if (m_pre_sort)
457  {
458  CSubsetStack* subset_stack = data->get_subset_stack();
459  if (subset_stack->has_subsets())
460  indices=(subset_stack->get_last_subset())->get_subset_idx();
461  else
462  indices.range_fill();
463  SG_UNREF(subset_stack);
464  best_attribute=compute_best_attribute(m_sorted_features,weights,labels,left,right,left_final,num_missing_final,c_left,c_right,IG,0,indices);
465  }
466  else
467  best_attribute=compute_best_attribute(mat,weights,labels,left,right,left_final,num_missing_final,c_left,c_right,IG);
468 
469  if (best_attribute==-1)
470  {
471  node->data.num_leaves=1;
472  node->data.weight_minus_branch=node->data.weight_minus_node;
473  return node;
474  }
475 
476  SGVector<float64_t> left_transit(c_left);
477  SGVector<float64_t> right_transit(c_right);
478  sg_memcpy(left_transit.vector,left.vector,c_left*sizeof(float64_t));
479  sg_memcpy(right_transit.vector,right.vector,c_right*sizeof(float64_t));
480 
481  if (num_missing_final>0)
482  {
483  SGVector<bool> is_left_final(num_vecs-num_missing_final);
484  int32_t ilf=0;
485  for (int32_t i=0;i<num_vecs;i++)
486  {
487  if (mat(best_attribute,i)!=MISSING)
488  is_left_final[ilf++]=left_final[i];
489  }
490 
491  left_final=surrogate_split(mat,weights,is_left_final,best_attribute);
492  }
493 
494  int32_t count_left=0;
495  for (int32_t c=0;c<num_vecs;c++)
496  count_left=(left_final[c])?count_left+1:count_left;
497 
498  SGVector<index_t> subsetl(count_left);
499  SGVector<float64_t> weightsl(count_left);
500  SGVector<index_t> subsetr(num_vecs-count_left);
501  SGVector<float64_t> weightsr(num_vecs-count_left);
502  index_t l=0;
503  index_t r=0;
504  for (int32_t c=0;c<num_vecs;c++)
505  {
506  if (left_final[c])
507  {
508  subsetl[l]=c;
509  weightsl[l++]=weights[c];
510  }
511  else
512  {
513  subsetr[r]=c;
514  weightsr[r++]=weights[c];
515  }
516  }
517 
518  // left child
519  data->add_subset(subsetl);
520  labels->add_subset(subsetl);
521  bnode_t* left_child=CARTtrain(data,weightsl,labels,level+1);
522  data->remove_subset();
523  labels->remove_subset();
524 
525  // right child
526  data->add_subset(subsetr);
527  labels->add_subset(subsetr);
528  bnode_t* right_child=CARTtrain(data,weightsr,labels,level+1);
529  data->remove_subset();
530  labels->remove_subset();
531 
532  // set node parameters
533  node->data.attribute_id=best_attribute;
534  //WGL: store IG in node
535  node->data.IG = IG;
536  node->left(left_child);
537  node->right(right_child);
538  left_child->data.transit_into_values=left_transit;
539  right_child->data.transit_into_values=right_transit;
540  node->data.num_leaves=left_child->data.num_leaves+right_child->data.num_leaves;
541  node->data.weight_minus_branch=left_child->data.weight_minus_branch+right_child->data.weight_minus_branch;
542 
543  return node;
544 }
545 
546 SGVector<float64_t> CMyCARTree::get_unique_labels(SGVector<float64_t> labels_vec, int32_t &n_ulabels)
547 {
548  float64_t delta=0;
549  if (m_mode==PT_REGRESSION)
550  delta=m_label_epsilon;
551 
552  SGVector<float64_t> ulabels(labels_vec.vlen);
553  SGVector<index_t> sidx=CMath::argsort(labels_vec);
554  ulabels[0]=labels_vec[sidx[0]];
555  n_ulabels=1;
556  int32_t start=0;
557  for (int32_t i=1;i<sidx.vlen;i++)
558  {
559  if (labels_vec[sidx[i]]<=labels_vec[sidx[start]]+delta)
560  continue;
561 
562  start=i;
563  ulabels[n_ulabels]=labels_vec[sidx[i]];
564  n_ulabels++;
565  }
566 
567  return ulabels;
568 }
569 
570 int32_t CMyCARTree::compute_best_attribute(const SGMatrix<float64_t>& mat, const SGVector<float64_t>& weights, CLabels* labels,
571  SGVector<float64_t>& left, SGVector<float64_t>& right, SGVector<bool>& is_left_final, int32_t &num_missing_final, int32_t &count_left,
572  int32_t &count_right,float64_t& IG, int32_t subset_size, const SGVector<index_t>& active_indices) // WGL: added IG argument
573 {
574  SGVector<float64_t> labels_vec=(dynamic_cast<CDenseLabels*>(labels))->get_labels();
575  int32_t num_vecs=labels->get_num_labels();
576  int32_t num_feats;
577  if (m_pre_sort)
578  num_feats=mat.num_cols;
579  else
580  num_feats=mat.num_rows;
581 
582  int32_t n_ulabels;
583  SGVector<float64_t> ulabels=get_unique_labels(labels_vec,n_ulabels);
584 
585  // if all labels same early stop
586  if (n_ulabels==1)
587  return -1;
588 
589  float64_t delta=0;
590  if (m_mode==PT_REGRESSION)
591  delta=m_label_epsilon;
592 
593  SGVector<float64_t> total_wclasses(n_ulabels);
594  total_wclasses.zero();
595 
596  SGVector<int32_t> simple_labels(num_vecs);
597  for (int32_t i=0;i<num_vecs;i++)
598  {
599  for (int32_t j=0;j<n_ulabels;j++)
600  {
601  if (CMath::abs(labels_vec[i]-ulabels[j])<=delta)
602  {
603  simple_labels[i]=j;
604  total_wclasses[j]+=weights[i];
605  break;
606  }
607  }
608  }
609 
610  SGVector<index_t> idx(num_feats);
611  idx.range_fill();
612  if (subset_size)
613  {
614  num_feats=subset_size;
615  CMath::permute(idx);
616  }
617 
618  float64_t max_gain=MIN_SPLIT_GAIN;
619  int32_t best_attribute=-1;
620  float64_t best_threshold=0;
621 
622  SGVector<int64_t> indices_mask;
623  SGVector<int32_t> count_indices(mat.num_rows);
624  count_indices.zero();
625  SGVector<int32_t> dupes(num_vecs);
626  dupes.range_fill();
627  if (m_pre_sort)
628  {
629  indices_mask = SGVector<int64_t>(mat.num_rows);
630  indices_mask.set_const(-1);
631  for(int32_t j=0;j<active_indices.size();j++)
632  {
633  if (indices_mask[active_indices[j]]>=0)
634  dupes[indices_mask[active_indices[j]]]=j;
635 
636  indices_mask[active_indices[j]]=j;
637  count_indices[active_indices[j]]++;
638  }
639  }
640 
641  for (int32_t i=0;i<num_feats;i++)
642  {
643  SGVector<float64_t> feats(num_vecs);
644  SGVector<index_t> sorted_args(num_vecs);
645  SGVector<int32_t> temp_count_indices(count_indices.size());
646  sg_memcpy(temp_count_indices.vector, count_indices.vector, sizeof(int32_t)*count_indices.size());
647 
648  if (m_pre_sort)
649  {
650  SGVector<float64_t> temp_col(mat.get_column_vector(idx[i]), mat.num_rows, false);
651  SGVector<index_t> sorted_indices(m_sorted_indices.get_column_vector(idx[i]), mat.num_rows, false);
652  int32_t count=0;
653  for(int32_t j=0;j<mat.num_rows;j++)
654  {
655  if (indices_mask[sorted_indices[j]]>=0)
656  {
657  int32_t count_index = count_indices[sorted_indices[j]];
658  while(count_index>0)
659  {
660  feats[count]=temp_col[j];
661  sorted_args[count]=indices_mask[sorted_indices[j]];
662  ++count;
663  --count_index;
664  }
665  if (count==num_vecs)
666  break;
667  }
668  }
669  }
670  else
671  {
672  for (int32_t j=0;j<num_vecs;j++)
673  feats[j]=mat(idx[i],j);
674 
675  // O(N*logN)
676  sorted_args.range_fill();
677  CMath::qsort_index(feats.vector, sorted_args.vector, feats.size());
678  }
679  int32_t n_nm_vecs=feats.vlen;
680  // number of non-missing vecs
681  while (feats[n_nm_vecs-1]==MISSING)
682  {
683  total_wclasses[simple_labels[sorted_args[n_nm_vecs-1]]]-=weights[sorted_args[n_nm_vecs-1]];
684  n_nm_vecs--;
685  }
686 
687  // if only one unique value - it cannot be used to split
688  if (feats[n_nm_vecs-1]<=feats[0]+EQ_DELTA)
689  continue;
690 
691  if (m_nominal[idx[i]])
692  {
693  SGVector<int32_t> simple_feats(num_vecs);
694  simple_feats.fill_vector(simple_feats.vector,simple_feats.vlen,-1);
695 
696  // convert to simple values
697  simple_feats[0]=0;
698  int32_t c=0;
699  for (int32_t j=1;j<n_nm_vecs;j++)
700  {
701  if (feats[j]==feats[j-1])
702  simple_feats[j]=c;
703  else
704  simple_feats[j]=(++c);
705  }
706 
707  SGVector<float64_t> ufeats(c+1);
708  ufeats[0]=feats[0];
709  int32_t u=0;
710  for (int32_t j=1;j<n_nm_vecs;j++)
711  {
712  if (feats[j]==feats[j-1])
713  continue;
714  else
715  ufeats[++u]=feats[j];
716  }
717 
718  // test all 2^(I-1)-1 possible divisions of the categories between the two child nodes
719  int32_t num_cases=CMath::pow(2,c);
720  for (int32_t k=1;k<num_cases;k++)
721  {
722  SGVector<float64_t> wleft(n_ulabels);
723  SGVector<float64_t> wright(n_ulabels);
724  wleft.zero();
725  wright.zero();
726 
727  // stores which vectors are assigned to left child
728  SGVector<bool> is_left(num_vecs);
729  is_left.fill_vector(is_left.vector,is_left.vlen,false);
730 
731  // stores which among the categorical values of chosen attribute are assigned left child
732  SGVector<bool> feats_left(c+1);
733 
734  // fill feats_left in a unique way corresponding to the case
735  for (int32_t p=0;p<c+1;p++)
736  feats_left[p]=((k/CMath::pow(2,p))%(CMath::pow(2,p+1))==1);
737 
738  // form is_left
739  for (int32_t j=0;j<n_nm_vecs;j++)
740  {
741  is_left[sorted_args[j]]=feats_left[simple_feats[j]];
742  if (is_left[sorted_args[j]])
743  wleft[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
744  else
745  wright[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
746  }
747  for (int32_t j=n_nm_vecs-1;j>=0;j--)
748  {
749  if(dupes[j]!=j)
750  is_left[j]=is_left[dupes[j]];
751  }
752 
753  float64_t g=0;
754  if (m_mode==PT_MULTICLASS)
755  g=gain(wleft,wright,total_wclasses);
756  else if (m_mode==PT_REGRESSION)
757  g=gain(wleft,wright,total_wclasses,ulabels);
758  else
759  SG_ERROR("Undefined problem statement\n");
760 
761  if (g>max_gain)
762  {
763  best_attribute=idx[i];
764  max_gain=g;
765  sg_memcpy(is_left_final.vector,is_left.vector,is_left.vlen*sizeof(bool));
766  num_missing_final=num_vecs-n_nm_vecs;
767 
768  count_left=0;
769  for (int32_t l=0;l<c+1;l++)
770  count_left=(feats_left[l])?count_left+1:count_left;
771 
772  count_right=c+1-count_left;
773 
774  int32_t l=0;
775  int32_t r=0;
776  for (int32_t w=0;w<c+1;w++)
777  {
778  if (feats_left[w])
779  left[l++]=ufeats[w];
780  else
781  right[r++]=ufeats[w];
782  }
783  }
784  }
785  }
786  else
787  {
788  // O(N)
789  SGVector<float64_t> right_wclasses=total_wclasses.clone();
790  SGVector<float64_t> left_wclasses(n_ulabels);
791  left_wclasses.zero();
792 
793  // O(N)
794  // find best split for non-nominal attribute - choose threshold (z)
795  float64_t z=feats[0];
796  right_wclasses[simple_labels[sorted_args[0]]]-=weights[sorted_args[0]];
797  left_wclasses[simple_labels[sorted_args[0]]]+=weights[sorted_args[0]];
798  for (int32_t j=1;j<n_nm_vecs;j++)
799  {
800  if (feats[j]<=z+EQ_DELTA)
801  {
802  right_wclasses[simple_labels[sorted_args[j]]]-=weights[sorted_args[j]];
803  left_wclasses[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
804  continue;
805  }
806  // O(F)
807  float64_t g=0;
808  if (m_mode==PT_MULTICLASS)
809  g=gain(left_wclasses,right_wclasses,total_wclasses);
810  else if (m_mode==PT_REGRESSION)
811  g=gain(left_wclasses,right_wclasses,total_wclasses,ulabels);
812  else
813  SG_ERROR("Undefined problem statement\n");
814 
815  if (g>max_gain)
816  {
817  max_gain=g;
818  best_attribute=idx[i];
819  best_threshold=z;
820  num_missing_final=num_vecs-n_nm_vecs;
821  }
822 
823  z=feats[j];
824  if (feats[n_nm_vecs-1]<=z+EQ_DELTA)
825  break;
826  right_wclasses[simple_labels[sorted_args[j]]]-=weights[sorted_args[j]];
827  left_wclasses[simple_labels[sorted_args[j]]]+=weights[sorted_args[j]];
828  }
829  }
830 
831  // restore total_wclasses
832  while (n_nm_vecs<feats.vlen)
833  {
834  total_wclasses[simple_labels[sorted_args[n_nm_vecs-1]]]+=weights[sorted_args[n_nm_vecs-1]];
835  n_nm_vecs++;
836  }
837  }
838 
839  if (best_attribute==-1)
840  return -1;
841 
842  if (!m_nominal[best_attribute])
843  {
844  left[0]=best_threshold;
845  right[0]=best_threshold;
846  count_left=1;
847  count_right=1;
848  if (m_pre_sort)
849  {
850  SGVector<float64_t> temp_vec(mat.get_column_vector(best_attribute), mat.num_rows, false);
851  SGVector<index_t> sorted_indices(m_sorted_indices.get_column_vector(best_attribute), mat.num_rows, false);
852  int32_t count=0;
853  for(int32_t i=0;i<mat.num_rows;i++)
854  {
855  if (indices_mask[sorted_indices[i]]>=0)
856  {
857  is_left_final[indices_mask[sorted_indices[i]]]=(temp_vec[i]<=best_threshold);
858  ++count;
859  if (count==num_vecs)
860  break;
861  }
862  }
863  for (int32_t i=num_vecs-1;i>=0;i--)
864  {
865  if(dupes[i]!=i)
866  is_left_final[i]=is_left_final[dupes[i]];
867  }
868 
869  }
870  else
871  {
872  for (int32_t i=0;i<num_vecs;i++)
873  is_left_final[i]=(mat(best_attribute,i)<=best_threshold);
874  }
875  }
876  // WGL: store IG value
877  IG = max_gain;
878  return best_attribute;
879 }
880 
881 SGVector<bool> CMyCARTree::surrogate_split(SGMatrix<float64_t> m,SGVector<float64_t> weights, SGVector<bool> nm_left, int32_t attr)
882 {
883  // return vector - left/right belongingness
884  SGVector<bool> ret(m.num_cols);
885 
886  // distribute data with known attribute values
887  int32_t l=0;
888  float64_t p_l=0.;
889  float64_t total=0.;
890  // stores indices of vectors with missing attribute
891  CDynamicArray<int32_t>* missing_vecs=new CDynamicArray<int32_t>();
892  // stores lambda values corresponding to missing vectors - initialized all with 0
893  CDynamicArray<float64_t>* association_index=new CDynamicArray<float64_t>();
894  for (int32_t i=0;i<m.num_cols;i++)
895  {
896  if (!CMath::fequals(m(attr,i),MISSING,0))
897  {
898  ret[i]=nm_left[l];
899  total+=weights[i];
900  if (nm_left[l++])
901  p_l+=weights[i];
902  }
903  else
904  {
905  missing_vecs->push_back(i);
906  association_index->push_back(0.);
907  }
908  }
909 
910  // for lambda calculation
911  float64_t p_r=(total-p_l)/total;
912  p_l/=total;
913  float64_t p=CMath::min(p_r,p_l);
914 
915  // for each attribute (X') alternative to best split (X)
916  for (int32_t i=0;i<m.num_rows;i++)
917  {
918  if (i==attr)
919  continue;
920 
921  // find set of vectors with non-missing values for both X and X'
922  CDynamicArray<int32_t>* intersect_vecs=new CDynamicArray<int32_t>();
923  for (int32_t j=0;j<m.num_cols;j++)
924  {
925  if (!(CMath::fequals(m(i,j),MISSING,0) || CMath::fequals(m(attr,j),MISSING,0)))
926  intersect_vecs->push_back(j);
927  }
928 
929  if (intersect_vecs->get_num_elements()==0)
930  {
931  SG_UNREF(intersect_vecs);
932  continue;
933  }
934 
935 
936  if (m_nominal[i])
937  handle_missing_vecs_for_nominal_surrogate(m,missing_vecs,association_index,intersect_vecs,ret,weights,p,i);
938  else
939  handle_missing_vecs_for_continuous_surrogate(m,missing_vecs,association_index,intersect_vecs,ret,weights,p,i);
940 
941  SG_UNREF(intersect_vecs);
942  }
943 
944  // if some missing attribute vectors are yet not addressed, use majority rule
945  for (int32_t i=0;i<association_index->get_num_elements();i++)
946  {
947  if (association_index->get_element(i)==0.)
948  ret[missing_vecs->get_element(i)]=(p_l>=p_r);
949  }
950 
951  SG_UNREF(missing_vecs);
952  SG_UNREF(association_index);
953  return ret;
954 }
955 
956 void CMyCARTree::handle_missing_vecs_for_continuous_surrogate(SGMatrix<float64_t> m, CDynamicArray<int32_t>* missing_vecs,
957  CDynamicArray<float64_t>* association_index, CDynamicArray<int32_t>* intersect_vecs, SGVector<bool> is_left,
958  SGVector<float64_t> weights, float64_t p, int32_t attr)
959 {
960  // for lambda calculation - total weight of all vectors in X intersect X'
961  float64_t denom=0.;
962  SGVector<float64_t> feats(intersect_vecs->get_num_elements());
963  for (int32_t j=0;j<intersect_vecs->get_num_elements();j++)
964  {
965  feats[j]=m(attr,intersect_vecs->get_element(j));
966  denom+=weights[intersect_vecs->get_element(j)];
967  }
968 
969  // unique feature values for X'
970  int32_t num_unique=feats.unique(feats.vector,feats.vlen);
971 
972 
973  // all possible splits for chosen attribute
974  for (int32_t j=0;j<num_unique-1;j++)
975  {
976  float64_t z=feats[j];
977  float64_t numer=0.;
978  float64_t numerc=0.;
979  for (int32_t k=0;k<intersect_vecs->get_num_elements();k++)
980  {
981  // if both go left or both go right
982  if ((m(attr,intersect_vecs->get_element(k))<=z) && is_left[intersect_vecs->get_element(k)])
983  numer+=weights[intersect_vecs->get_element(k)];
984  else if ((m(attr,intersect_vecs->get_element(k))>z) && !is_left[intersect_vecs->get_element(k)])
985  numer+=weights[intersect_vecs->get_element(k)];
986  // complementary split cases - one goes left other right
987  else if ((m(attr,intersect_vecs->get_element(k))<=z) && !is_left[intersect_vecs->get_element(k)])
988  numerc+=weights[intersect_vecs->get_element(k)];
989  else if ((m(attr,intersect_vecs->get_element(k))>z) && is_left[intersect_vecs->get_element(k)])
990  numerc+=weights[intersect_vecs->get_element(k)];
991  }
992 
993  float64_t lambda=0.;
994  if (numer>=numerc)
995  lambda=(p-(1-numer/denom))/p;
996  else
997  lambda=(p-(1-numerc/denom))/p;
998  for (int32_t k=0;k<missing_vecs->get_num_elements();k++)
999  {
1000  if ((lambda>association_index->get_element(k)) &&
1001  (!CMath::fequals(m(attr,missing_vecs->get_element(k)),MISSING,0)))
1002  {
1003  association_index->set_element(lambda,k);
1004  if (numer>=numerc)
1005  is_left[missing_vecs->get_element(k)]=(m(attr,missing_vecs->get_element(k))<=z);
1006  else
1007  is_left[missing_vecs->get_element(k)]=(m(attr,missing_vecs->get_element(k))>z);
1008  }
1009  }
1010  }
1011 }
1012 
1013 void CMyCARTree::handle_missing_vecs_for_nominal_surrogate(SGMatrix<float64_t> m, CDynamicArray<int32_t>* missing_vecs,
1014  CDynamicArray<float64_t>* association_index, CDynamicArray<int32_t>* intersect_vecs, SGVector<bool> is_left,
1015  SGVector<float64_t> weights, float64_t p, int32_t attr)
1016 {
1017  // for lambda calculation - total weight of all vectors in X intersect X'
1018  float64_t denom=0.;
1019  SGVector<float64_t> feats(intersect_vecs->get_num_elements());
1020  for (int32_t j=0;j<intersect_vecs->get_num_elements();j++)
1021  {
1022  feats[j]=m(attr,intersect_vecs->get_element(j));
1023  denom+=weights[intersect_vecs->get_element(j)];
1024  }
1025 
1026  // unique feature values for X'
1027  int32_t num_unique=feats.unique(feats.vector,feats.vlen);
1028 
1029  // scan all splits for chosen alternative attribute X'
1030  int32_t num_cases=CMath::pow(2,(num_unique-1));
1031  for (int32_t j=1;j<num_cases;j++)
1032  {
1033  SGVector<bool> feats_left(num_unique);
1034  for (int32_t k=0;k<num_unique;k++)
1035  feats_left[k]=((j/CMath::pow(2,k))%(CMath::pow(2,k+1))==1);
1036 
1037  SGVector<bool> intersect_vecs_left(intersect_vecs->get_num_elements());
1038  for (int32_t k=0;k<intersect_vecs->get_num_elements();k++)
1039  {
1040  for (int32_t q=0;q<num_unique;q++)
1041  {
1042  if (feats[q]==m(attr,intersect_vecs->get_element(k)))
1043  {
1044  intersect_vecs_left[k]=feats_left[q];
1045  break;
1046  }
1047  }
1048  }
1049 
1050  float64_t numer=0.;
1051  float64_t numerc=0.;
1052  for (int32_t k=0;k<intersect_vecs->get_num_elements();k++)
1053  {
1054  // if both go left or both go right
1055  if (intersect_vecs_left[k]==is_left[intersect_vecs->get_element(k)])
1056  numer+=weights[intersect_vecs->get_element(k)];
1057  else
1058  numerc+=weights[intersect_vecs->get_element(k)];
1059  }
1060 
1061  // lambda for this split (2 case identical split/complementary split)
1062  float64_t lambda=0.;
1063  if (numer>=numerc)
1064  lambda=(p-(1-numer/denom))/p;
1065  else
1066  lambda=(p-(1-numerc/denom))/p;
1067 
1068  // address missing value vectors not yet addressed or addressed using worse split
1069  for (int32_t k=0;k<missing_vecs->get_num_elements();k++)
1070  {
1071  if ((lambda>association_index->get_element(k)) &&
1072  (!CMath::fequals(m(attr,missing_vecs->get_element(k)),MISSING,0)))
1073  {
1074  association_index->set_element(lambda,k);
1075  // decide left/right based on which feature value the chosen data point has
1076  for (int32_t q=0;q<num_unique;q++)
1077  {
1078  if (feats[q]==m(attr,missing_vecs->get_element(k)))
1079  {
1080  if (numer>=numerc)
1081  is_left[missing_vecs->get_element(k)]=feats_left[q];
1082  else
1083  is_left[missing_vecs->get_element(k)]=!feats_left[q];
1084 
1085  break;
1086  }
1087  }
1088  }
1089  }
1090  }
1091 }
1092 
1093 float64_t CMyCARTree::gain(SGVector<float64_t> wleft, SGVector<float64_t> wright, SGVector<float64_t> wtotal,
1094  SGVector<float64_t> feats)
1095 {
1096  float64_t total_lweight=0;
1097  float64_t total_rweight=0;
1098  float64_t total_weight=0;
1099 
1100  float64_t lsd_n=least_squares_deviation(feats,wtotal,total_weight);
1101  float64_t lsd_l=least_squares_deviation(feats,wleft,total_lweight);
1102  float64_t lsd_r=least_squares_deviation(feats,wright,total_rweight);
1103 
1104  return lsd_n-(lsd_l*(total_lweight/total_weight))-(lsd_r*(total_rweight/total_weight));
1105 }
1106 
1107 float64_t CMyCARTree::gain(const SGVector<float64_t>& wleft, const SGVector<float64_t>& wright, const SGVector<float64_t>& wtotal)
1108 {
1109  float64_t total_lweight=0;
1110  float64_t total_rweight=0;
1111  float64_t total_weight=0;
1112 
1113  float64_t gini_n=gini_impurity_index(wtotal,total_weight);
1114  float64_t gini_l=gini_impurity_index(wleft,total_lweight);
1115  float64_t gini_r=gini_impurity_index(wright,total_rweight);
1116  return gini_n-(gini_l*(total_lweight/total_weight))-(gini_r*(total_rweight/total_weight));
1117 }
1118 
1119 float64_t CMyCARTree::gini_impurity_index(const SGVector<float64_t>& weighted_lab_classes, float64_t &total_weight)
1120 {
1121  Map<VectorXd> map_weighted_lab_classes(weighted_lab_classes.vector, weighted_lab_classes.size());
1122  total_weight=map_weighted_lab_classes.sum();
1123  float64_t gini=map_weighted_lab_classes.dot(map_weighted_lab_classes);
1124 
1125  gini=1.0-(gini/(total_weight*total_weight));
1126  return gini;
1127 }
1128 
1129 float64_t CMyCARTree::least_squares_deviation(const SGVector<float64_t>& feats, const SGVector<float64_t>& weights, float64_t &total_weight)
1130 {
1131 
1132  Map<VectorXd> map_weights(weights.vector, weights.size());
1133  Map<VectorXd> map_feats(feats.vector, weights.size());
1134  float64_t mean=map_weights.dot(map_feats);
1135  total_weight=map_weights.sum();
1136 
1137  mean/=total_weight;
1138  float64_t dev=0;
1139  for (int32_t i=0;i<weights.vlen;i++)
1140  dev+=weights[i]*(feats[i]-mean)*(feats[i]-mean);
1141 
1142  return dev/total_weight;
1143 }
1144 CLabels* CMyCARTree::apply_from_current_node(CDenseFeatures<float64_t>* feats, bnode_t* current,
1145  bool set_certainty)
1146 {
1147  int32_t num_vecs=feats->get_num_vectors();
1148  REQUIRE(num_vecs>0, "No data provided in apply\n");
1149 
1150  if (set_certainty)
1151  m_certainty=SGVector<float64_t>(num_vecs);
1152 
1153  SGVector<float64_t> labels(num_vecs);
1154  for (int32_t i=0;i<num_vecs;i++)
1155  {
1156  SGVector<float64_t> sample=feats->get_feature_vector(i);
1157  bnode_t* node=current;
1158  SG_REF(node);
1159 
1160  // until leaf is reached
1161  while(node->data.num_leaves!=1)
1162  {
1163  bnode_t* leftchild=node->left();
1164 
1165  if (m_nominal[node->data.attribute_id])
1166  {
1167  SGVector<float64_t> comp=leftchild->data.transit_into_values;
1168  bool flag=false;
1169  for (int32_t k=0;k<comp.vlen;k++)
1170  {
1171  if (comp[k]==sample[node->data.attribute_id])
1172  {
1173  flag=true;
1174  break;
1175  }
1176  }
1177 
1178  if (flag)
1179  {
1180  SG_UNREF(node);
1181  node=leftchild;
1182  SG_REF(leftchild);
1183  }
1184  else
1185  {
1186  SG_UNREF(node);
1187  node=node->right();
1188  }
1189  }
1190  else
1191  {
1192  if (sample[node->data.attribute_id]<=leftchild->data.transit_into_values[0])
1193  {
1194  SG_UNREF(node);
1195  node=leftchild;
1196  SG_REF(leftchild);
1197  }
1198  else
1199  {
1200  SG_UNREF(node);
1201  node=node->right();
1202  }
1203  }
1204 
1205  SG_UNREF(leftchild);
1206  }
1207 
1208  labels[i]=node->data.node_label;
1209 
1210  if (set_certainty) // WGL: set certainty of assignment based on sample weights in leaf
1211  m_certainty[i]=((node->data.total_weight-node->data.weight_minus_node)/
1212  node->data.total_weight);
1213 
1214  SG_UNREF(node);
1215  }
1216 
1217  switch(m_mode)
1218  {
1219  case PT_MULTICLASS:
1220  {
1221  CMulticlassLabels* mlabels=new CMulticlassLabels(labels);
1222  return mlabels;
1223  }
1224 
1225  case PT_REGRESSION:
1226  {
1227  CRegressionLabels* rlabels=new CRegressionLabels(labels);
1228  return rlabels;
1229  }
1230 
1231  default:
1232  SG_ERROR("mode should be either PT_MULTICLASS or PT_REGRESSION\n");
1233  }
1234 
1235  return NULL;
1236 }
1237 // WGL: get the certainty of an output. TODO: change to set_probabilities()
1238 SGVector<float64_t> CMyCARTree::get_certainty_vector() const
1239 {
1240  return m_certainty;
1241 }
1242 
1243 void CMyCARTree::prune_by_cross_validation(CDenseFeatures<float64_t>* data, int32_t folds)
1244 {
1245  int32_t num_vecs=data->get_num_vectors();
1246 
1247  // divide data into V folds randomly
1248  SGVector<int32_t> subid(num_vecs);
1249  subid.random_vector(subid.vector,subid.vlen,0,folds-1);
1250 
1251  // for each fold subset
1252  CDynamicArray<float64_t>* r_cv=new CDynamicArray<float64_t>();
1253  CDynamicArray<float64_t>* alphak=new CDynamicArray<float64_t>();
1254  SGVector<int32_t> num_alphak(folds);
1255  for (int32_t i=0;i<folds;i++)
1256  {
1257  // for chosen fold, create subset for training parameters
1258  CDynamicArray<int32_t>* test_indices=new CDynamicArray<int32_t>();
1259  CDynamicArray<int32_t>* train_indices=new CDynamicArray<int32_t>();
1260  for (int32_t j=0;j<num_vecs;j++)
1261  {
1262  if (subid[j]==i)
1263  test_indices->push_back(j);
1264  else
1265  train_indices->push_back(j);
1266  }
1267 
1268  if (test_indices->get_num_elements()==0 || train_indices->get_num_elements()==0)
1269  {
1270  SG_ERROR("Unfortunately you have reached the very low probability event where at least one of "
1271  "the subsets in cross-validation is not represented at all. Please re-run.")
1272  }
1273 
1274  SGVector<int32_t> subset(train_indices->get_array(),train_indices->get_num_elements(),false);
1275  data->add_subset(subset);
1276  m_labels->add_subset(subset);
1277  SGVector<float64_t> subset_weights(train_indices->get_num_elements());
1278  for (int32_t j=0;j<train_indices->get_num_elements();j++)
1279  subset_weights[j]=m_weights[train_indices->get_element(j)];
1280 
1281  // train with training subset
1282  bnode_t* root=CARTtrain(data,subset_weights,m_labels,0);
1283 
1284  // prune trained tree
1285  CTreeMachine<MyCARTreeNodeData>* tmax=new CTreeMachine<MyCARTreeNodeData>();
1286  tmax->set_root(root);
1287  CDynamicObjectArray* pruned_trees=prune_tree(tmax);
1288 
1289  data->remove_subset();
1290  m_labels->remove_subset();
1291  subset=SGVector<int32_t>(test_indices->get_array(),test_indices->get_num_elements(),false);
1292  data->add_subset(subset);
1293  m_labels->add_subset(subset);
1294  subset_weights=SGVector<float64_t>(test_indices->get_num_elements());
1295  for (int32_t j=0;j<test_indices->get_num_elements();j++)
1296  subset_weights[j]=m_weights[test_indices->get_element(j)];
1297 
1298  // calculate R_CV values for each alpha_k using test subset and store them
1299  num_alphak[i]=m_alphas->get_num_elements();
1300  for (int32_t j=0;j<m_alphas->get_num_elements();j++)
1301  {
1302  alphak->push_back(m_alphas->get_element(j));
1303  CSGObject* jth_element=pruned_trees->get_element(j);
1304  bnode_t* current_root=NULL;
1305  if (jth_element!=NULL)
1306  current_root=dynamic_cast<bnode_t*>(jth_element);
1307  else
1308  SG_ERROR("%d element is NULL which should not be",j);
1309 
1310  CLabels* labels=apply_from_current_node(data, current_root);
1311  float64_t error=compute_error(labels, m_labels, subset_weights);
1312  r_cv->push_back(error);
1313  SG_UNREF(labels);
1314  SG_UNREF(jth_element);
1315  }
1316 
1317  data->remove_subset();
1318  m_labels->remove_subset();
1319  SG_UNREF(train_indices);
1320  SG_UNREF(test_indices);
1321  SG_UNREF(tmax);
1322  SG_UNREF(pruned_trees);
1323  }
1324 
1325  // prune the original T_max
1326  CDynamicObjectArray* pruned_trees=prune_tree(this);
1327 
1328  // find subtree with minimum R_cv
1329  int32_t min_index=-1;
1330  float64_t min_r_cv=CMath::MAX_REAL_NUMBER;
1331  for (int32_t i=0;i<m_alphas->get_num_elements();i++)
1332  {
1333  float64_t alpha=0.;
1334  if (i==m_alphas->get_num_elements()-1)
1335  alpha=m_alphas->get_element(i)+1;
1336  else
1337  alpha=CMath::sqrt(m_alphas->get_element(i)*m_alphas->get_element(i+1));
1338 
1339  float64_t rv=0.;
1340  int32_t base=0;
1341  for (int32_t j=0;j<folds;j++)
1342  {
1343  bool flag=false;
1344  for (int32_t k=base;k<num_alphak[j]+base-1;k++)
1345  {
1346  if (alphak->get_element(k)<=alpha && alphak->get_element(k+1)>alpha)
1347  {
1348  rv+=r_cv->get_element(k);
1349  flag=true;
1350  break;
1351  }
1352  }
1353 
1354  if (!flag)
1355  rv+=r_cv->get_element(num_alphak[j]+base-1);
1356 
1357  base+=num_alphak[j];
1358  }
1359 
1360  if (rv<min_r_cv)
1361  {
1362  min_index=i;
1363  min_r_cv=rv;
1364  }
1365  }
1366 
1367  CSGObject* element=pruned_trees->get_element(min_index);
1368  bnode_t* best_tree_root=NULL;
1369  if (element!=NULL)
1370  best_tree_root=dynamic_cast<bnode_t*>(element);
1371  else
1372  SG_ERROR("%d element is NULL which should not be",min_index);
1373 
1374  this->set_root(best_tree_root);
1375 
1376  SG_UNREF(element);
1377  SG_UNREF(pruned_trees);
1378  SG_UNREF(r_cv);
1379  SG_UNREF(alphak);
1380 }
1381 
1382 float64_t CMyCARTree::compute_error(CLabels* labels, CLabels* reference, SGVector<float64_t> weights)
1383 {
1384  REQUIRE(labels,"input labels cannot be NULL");
1385  REQUIRE(reference,"reference labels cannot be NULL")
1386 
1387  CDenseLabels* gnd_truth=dynamic_cast<CDenseLabels*>(reference);
1388  CDenseLabels* result=dynamic_cast<CDenseLabels*>(labels);
1389 
1390  float64_t denom=weights.sum(weights);
1391  float64_t numer=0.;
1392  switch (m_mode)
1393  {
1394  case PT_MULTICLASS:
1395  {
1396  for (int32_t i=0;i<weights.vlen;i++)
1397  {
1398  if (gnd_truth->get_label(i)!=result->get_label(i))
1399  numer+=weights[i];
1400  }
1401 
1402  return numer/denom;
1403  }
1404 
1405  case PT_REGRESSION:
1406  {
1407  for (int32_t i=0;i<weights.vlen;i++)
1408  numer+=weights[i]*CMath::pow((gnd_truth->get_label(i)-result->get_label(i)),2);
1409 
1410  return numer/denom;
1411  }
1412 
1413  default:
1414  SG_ERROR("Case not possible\n");
1415  }
1416 
1417  return 0.;
1418 }
1419 
1420 CDynamicObjectArray* CMyCARTree::prune_tree(CTreeMachine<MyCARTreeNodeData>* tree)
1421 {
1422  REQUIRE(tree, "Tree not provided for pruning.\n");
1423 
1424  CDynamicObjectArray* trees=new CDynamicObjectArray();
1425  SG_UNREF(m_alphas);
1426  m_alphas=new CDynamicArray<float64_t>();
1427  SG_REF(m_alphas);
1428 
1429  // base tree alpha_k=0
1430  m_alphas->push_back(0);
1431  CTreeMachine<MyCARTreeNodeData>* t1=tree->clone_tree();
1432  SG_REF(t1);
1433  node_t* t1root=t1->get_root();
1434  bnode_t* t1_root=NULL;
1435  if (t1root!=NULL)
1436  t1_root=dynamic_cast<bnode_t*>(t1root);
1437  else
1438  SG_ERROR("t1_root is NULL. This is not expected\n")
1439 
1440  form_t1(t1_root);
1441  trees->push_back(t1_root);
1442  while(t1_root->data.num_leaves>1)
1443  {
1444  CTreeMachine<MyCARTreeNodeData>* t2=t1->clone_tree();
1445  SG_REF(t2);
1446 
1447  node_t* t2root=t2->get_root();
1448  bnode_t* t2_root=NULL;
1449  if (t2root!=NULL)
1450  t2_root=dynamic_cast<bnode_t*>(t2root);
1451  else
1452  SG_ERROR("t2_root is NULL. This is not expected\n")
1453 
1454  float64_t a_k=find_weakest_alpha(t2_root);
1455  m_alphas->push_back(a_k);
1456  cut_weakest_link(t2_root,a_k);
1457  trees->push_back(t2_root);
1458 
1459  SG_UNREF(t1);
1460  SG_UNREF(t1_root);
1461  t1=t2;
1462  t1_root=t2_root;
1463  }
1464 
1465  SG_UNREF(t1);
1466  SG_UNREF(t1_root);
1467  return trees;
1468 }
1469 
1470 float64_t CMyCARTree::find_weakest_alpha(bnode_t* node)
1471 {
1472  if (node->data.num_leaves!=1)
1473  {
1474  bnode_t* left=node->left();
1475  bnode_t* right=node->right();
1476 
1477  SGVector<float64_t> weak_links(3);
1478  weak_links[0]=find_weakest_alpha(left);
1479  weak_links[1]=find_weakest_alpha(right);
1480  weak_links[2]=(node->data.weight_minus_node-node->data.weight_minus_branch)/node->data.total_weight;
1481  weak_links[2]/=(node->data.num_leaves-1.0);
1482 
1483  SG_UNREF(left);
1484  SG_UNREF(right);
1485  return CMath::min(weak_links.vector,weak_links.vlen);
1486  }
1487 
1488  return CMath::MAX_REAL_NUMBER;
1489 }
1490 
1491 void CMyCARTree::cut_weakest_link(bnode_t* node, float64_t alpha)
1492 {
1493  if (node->data.num_leaves==1)
1494  return;
1495 
1496  float64_t g=(node->data.weight_minus_node-node->data.weight_minus_branch)/node->data.total_weight;
1497  g/=(node->data.num_leaves-1.0);
1498  if (alpha==g)
1499  {
1500  node->data.num_leaves=1;
1501  node->data.weight_minus_branch=node->data.weight_minus_node;
1502  CDynamicObjectArray* children=new CDynamicObjectArray();
1503  node->set_children(children);
1504 
1505  SG_UNREF(children);
1506  }
1507  else
1508  {
1509  bnode_t* left=node->left();
1510  bnode_t* right=node->right();
1511  cut_weakest_link(left,alpha);
1512  cut_weakest_link(right,alpha);
1513  node->data.num_leaves=left->data.num_leaves+right->data.num_leaves;
1514  node->data.weight_minus_branch=left->data.weight_minus_branch+right->data.weight_minus_branch;
1515 
1516  SG_UNREF(left);
1517  SG_UNREF(right);
1518  }
1519 }
1520 
1521 void CMyCARTree::form_t1(bnode_t* node)
1522 {
1523  if (node->data.num_leaves!=1)
1524  {
1525  bnode_t* left=node->left();
1526  bnode_t* right=node->right();
1527 
1528  form_t1(left);
1529  form_t1(right);
1530 
1531  node->data.num_leaves=left->data.num_leaves+right->data.num_leaves;
1532  node->data.weight_minus_branch=left->data.weight_minus_branch+right->data.weight_minus_branch;
1533  if (node->data.weight_minus_node==node->data.weight_minus_branch)
1534  {
1535  node->data.num_leaves=1;
1536  CDynamicObjectArray* children=new CDynamicObjectArray();
1537  node->set_children(children);
1538 
1539  SG_UNREF(children);
1540  }
1541 
1542  SG_UNREF(left);
1543  SG_UNREF(right);
1544  }
1545 }
1546 
1547 void CMyCARTree::init()
1548 {
1549  m_nominal=SGVector<bool>();
1550  m_weights=SGVector<float64_t>();
1551  m_mode=PT_MULTICLASS;
1552  m_pre_sort=false;
1553  m_types_set=false;
1554  m_weights_set=false;
1555  m_apply_cv_pruning=false;
1556  m_folds=5;
1557  m_alphas=new CDynamicArray<float64_t>();
1558  SG_REF(m_alphas);
1559  m_max_depth=0;
1560  m_min_node_size=0;
1561  m_label_epsilon=1e-7;
1562  m_sorted_features=SGMatrix<float64_t>();
1563  m_sorted_indices=SGMatrix<index_t>();
1564 
1565  SG_ADD(&m_pre_sort, "m_pre_sort", "presort", MS_NOT_AVAILABLE);
1566  SG_ADD(&m_sorted_features, "m_sorted_features", "sorted feats", MS_NOT_AVAILABLE);
1567  SG_ADD(&m_sorted_indices, "m_sorted_indices", "sorted indices", MS_NOT_AVAILABLE);
1568  SG_ADD(&m_nominal, "m_nominal", "feature types", MS_NOT_AVAILABLE);
1569  SG_ADD(&m_weights, "m_weights", "weights", MS_NOT_AVAILABLE);
1570  SG_ADD(&m_weights_set, "m_weights_set", "weights set", MS_NOT_AVAILABLE);
1571  SG_ADD(&m_types_set, "m_types_set", "feature types set", MS_NOT_AVAILABLE);
1572  SG_ADD(&m_apply_cv_pruning, "m_apply_cv_pruning", "apply cross validation pruning", MS_NOT_AVAILABLE);
1573  SG_ADD(&m_folds, "m_folds", "number of subsets for cross validation", MS_NOT_AVAILABLE);
1574  SG_ADD(&m_max_depth, "m_max_depth", "max allowed tree depth", MS_NOT_AVAILABLE)
1575  SG_ADD(&m_min_node_size, "m_min_node_size", "min allowed node size", MS_NOT_AVAILABLE)
1576  SG_ADD(&m_label_epsilon, "m_label_epsilon", "epsilon for labels", MS_NOT_AVAILABLE)
1577  SG_ADD((machine_int_t*)&m_mode, "m_mode", "problem type (multiclass or regression)", MS_NOT_AVAILABLE)
1578 }
1579 // WGL: method to return importance scores.
1580 std::vector<double> CMyCARTree::feature_importances()
1581 {
1585  // need to get feature sizes
1586 
1587  //cout << "Getting features\n";
1588  SGVector<bool> dt = get_feature_types();
1589  vector<double> importances(dt.size(),0.0); //set to zero for all attributes
1590 
1591  bnode_t* node = dynamic_cast<bnode_t*>(m_root);
1592  get_importance(node, importances);
1593 
1594  return importances;
1595 }
1596 
1597 void CMyCARTree::get_importance(bnode_t* node, vector<double>& importances)
1598 {
1599 
1600  if (node->data.num_leaves!=1)
1601  {
1602  bnode_t* left=node->left();
1603  bnode_t* right=node->right();
1604 
1605 
1606  importances[node->data.attribute_id] += node->data.IG;
1607  get_importance(left,importances);
1608  get_importance(right,importances);
1609 
1610  SG_UNREF(left);
1611  SG_UNREF(right);
1612  }
1613 }
1614 // WGL: updated definitions to allow probabilities to be calculated
1615 void CMyCARTree::set_probabilities(CLabels* labels, CFeatures* data)
1616 {
1617  /* cout << "in set_probabilities\n"; */
1618  /* cout << "labels: " << labels << "\n"; */
1619  int size = labels->get_num_labels();
1620  /* cout << "size: " << size << "\n"; */
1621  if (m_certainty.size() != size)
1622  std::cout << "ERROR: mismatch in size between m_certainty and labels\n";
1623  // set probabilities using m_certainty
1624  for (int i = 0; i < size; ++i)
1625  {
1626  if (labels->get_value(i) > 0)
1627  labels->set_value(m_certainty[i],i);
1628  else
1629  labels->set_value(1-m_certainty[i],i);
1630  }
1631  /* cout << "set labels to \n"; */
1632  /* for (int i = 0; i < size; ++i) */
1633  /* { */
1634  /* cout << labels->get_value(i) << ", "; */
1635  /* } */
1636  /* cout << "\n"; */
1637 }
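
A minimal usage sketch is given below for orientation. It is not part of MyCARTree.cc; it only illustrates how the class defined in this listing might be driven from a standalone program. The headers pulled in, the main() wrapper, and the data values are assumptions based on the Shogun calls visible above (CDenseFeatures, CMulticlassLabels, SG_REF/SG_UNREF, init_shogun_with_defaults), not code shipped with Feat.

    // Hypothetical usage sketch for CMyCARTree; values are illustrative only.
    #include "MyCARTree.h"
    #include <shogun/base/init.h>
    #include <shogun/features/DenseFeatures.h>
    #include <shogun/labels/MulticlassLabels.h>
    #include <iostream>
    #include <vector>

    using namespace shogun;

    int main()
    {
        init_shogun_with_defaults();

        // 4 samples with 2 continuous features; Shogun stores one sample per column
        SGMatrix<float64_t> X(2, 4);
        X(0,0)=0.0; X(1,0)=1.0;   X(0,1)=0.2; X(1,1)=0.9;
        X(0,2)=1.0; X(1,2)=0.1;   X(0,3)=0.8; X(1,3)=0.0;

        SGVector<float64_t> y(4);
        y[0]=0; y[1]=0; y[2]=1; y[3]=1;

        CDenseFeatures<float64_t>* feats = new CDenseFeatures<float64_t>(X);
        CMulticlassLabels* labels = new CMulticlassLabels(y);
        SG_REF(feats);
        SG_REF(labels);

        // both attributes are continuous (true would mark an attribute as nominal)
        SGVector<bool> ft(2);
        ft[0] = false;
        ft[1] = false;

        CMyCARTree* tree = new CMyCARTree(ft, PT_MULTICLASS);
        SG_REF(tree);
        tree->set_labels(labels);
        tree->set_max_depth(3);
        tree->train(feats);   // CMachine::train dispatches to train_machine above

        CMulticlassLabels* pred = tree->apply_multiclass(feats);
        std::vector<double> imp = tree->feature_importances();
        for (size_t i = 0; i < imp.size(); ++i)
            std::cout << "feature " << i << " importance: " << imp[i] << "\n";

        SG_UNREF(pred);
        SG_UNREF(tree);
        SG_UNREF(labels);
        SG_UNREF(feats);
        exit_shogun();
        return 0;
    }

For regression the same pattern applies with PT_REGRESSION, CRegressionLabels for the targets, and apply_regression() for prediction.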