Feat C++ API
A feature engineering automation tool
shogun::CMyRandomCARTree Class Reference

This class implements the randomized CART algorithm used to grow the candidate trees in the Random Forests algorithm. The tree growing process differs from the original CART algorithm in which input attributes are considered for each node split. In randomized CART, a fixed number of attributes is chosen at random from all available attributes when deciding the best split, whereas the original CART considers all available attributes when deciding the best split. More...

#include <MyRandomCARTree.h>

Inheritance diagram for shogun::CMyRandomCARTree:
Collaboration diagram for shogun::CMyRandomCARTree:

Public Member Functions

 CMyRandomCARTree ()
 
virtual ~CMyRandomCARTree ()
 
virtual const char * get_name () const
 
void set_feature_subset_size (index_t size)
 
index_t get_feature_subset_size () const
 
CMyRandomCARTree * clone ()
 
- Public Member Functions inherited from shogun::CMyCARTree
 CMyCARTree ()
 This class implements the Classification And Regression Trees (CART) algorithm by Breiman et al. for decision tree learning. A CART tree is a binary decision tree that is constructed by splitting a node into two child nodes repeatedly, beginning with the root node that contains the whole dataset.

TREE GROWING PROCESS :
During the tree growing process, we recursively split a node into a left child and a right child so that the resulting nodes are the "purest". We do this until any of the stopping criteria is met. To find the best split, we scan through all possible splits in all predictive attributes. The best split is the one that maximises some splitting criterion. For classification tasks, i.e. when the dependent attribute is categorical, the Gini index is used. For regression tasks, i.e. when the dependent variable is continuous, least squares deviation is used. The algorithm uses two stopping criteria: either a node becomes completely "pure", i.e. all its members have an identical dependent variable, or all of its members have identical predictive attributes (independent variables).

More...
 
 CMyCARTree (SGVector< bool > attribute_types, EProblemType prob_type=PT_MULTICLASS)
 
 CMyCARTree (SGVector< bool > attribute_types, EProblemType prob_type, int32_t num_folds, bool cv_prune)
 
virtual ~CMyCARTree ()
 
virtual void set_labels (CLabels *lab)
 
virtual EProblemType get_machine_problem_type () const
 
void set_machine_problem_type (EProblemType mode)
 
virtual bool is_label_valid (CLabels *lab) const
 
virtual CBinaryLabels * apply_binary (CFeatures *data=NULL)
 
virtual CMulticlassLabels * apply_multiclass (CFeatures *data=NULL)
 
virtual CRegressionLabels * apply_regression (CFeatures *data=NULL)
 
void prune_using_test_dataset (CDenseFeatures< float64_t > *feats, CLabels *gnd_truth, SGVector< float64_t > weights=SGVector< float64_t >())
 
void set_weights (SGVector< float64_t > w)
 
SGVector< float64_t > get_weights () const
 
void clear_weights ()
 
void set_feature_types (SGVector< bool > ft)
 
SGVector< bool > get_feature_types () const
 
void clear_feature_types ()
 
int32_t get_num_folds () const
 
void set_num_folds (int32_t folds)
 
int32_t get_max_depth () const
 
void set_max_depth (int32_t depth)
 
int32_t get_min_node_size () const
 
void set_min_node_size (int32_t nsize)
 
void set_cv_pruning (bool cv_pruning)
 
float64_t get_label_epsilon ()
 
void set_label_epsilon (float64_t epsilon)
 
void pre_sort_features (CFeatures *data, SGMatrix< float64_t > &sorted_feats, SGMatrix< index_t > &sorted_indices)
 
void set_sorted_features (SGMatrix< float64_t > &sorted_feats, SGMatrix< index_t > &sorted_indices)
 
std::vector< double > feature_importances ()
 
SGVector< float64_t > get_certainty_vector () const
 
void set_probabilities (CLabels *labels, CFeatures *data=NULL)
 

Protected Member Functions

virtual index_t compute_best_attribute (const SGMatrix< float64_t > &mat, const SGVector< float64_t > &weights, CDenseLabels *labels, SGVector< float64_t > &left, SGVector< float64_t > &right, SGVector< bool > &is_left_final, index_t &num_missing, index_t &count_left, index_t &count_right, float64_t &IG, index_t subset_size=0, const SGVector< index_t > &active_indices=SGVector< index_t >())
 
- Protected Member Functions inherited from shogun::CMyCARTree
virtual bool train_machine (CFeatures *data=NULL)
 
virtual CBinaryTreeMachineNode< MyCARTreeNodeData > * CARTtrain (CFeatures *data, SGVector< float64_t > weights, CLabels *labels, int32_t level)
 
SGVector< float64_t > get_unique_labels (SGVector< float64_t > labels_vec, int32_t &n_ulabels)
 
virtual int32_t compute_best_attribute (const SGMatrix< float64_t > &mat, const SGVector< float64_t > &weights, CLabels *labels, SGVector< float64_t > &left, SGVector< float64_t > &right, SGVector< bool > &is_left_final, int32_t &num_missing, int32_t &count_left, int32_t &count_right, float64_t &IG, int32_t subset_size=0, const SGVector< int32_t > &active_indices=SGVector< index_t >())
 
SGVector< bool > surrogate_split (SGMatrix< float64_t > data, SGVector< float64_t > weights, SGVector< bool > nm_left, int32_t attr)
 
void handle_missing_vecs_for_continuous_surrogate (SGMatrix< float64_t > m, CDynamicArray< int32_t > *missing_vecs, CDynamicArray< float64_t > *association_index, CDynamicArray< int32_t > *intersect_vecs, SGVector< bool > is_left, SGVector< float64_t > weights, float64_t p, int32_t attr)
 
void handle_missing_vecs_for_nominal_surrogate (SGMatrix< float64_t > m, CDynamicArray< int32_t > *missing_vecs, CDynamicArray< float64_t > *association_index, CDynamicArray< int32_t > *intersect_vecs, SGVector< bool > is_left, SGVector< float64_t > weights, float64_t p, int32_t attr)
 
float64_t gain (SGVector< float64_t > wleft, SGVector< float64_t > wright, SGVector< float64_t > wtotal, SGVector< float64_t > labels)
 
float64_t gain (const SGVector< float64_t > &wleft, const SGVector< float64_t > &wright, const SGVector< float64_t > &wtotal)
 
float64_t gini_impurity_index (const SGVector< float64_t > &weighted_lab_classes, float64_t &total_weight)
 
float64_t least_squares_deviation (const SGVector< float64_t > &labels, const SGVector< float64_t > &weights, float64_t &total_weight)
 
CLabels * apply_from_current_node (CDenseFeatures< float64_t > *feats, bnode_t *current, bool set_certainty=false)
 
void prune_by_cross_validation (CDenseFeatures< float64_t > *data, int32_t folds)
 
float64_t compute_error (CLabels *labels, CLabels *reference, SGVector< float64_t > weights)
 
CDynamicObjectArray * prune_tree (CTreeMachine< MyCARTreeNodeData > *tree)
 
float64_t find_weakest_alpha (bnode_t *node)
 
void cut_weakest_link (bnode_t *node, float64_t alpha)
 
void form_t1 (bnode_t *node)
 
void init ()
 
void get_importance (bnode_t *node, vector< double > &importances)
 

Private Member Functions

void init ()
 

Private Attributes

index_t m_randsubset_size
 

Additional Inherited Members

- Static Public Attributes inherited from shogun::CMyCARTree
static const float64_t MISSING = CMath::MAX_REAL_NUMBER
 
static const float64_t MIN_SPLIT_GAIN = 1e-7
 
static const float64_t EQ_DELTA = 1e-7
 
- Protected Attributes inherited from shogun::CMyCARTree
float64_t m_label_epsilon
 
SGVector< bool > m_nominal
 
SGVector< float64_t > m_weights
 
SGMatrix< float64_t > m_sorted_features
 
SGMatrix< index_t > m_sorted_indices
 
bool m_pre_sort
 
bool m_types_set
 
bool m_weights_set
 
bool m_apply_cv_pruning
 
int32_t m_folds
 
EProblemType m_mode
 
CDynamicArray< float64_t > * m_alphas
 
int32_t m_max_depth
 
int32_t m_min_node_size
 
SGVector< float64_t > m_certainty
 

Detailed Description

This class implements the randomized CART algorithm used to grow the candidate trees in the Random Forests algorithm. The tree growing process differs from the original CART algorithm in which input attributes are considered for each node split. In randomized CART, a fixed number of attributes is chosen at random from all available attributes when deciding the best split, whereas the original CART considers all available attributes when deciding the best split.

Definition at line 48 of file MyRandomCARTree.h.

Constructor & Destructor Documentation

◆ CMyRandomCARTree()

CMyRandomCARTree::CMyRandomCARTree ( )

constructor

Definition at line 36 of file MyRandomCARTree.cc.

◆ ~CMyRandomCARTree()

CMyRandomCARTree::~CMyRandomCARTree ( )
virtual

destructor

Definition at line 42 of file MyRandomCARTree.cc.

Member Function Documentation

◆ clone()

CMyRandomCARTree * CMyRandomCARTree::clone ( )

Definition at line 71 of file MyRandomCARTree.cc.

◆ compute_best_attribute()

index_t CMyRandomCARTree::compute_best_attribute ( const SGMatrix< float64_t > &  mat,
const SGVector< float64_t > &  weights,
CDenseLabels *  labels,
SGVector< float64_t > &  left,
SGVector< float64_t > &  right,
SGVector< bool > &  is_left_final,
index_t &  num_missing,
index_t &  count_left,
index_t &  count_right,
float64_t &  IG,
index_t  subset_size = 0,
const SGVector< index_t > &  active_indices = SGVector<index_t>() 
)
protected virtual

computes best attribute for CARTtrain

Parameters
mat — data matrix
weights — data weights
labels — data labels
left — stores feature values for left transition
right — stores feature values for right transition
is_left_final — stores which feature vectors go to the left child
num_missing — number of missing attributes
count_left — stores number of feature values for left transition
count_right — stores number of feature values for right transition
Returns
index to the best attribute

Definition at line 52 of file MyRandomCARTree.cc.

◆ get_feature_subset_size()

index_t shogun::CMyRandomCARTree::get_feature_subset_size ( ) const
inline

get number of random features to choose in each node split

Returns
the feature subset size

Definition at line 72 of file MyRandomCARTree.h.

◆ get_name()

virtual const char* shogun::CMyRandomCARTree::get_name ( ) const
inline virtual

get name

Returns
class name CARTree

Reimplemented from shogun::CMyCARTree.

Definition at line 60 of file MyRandomCARTree.h.

◆ init()

void CMyRandomCARTree::init ( )
private

initialize parameters

Definition at line 87 of file MyRandomCARTree.cc.

◆ set_feature_subset_size()

void CMyRandomCARTree::set_feature_subset_size ( index_t  size)

set number of random features to choose in each node split

Parameters
size — subset size

Definition at line 46 of file MyRandomCARTree.cc.

Member Data Documentation

◆ m_randsubset_size

index_t shogun::CMyRandomCARTree::m_randsubset_size
private

random feature subset size

Definition at line 100 of file MyRandomCARTree.h.


The documentation for this class was generated from the following files: