Brush C++ API
A flexible interpretable machine learning framework
Loading...
Searching...
No Matches
search_space.cpp
Go to the documentation of this file.
1#include "search_space.h"
2#include "../program/program.h" // TODO: dont import this header here
3
4namespace Brush{
5
6
7float calc_initial_weight(const ArrayXf& value, const ArrayXf& y)
8{
9 // OBS: only for terminals!
10
11 // weights are initialized as the slope of the z-score of x and y.
12
13 // If y has different length from X, we get a core dump in this function.
14 // That is why Dataset makes a check for this
15 // TODO: need to make SS (or Datasaet) check for this when loading the data
16
17 vector<char> dtypes = {'f', 'f'};
18
19 MatrixXf data(value.size(), 2);
20
21 data.col(0) << value;
22 data.col(1) << y;
23
24 Normalizer n(true);
25 n.fit_normalize(data, dtypes); // normalize works row-wise
26
27 // In slope function, argument order matters (if not normalized with z-score)
28 // The feature should be the first value, and the true value the second
29 // (it will divide covar(arg1, arg2) by var(arg2)).
30 // Since z-score normalizes so mean=0 and std=1, then order doesnt matter
31 float prob_change = std::abs(slope(data.col(0).array() , // x=variable
32 data.col(1).array() )); // y=target
33
34 // having a minimum feature weight if it was not set to zero
35 if (std::abs(prob_change)<1e-5)
36 prob_change = 1e-5;
37
38 // prob_change will evaluate to nan if variance(x)==0. Features with
39 // zero variance should not be used (as they behave just like a constant).
40 if (std::isnan(prob_change))
41 prob_change = 0.0;
42
43 return prob_change;
44}
45
46
51vector<Node> generate_terminals(const Dataset& d, const bool weights_init)
52{
53 vector<Node> terminals;
54 int i = 0;
55 for ( const auto &[feature_name, value]: d.features )
56 {
57 std::visit(
58 [&](auto&& arg) {
59 using T = std::decay_t<decltype(arg)>;
60 using Scalar = typename T::Scalar;
61 constexpr bool weighted = !std::is_same_v<Scalar, bool>;
62 // if constexpr (T::Scalar == float)
63 auto n = Node(
65 Signature<T()>{},
66 weighted,
67 feature_name
68 );
69
70 float prob_change = 1.0; // default value
71
72 if (d.y.size()>0 && weights_init)
73 {
74 // if the value can be casted to float array, we can calculate slope
75 if (std::holds_alternative<ArrayXb>(value))
76 {
77 ArrayXf tmp = std::get<ArrayXb>(value).template cast<float>();
78 prob_change = calc_initial_weight(tmp, d.y);
79 }
80 else if (std::holds_alternative<ArrayXi>(value))
81 {
82 // for each variable we create a one-vs-all binary variable, then
83 // calculate slope. Final value will be the average of slopes
84
85 auto tmp = std::get<ArrayXi>(value);
86
87 //get number of unique values
88 std::map<float, bool> uniqueMap;
89 for(int i = 0; i < tmp.size(); i++)
90 uniqueMap[(float)tmp(i)] = true;
91
92 ArrayXf slopes = ArrayXf::Ones(uniqueMap.size());
93 int slopesIterator = 0;
94 for (const auto& pair : uniqueMap)
95 {
96 auto one_vs_all = ArrayXf::Ones(tmp.size()).array() * (tmp.array()==pair.first).cast<float>();
97
98 slopes[slopesIterator++] = calc_initial_weight(one_vs_all, d.y);
99 }
100
101 prob_change = slopes.mean();
102 }
103 else if (std::holds_alternative<ArrayXf>(value))
104 {
105 prob_change = calc_initial_weight(std::get<ArrayXf>(value), d.y);
106 }
107 else
108 {
109 auto msg = fmt::format("Brush coudn't calculate the initial weight of variable {}\n",feature_name);
111 }
112 }
113
114 n.set_prob_change( prob_change );
115
116 terminals.push_back(n);
117 },
118 value
119 );
120 ++i;
121 };
122
123 // iterate through terminals and take the average of values of same signature
124 auto signature_avg = [terminals, weights_init](DataType ret_type){
125 if (!weights_init)
126 return 1.0f;
127
128 float sum = 0.0;
129 int count = 0;
130
131 for (const auto& n : terminals) {
132 if (n.ret_type == ret_type && n.get_prob_change()>0) {
133 sum += n.get_prob_change();
134 count++;
135 }
136 }
137
138 if (count==0) // no terminals of datatype. return 1.0 for the constant
139 return 1.0f;
140
141 return sum / count;
142 };
143
144 // constants for each type -- floats, integers, boolean. This is useful
145 // to ensure the search space will always have something to fill up open spaces
146 // when building/modifying trees
147 auto cXf = Node(NodeType::Constant, Signature<ArrayXf()>(), true, "constF");
148 cXf.set_prob_change(signature_avg(cXf.ret_type));
149 terminals.push_back(cXf);
150
151 // Reminder: Integers are used to represent categorical variables
152 auto cXi = Node(NodeType::Constant, Signature<ArrayXi()>(), true, "constI");
153 cXi.set_prob_change(signature_avg(cXi.ret_type));
154 terminals.push_back(cXi);
155
156 // Boolean constants are needed to prevent crashes when logical operators (And, Or, Not, Geq, Equals)
157 // appear in programs and need boolean terminals as arguments, even in regression problems
158 auto cXb = Node(NodeType::Constant, Signature<ArrayXb()>(), true, "constB");
159 cXb.set_prob_change(signature_avg(cXb.ret_type));
160 terminals.push_back(cXb);
161
162 // mean label node. Does not need to be part of symbols. works only for classification
163 if (d.classification)
164 {
165 auto meanlabel = Node(NodeType::MeanLabel, Signature<ArrayXb()>(), true, "MeanLabel");
166 meanlabel.set_prob_change(1.0f); // always present in classification problems
167 terminals.push_back(meanlabel);
168 }
169
170 return terminals;
171};
172
// Global lookup of argument-type display names, keyed by hash.
// NOTE(review): populated elsewhere (likely by GenerateNodeMap's signature
// machinery) — confirm against search_space.h.
std::unordered_map<std::size_t, std::string> ArgsName;
174
175void SearchSpace::print() const {
176 std::cout << fmt::format("{}\n", *this) << std::flush;
177}
178
179void SearchSpace::init(const Dataset& d, const unordered_map<string,float>& user_ops,
180 bool weights_init)
181{
182 // fmt::print("constructing search space...\n");
183 this->node_map.clear();
184 this->node_map_weights.clear();
185 this->terminal_map.clear();
186 this->terminal_types.clear();
187 this->terminal_weights.clear();
188 this->op_names.clear();
189
190 bool use_all = user_ops.size() == 0;
191 if (use_all)
192 {
193 for (size_t op_index=0; op_index< NodeTypes::Count; op_index++){
194 op_names.push_back(
195 NodeTypeName.at(static_cast<NodeType>(1UL << op_index))
196 );
197 }
198 }
199 else
200 {
201 for (const auto& [op, weight] : user_ops)
202 op_names.push_back(op);
203 }
204
205 // create nodes based on data types
207
208 vector<Node> terminals = generate_terminals(d, weights_init);
209
210 // If it is a classification problem, we need to add the fixed root nodes
211 // (logistic for binary classification, softmax for multiclassification).
212 // Sometimes, the user may not specify these two nodes as candidates when
213 // sampling functions, so we check if they are already in the terminal set, and
214 // we add them with zero prob if they are not. They need to be in the func set
215 // when calling GenerateNodeMap, so the search_space will contain all the hashes
216 // and signatures for them (and they can be used only in program root).
217 // TODO: fix softmax and add it here
218
219 /* fmt::print("generate nodetype\n"); */
220
221 if (d.classification)
222 {
223 std::unordered_map<std::string, float> extended_user_ops=user_ops;
224
225 if (!use_all)
226 {
227 // Brush can start from a population of decision trees. Thess `if`s
228 // below ensures that split nodes will always be in the search space
229 if (extended_user_ops.find("SplitOn") == extended_user_ops.end()){
230 extended_user_ops.insert({"SplitOn", 0.0f});
231 op_names.push_back("SplitOn");
232 }
233 if (extended_user_ops.find("SplitBest") == extended_user_ops.end()){
234 extended_user_ops.insert({"SplitBest", 0.0f});
235 op_names.push_back("SplitBest");
236 }
237
238 // We need some ops in the search space so we can have the logit and offset
239 if (extended_user_ops.find("OffsetSum") == extended_user_ops.end()){
240 extended_user_ops.insert({"OffsetSum", 0.0f});
241 op_names.push_back("OffsetSum");
242 }
243
244 // Convert ArrayXf to std::vector<float> for compatibility with std::set
245 std::vector<float> vec(d.y.data(), d.y.data() + d.y.size());
246 std::set<float> unique_classes(vec.begin(), vec.end());
247
248 if (unique_classes.size()==2 && (extended_user_ops.find("Logistic") == extended_user_ops.end())) {
249 extended_user_ops.insert({"Logistic", 0.0f});
250 op_names.push_back("Logistic");
251 }
252 else if (extended_user_ops.find("Softmax") == extended_user_ops.end()) {
253 extended_user_ops.insert({"Softmax", 0.0f});
254 op_names.push_back("Softmax");
255 }
256 }
257
258 // fmt::print("generate nodetype\n");
259 GenerateNodeMap(extended_user_ops, terminal_types,
260 std::make_index_sequence<NodeTypes::OpCount>());
261 }
262 else
263 {
265 std::make_index_sequence<NodeTypes::OpCount>());
266 }
267
268 // map terminals
269 /* fmt::print("looping through terminals...\n"); */
270 for (const auto& term : terminals)
271 {
272 /* fmt::print("adding {} to search space...\n", term.get_name()); */
273 if (terminal_map.find(term.ret_type) == terminal_map.end())
274 terminal_map[term.ret_type] = vector<Node>();
275
276 /* fmt::print("terminal ret_type: {}\n", DataTypeName[term.ret_type]); */
277 terminal_map[term.ret_type].push_back(term);
278 terminal_weights[term.ret_type].push_back(term.get_prob_change());
279 }
280};
281
282std::optional<tree<Node>> SearchSpace::sample_subtree(Node root, int max_d, int max_size) const
283{
284 // public interface to use PTC2 algorithm
285
286 // PTC is designed to not fail (it will persistently try to find nodes with
287 // sampling functions). In pop initialization, this shoudnt be a problem, but
288 // during evolution, due to dynamic changes in node weights by the learners,
289 // it now may fail. We need to check, before calling it, that it has elements
290 // in search space to sample
291 auto ret_match = node_map.at(root.ret_type);
292
293 vector<float> args_w = get_weights(root.ret_type);
294
295 // at least one operator that matches the weight must have positive probability
296 if (!has_solution_space(args_w.begin(), args_w.end()))
297 return std::nullopt;
298
299 // it will always have a terminal (because we create constants).
300 // TODO: I guess I can remove this line below and it will still work
301 // if ( (terminal_map.find(root.ret_type) == terminal_map.end())
302 // || (!has_solution_space(terminal_weights.at(root.ret_type).begin(),
303 // terminal_weights.at(root.ret_type).end())) )
304 // return std::nullopt;
305
306 auto Tree = tree<Node>();
307 auto spot = Tree.insert(Tree.begin(), root);
308
309 // we should notice the difference between size of a PROGRAM and a TREE.
310 // program count weights in its size, while the TREE structure dont. Wenever
311 // using size of a program/tree, make sure you use the function from the correct class
312 PTC2(Tree, spot, max_d, max_size, false);
313
314 return Tree;
315};
316
317tree<Node>& SearchSpace::PTC2(tree<Node>& Tree,
318 tree<Node>::iterator spot, int max_d, int max_size, bool start_from_decision_trees) const
319{
320 // PTC2 is agnostic of program type
321
322 // A comment about PTC2 method:
323 // PTC2 can work with depth or size restriction, but it does not strictly
324 // satisfies these conditions all time. Given a `max_size` and `max_depth`
325 // parameters, the real maximum size that can occur is `max_size` plus the
326 // highest operator arity, and the real maximum depth is `max_depth` plus one.
327
328 // Queue of nodes that need children
329 vector<tuple<TreeIter, DataType, int>> queue;
330
331 // node depth
332 int d = 1;
333 // current tree size
334 int s = 1;
335
336 Node root = spot.node->data;
337
338 // updating size accordingly to root node
340 s += 2;
341 }
342 else if ( root.get_is_weighted()==true
344 s += 2;
345
346 //For each argument position a of n, Enqueue(a; g)
347 for (auto a : root.arg_types)
348 {
349 auto child_spot = Tree.append_child(spot);
350 queue.push_back(make_tuple(child_spot, a, d));
351 }
352
353 int max_arity = 4;
354
355 Node n;
356
357 // Now we actually start the PTC2 procedure to create the program tree.
358 // we are considering that all nodes in the queue will be weighted, so we
359 // multiply the queue size by 3
360 while ( 3*queue.size() + s < max_size && queue.size() > 0)
361 {
362 // including the queue size in the max_size, since each element in queue
363 // can grow up exponentially
364
365 // by default, terminals are weighted (counts as 3 nodes in program size).
366 // since every spot in queue has potential to be a terminal, we multiply
367 // its size by 3. Subtracting one due to the fact that this loop will
368 // always insert a non terminal (which by default has weights off).
369 // this way, we can have PTC2 working properly.
370
371 auto [qspot, t, d] = RandomDequeue(queue);
372
373 if (d >= max_d || s >= max_size)
374 {
375 auto opt = sample_terminal(t);
376
377 // if it returned optional, then there's nothing to sample based on weights.
378 // We'll force sampling again with uniform probs
379 if (!opt)
380 opt = sample_terminal(t, true);
381
382 assert (opt && "PTC2: failed to sample terminal to fill in empty leaves due to max depth or size");
383
384 // If we successfully get a terminal, use it
385 n = opt.value();
386
387 Tree.replace(qspot, n);
388 }
389 else
390 {
391 //choose a nonterminal of matching type
392 std::optional<Node> opt;
393
394 if (start_from_decision_trees) { //
395 // this sample_op is likely to never fail. Only case scenario is
396 // a logistic root (arrayF arg type) with no arrayF arguments.
397 // (in this case, sample_terminal will also fail)
398 opt = sample_op(NodeType::SplitBest, t, true);
399 }
400 else {
401 opt = sample_op(t);
402 }
403
404 if (!opt) { // there is no operator for this node. sample a terminal instead
405 opt = sample_terminal(t);
406
407 // didnt work the easy way, lets try the hard way
408 if (!opt)
409 opt = sample_terminal(t, true);
410 }
411
412 if (!opt) { // no operator or terminal - search space is ill defined
413 auto msg = fmt::format(
414 "Failed to sample operator AND terminal of data type {} during PTC2.\n"
415 "start_from_decision_trees: {}\n",
416 DataTypeName[t],
417 start_from_decision_trees ? "true" : "false"
418 );
419
421
422 // queue.push_back(make_tuple(qspot, t, d));
423 // continue;
424 }
425
426 n = opt.value();
427
428 auto newspot = Tree.replace(qspot, n);
429
430 // For each arg of n, add to queue
431 for (auto a : n.arg_types)
432 {
433 auto child_spot = Tree.append_child(newspot);
434
435 queue.push_back(make_tuple(child_spot, a, d+1));
436 }
437 }
438
439 // increment is different based on node weights
440 ++s;
441
443 // cout << "sampled split on node/n";
444 s += 2;
445 }
446 else if ( n.get_is_weighted()==true
448 s += 2;
449 }
450
451 while (queue.size() > 0)
452 {
453 if (queue.size() == 0)
454 break;
455
456 auto [qspot, t, d] = RandomDequeue(queue);
457
458 auto opt = sample_terminal(t);
459
460 if (!opt)
461 opt = sample_terminal(t, true);
462
463 assert (opt && "PTC2: failed to sample terminal to fill remaining spots in queue");
464
465 n = opt.value();
466
467 auto newspot = Tree.replace(qspot, n);
468 }
469 return Tree;
470};
471
472// TODO: stop using params as a default argument and actually pass it (also update tests)
473RegressorProgram SearchSpace::make_regressor(int max_d, int max_size, const Parameters& params)
474{
475 return make_program<RegressorProgram>(params, max_d, max_size);
476};
477
478ClassifierProgram SearchSpace::make_classifier(int max_d, int max_size, const Parameters& params)
479{
480 return make_program<ClassifierProgram>(params, max_d, max_size);
481};
482
484 int max_d, int max_size, const Parameters& params)
485{
486 return make_program<MulticlassClassifierProgram>(params, max_d, max_size);
487};
488
489RepresenterProgram SearchSpace::make_representer(int max_d, int max_size, const Parameters& params)
490{
491 return make_program<RepresenterProgram>(params, max_d, max_size);
492};
493
494} //Brush
495
holds variable type data.
Definition data.h:51
bool classification
whether this is a classification problem
Definition data.h:85
std::map< string, State > features
dataset features, as key value pairs
Definition data.h:76
std::vector< DataType > unique_data_types
keeps track of the unique data types in the dataset.
Definition data.h:64
ArrayXf y
length N array, the target label
Definition data.h:82
#define HANDLE_ERROR_THROW(err)
Definition error.h:27
float slope(const ArrayXf &x, const ArrayXf &y)
slope of x/y
Definition utils.cpp:359
< nsga2 selection operator for getting the front
Definition bandit.cpp:4
NodeType
Definition nodetype.h:31
Program< PT::Representer > RepresenterProgram
Definition types.h:81
auto Isnt(DataType dt) -> bool
Definition node.h:48
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition types.h:39
Program< PT::BinaryClassifier > ClassifierProgram
Definition types.h:79
DataType
data types.
Definition types.h:143
auto Is(NodeType nt) -> bool
Definition node.h:313
std::unordered_map< std::size_t, std::string > ArgsName
vector< Node > generate_terminals(const Dataset &d, const bool weights_init)
generate terminals from the dataset features and random constants.
T RandomDequeue(std::vector< T > &Q)
queue for make program
map< DataType, string > DataTypeName
Definition data.cpp:14
std::map< NodeType, std::string > NodeTypeName
Definition nodetype.cpp:81
Eigen::Array< int, Eigen::Dynamic, 1 > ArrayXi
Definition types.h:40
Program< PT::Regressor > RegressorProgram
Definition types.h:78
float calc_initial_weight(const ArrayXf &value, const ArrayXf &y)
Program< PT::MulticlassClassifier > MulticlassClassifierProgram
Definition types.h:80
static constexpr size_t Count
Definition nodetype.h:121
class holding the data for a node in a tree.
Definition node.h:89
std::vector< DataType > arg_types
argument data types
Definition node.h:99
NodeType node_type
the node type
Definition node.h:94
DataType ret_type
return data type
Definition node.h:97
bool get_is_weighted() const
Definition node.h:291
void set_prob_change(float w)
Definition node.h:277
void print() const
prints the search space map.
Map< Node > node_map
Maps return types to argument types to node types.
unordered_map< DataType, vector< Node > > terminal_map
Maps return types to terminals.
void init(const Dataset &d, const unordered_map< string, float > &user_ops={}, bool weights_init=true)
Called by the constructor to initialize the search space.
RegressorProgram make_regressor(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random regressor program. Convenience wrapper for make_program.
unordered_map< DataType, vector< float > > terminal_weights
A map of weights corresponding to elements in terminal_map, used to weight probabilities of each term...
vector< float > get_weights() const
get weights of the return types
vector< string > op_names
A vector storing the available operator names (used by bandits).
tree< Node > & PTC2(tree< Node > &Tree, tree< Node >::iterator root, int max_d, int max_size, bool start_from_decision_trees) const
void GenerateNodeMap(const unordered_map< string, float > &user_ops, const vector< DataType > &unique_data_types, std::index_sequence< Is... >)
std::optional< Node > sample_op(DataType ret) const
get an operator matching return type ret.
RepresenterProgram make_representer(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random representer program. Convenience wrapper for make_program.
Map< float > node_map_weights
A map of weights corresponding to elements in node_map, used to weight probabilities of each node bei...
MulticlassClassifierProgram make_multiclass_classifier(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random multiclass classifier program. Convenience wrapper for make_program.
std::optional< tree< Node > > sample_subtree(Node root, int max_d, int max_size) const
create a subtree with maximum size and depth restrictions and root of type root_type
PT make_program(const Parameters &params, int max_d=0, int max_size=0)
Makes a random program.
vector< DataType > terminal_types
A vector storing the available return types of terminals.
bool has_solution_space(Iter start, Iter end) const
Takes iterators to weight vectors and checks if they have a non-empty solution space....
ClassifierProgram make_classifier(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random classifier program. Convenience wrapper for make_program.
std::optional< Node > sample_terminal(bool force_return=false) const
Get a random terminal.
normalizes a matrix to unit variance, 0 mean centered.
Definition utils.h:313
void fit_normalize(MatrixXf &X, const vector< char > &dtypes)
Definition utils.cpp:137