Brush C++ API
A flexible interpretable machine learning framework
search_space.cpp
Go to the documentation of this file.
1#include "search_space.h"
2#include "../program/program.h" // TODO: don't import this header here
3
4namespace Brush{
5
6
7float calc_initial_weight(const ArrayXf& value, const ArrayXf& y)
8{
9 // NOTE: only for terminals!
10
11 // weights are initialized as the slope of the z-score of x and y.
12
13 // If y has a different length from X, we get a core dump in this function.
14 // That is why Dataset makes a check for this.
15 // TODO: need to make SS (or Dataset) check for this when loading the data
16
17 vector<char> dtypes = {'f', 'f'};
18
19 MatrixXf data(value.size(), 2);
20
21 data.col(0) << value;
22 data.col(1) << y;
23
24 Normalizer n(true);
25 n.fit_normalize(data, dtypes); // normalize works row-wise
26
27 // In the slope function, argument order matters (if the data is not z-score normalized).
28 // The feature should be the first argument, and the true value the second
29 // (it will divide covar(arg1, arg2) by var(arg2)).
30 // Since the z-score normalizes to mean=0 and std=1, the order doesn't matter here.
31 float prob_change = std::abs(slope(data.col(0).array() , // x=variable
32 data.col(1).array() )); // y=target
33
34 // enforce a minimum feature weight if it was not set to zero
35 if (std::abs(prob_change)<1e-5)
36 prob_change = 1e-5;
37
38 // prob_change will evaluate to nan if variance(x)==0. Features with
39 // zero variance should not be used (as they behave just like a constant).
40 if (std::isnan(prob_change))
41 prob_change = 0.0;
42
43 return prob_change;
44}
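For reference, because both columns are z-scored before slope() is called, the weight computed above reduces to the absolute Pearson correlation between the feature and the target. A minimal standalone sketch of that equivalence (illustrative only, not part of this file; the function name is hypothetical):

#include <Eigen/Dense>
#include <algorithm>
#include <cmath>

// Rough equivalent of calc_initial_weight for two z-scored columns:
// slope(x, y) = cov(x, y) / var(y) = corr(x, y) once both are standardized.
float initial_weight_sketch(const Eigen::ArrayXf& x, const Eigen::ArrayXf& y)
{
    Eigen::ArrayXf xc = x - x.mean();
    Eigen::ArrayXf yc = y - y.mean();
    float corr = (xc * yc).sum()
               / std::sqrt((xc * xc).sum() * (yc * yc).sum());
    if (std::isnan(corr))   // zero-variance feature: behaves like a constant
        return 0.0f;
    return std::max(std::abs(corr), 1e-5f); // same floor used above
}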
45
46
51vector<Node> generate_terminals(const Dataset& d, const bool weights_init)
52{
53 vector<Node> terminals;
54 int i = 0;
55 for ( const auto &[feature_name, value]: d.features )
56 {
57 std::visit(
58 [&](auto&& arg) {
59 using T = std::decay_t<decltype(arg)>;
60 using Scalar = typename T::Scalar;
61 constexpr bool weighted = !std::is_same_v<Scalar, bool>;
62 // if constexpr (T::Scalar == float)
63 auto n = Node(
64 NodeType::Terminal,
65 Signature<T()>{},
66 weighted,
67 feature_name
68 );
69
70 float prob_change = 1.0; // default value
71
72 if (d.y.size()>0 && weights_init)
73 {
74 // if the value can be cast to a float array, we can calculate the slope
75 if (std::holds_alternative<ArrayXb>(value))
76 {
77 ArrayXf tmp = std::get<ArrayXb>(value).template cast<float>();
78 prob_change = calc_initial_weight(tmp, d.y);
79 }
80 else if (std::holds_alternative<ArrayXi>(value))
81 {
82 // for each unique value (category) we create a one-vs-all binary variable, then
83 // calculate its slope. The final value will be the average of the slopes
84
85 auto tmp = std::get<ArrayXi>(value);
86
87 //get number of unique values
88 std::map<float, bool> uniqueMap;
89 for(int i = 0; i < tmp.size(); i++)
90 uniqueMap[(float)tmp(i)] = true;
91
92 ArrayXf slopes = ArrayXf::Ones(uniqueMap.size());
93 int slopesIterator = 0;
94 for (const auto& pair : uniqueMap)
95 {
96 auto one_vs_all = ArrayXf::Ones(tmp.size()).array() * (tmp.array()==pair.first).cast<float>();
97
98 slopes[slopesIterator++] = calc_initial_weight(one_vs_all, d.y);
99 }
100
101 prob_change = slopes.mean();
102 }
103 else if (std::holds_alternative<ArrayXf>(value))
104 {
105 prob_change = calc_initial_weight(std::get<ArrayXf>(value), d.y);
106 }
107 else
108 {
109 auto msg = fmt::format("Brush couldn't calculate the initial weight of variable {}\n", feature_name);
110 HANDLE_ERROR_THROW(msg);
111 }
112 }
113
114 n.set_prob_change( prob_change );
115
116 terminals.push_back(n);
117 },
118 value
119 );
120 ++i;
121 };
122
123 // iterate through terminals and take the average prob_change of those with the same return type
124 auto signature_avg = [terminals, weights_init](DataType ret_type){
125 if (!weights_init)
126 return 1.0f;
127
128 float sum = 0.0;
129 int count = 0;
130
131 for (const auto& n : terminals) {
132 if (n.ret_type == ret_type && n.get_prob_change()>0) {
133 sum += n.get_prob_change();
134 count++;
135 }
136 }
137
138 if (count==0) // no terminals of datatype. return 1.0 for the constant
139 return 1.0f;
140
141 return sum / count;
142 };
143
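 // (illustrative example, not in the original file: if the float-returning terminals
 //  above ended up with prob_change weights 0.2 and 0.4, signature_avg(cXf.ret_type)
 //  below would assign the float constant a prob_change of 0.3)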
144 // constants for each type -- floats, integers, boolean. This is useful
145 // to ensure the search space will always have something to fill up open spaces
146 // when building/modifying trees
147 auto cXf = Node(NodeType::Constant, Signature<ArrayXf()>(), true, "constF");
148 cXf.set_prob_change(signature_avg(cXf.ret_type));
149 terminals.push_back(cXf);
150
151 // Reminder: Integers are used to represent categorical variables
152 auto cXi = Node(NodeType::Constant, Signature<ArrayXi()>(), true, "constI");
153 cXi.set_prob_change(signature_avg(cXi.ret_type));
154 terminals.push_back(cXi);
155
156 // Constants for booleans are rarely useful. Our special terminal `MeanLabel` provides a better way of dealing with that
157 // auto cXb = Node(NodeType::Constant, Signature<ArrayXb()>(), false, "constB");
158 // cXb.set_prob_change(signature_avg(cXb.ret_type));
159 // terminals.push_back(cXb);
160
161 // mean label node. It does not need to be part of the symbols; it works only for classification
162 if (d.classification)
163 {
164 auto meanlabel = Node(NodeType::MeanLabel, Signature<ArrayXb()>(), true, "MeanLabel");
165 meanlabel.set_prob_change(1.0f); // always present in classification problems
166 terminals.push_back(meanlabel);
167 }
168
169 return terminals;
170};
171
172std::unordered_map<std::size_t, std::string> ArgsName;
173
174void SearchSpace::print() const {
175 std::cout << fmt::format("{}\n", *this) << std::flush;
176}
177
178void SearchSpace::init(const Dataset& d, const unordered_map<string,float>& user_ops,
179 bool weights_init)
180{
181 // fmt::print("constructing search space...\n");
182 this->node_map.clear();
183 this->node_map_weights.clear();
184 this->terminal_map.clear();
185 this->terminal_types.clear();
186 this->terminal_weights.clear();
187 this->op_names.clear();
188
189 bool use_all = user_ops.size() == 0;
190 if (use_all)
191 {
192 for (size_t op_index=0; op_index< NodeTypes::Count; op_index++){
193 op_names.push_back(
194 NodeTypeName.at(static_cast<NodeType>(1UL << op_index))
195 );
196 }
197 }
198 else
199 {
200 for (const auto& [op, weight] : user_ops)
201 op_names.push_back(op);
202 }
203
204 // create nodes based on data types
205 terminal_types = d.unique_data_types;
206
207 vector<Node> terminals = generate_terminals(d, weights_init);
208
209 // If it is a classification problem, we need to add the fixed root nodes
210 // (logistic for binary classification, softmax for multiclass classification).
211 // Sometimes, the user may not specify these two nodes as candidates when
212 // sampling functions, so we check if they are already in the operator set, and
213 // we add them with zero prob if they are not. They need to be in the func set
214 // when calling GenerateNodeMap, so the search_space will contain all the hashes
215 // and signatures for them (and they can be used only in program root).
216 // TODO: fix softmax and add it here
217
218 /* fmt::print("generate nodetype\n"); */
219
220 if (d.classification)
221 {
222 std::unordered_map<std::string, float> extended_user_ops=user_ops;
223
224 if (!use_all)
225 {
226 // We need some ops in the search space so we can have the logit and offset
227 if (extended_user_ops.find("OffsetSum") == extended_user_ops.end()){
228 extended_user_ops.insert({"OffsetSum", 0.0f});
229 op_names.push_back("OffsetSum");
230 }
231
232 // Convert ArrayXf to std::vector<float> for compatibility with std::set
233 std::vector<float> vec(d.y.data(), d.y.data() + d.y.size());
234 std::set<float> unique_classes(vec.begin(), vec.end());
235
236 if (unique_classes.size()==2 && (extended_user_ops.find("Logistic") == extended_user_ops.end())) {
237 extended_user_ops.insert({"Logistic", 0.0f});
238 op_names.push_back("Logistic");
239 }
240 else if (extended_user_ops.find("Softmax") == extended_user_ops.end()) {
241 extended_user_ops.insert({"Softmax", 0.0f});
242 op_names.push_back("Softmax");
243 }
244 }
245
246 // fmt::print("generate nodetype\n");
247 GenerateNodeMap(extended_user_ops, terminal_types,
248 std::make_index_sequence<NodeTypes::OpCount>());
249 }
250 else
251 {
252 GenerateNodeMap(user_ops, terminal_types,
253 std::make_index_sequence<NodeTypes::OpCount>());
254 }
255
256 // map terminals
257 /* fmt::print("looping through terminals...\n"); */
258 for (const auto& term : terminals)
259 {
260 /* fmt::print("adding {} to search space...\n", term.get_name()); */
261 if (terminal_map.find(term.ret_type) == terminal_map.end())
262 terminal_map[term.ret_type] = vector<Node>();
263
264 /* fmt::print("terminal ret_type: {}\n", DataTypeName[term.ret_type]); */
265 terminal_map[term.ret_type].push_back(term);
266 terminal_weights[term.ret_type].push_back(term.get_prob_change());
267 }
268};
269
270std::optional<tree<Node>> SearchSpace::sample_subtree(Node root, int max_d, int max_size) const
271{
272 // public interface to use PTC2 algorithm
273
274 // PTC2 is designed not to fail (it will persistently try to find nodes with
275 // sampling functions). In population initialization, this shouldn't be a problem, but
276 // during evolution, due to dynamic changes in node weights by the learners,
277 // it may now fail. We need to check, before calling it, that the search space
278 // has elements to sample
279 auto ret_match = node_map.at(root.ret_type);
280
281 vector<float> args_w = get_weights(root.ret_type);
282
283 // at least one operator that matches the return type must have a positive probability
284 if (!has_solution_space(args_w.begin(), args_w.end()))
285 return std::nullopt;
286
287 // it will always have a terminal (because we create constants).
288 // TODO: I guess I can remove this line below and it will still work
289 // if ( (terminal_map.find(root.ret_type) == terminal_map.end())
290 // || (!has_solution_space(terminal_weights.at(root.ret_type).begin(),
291 // terminal_weights.at(root.ret_type).end())) )
292 // return std::nullopt;
293
294 auto Tree = tree<Node>();
295 auto spot = Tree.insert(Tree.begin(), root);
296
297 // note the difference between the size of a PROGRAM and of a TREE:
298 // a program counts weights in its size, while the tree structure doesn't. Whenever
299 // using the size of a program/tree, make sure you use the function from the correct class
300 PTC2(Tree, spot, max_d, max_size);
301
302 return Tree;
303};
304
305tree<Node>& SearchSpace::PTC2(tree<Node>& Tree,
306 tree<Node>::iterator spot, int max_d, int max_size) const
307{
308 // PTC2 is agnostic of program type
309
310 // A comment about PTC2 method:
311 // PTC2 can work with a depth or size restriction, but it does not strictly
312 // satisfy these conditions at all times. Given the `max_size` and `max_depth`
313 // parameters, the real maximum size that can occur is `max_size` plus the
314 // highest operator arity, and the real maximum depth is `max_depth` plus one.
315
316 // Queue of nodes that need children
317 vector<tuple<TreeIter, DataType, int>> queue;
318
319 // node depth
320 int d = 1;
321 // current tree size
322 int s = 1;
323
324 Node root = spot.node->data;
325
326 // updating size accordingly to root node
327 if ( Is<NodeType::SplitBest>(root.node_type) ){
328 s += 2;
329 }
330 else if ( root.get_is_weighted()==true
331      &&   Isnt<DataType::ArrayB>(root.ret_type) )
332 s += 2;
333
334 //For each argument position a of n, Enqueue(a; g)
335 for (auto a : root.arg_types)
336 {
337 auto child_spot = Tree.append_child(spot);
338 queue.push_back(make_tuple(child_spot, a, d));
339 }
340
341 int max_arity = 4;
342
343 Node n;
344
345 // Now we actually start the PTC2 procedure to create the program tree.
346 // we are considering that all nodes in the queue will be weighted, so we
347 // multiply the queue size by 3
348 while ( 3*queue.size() + s < max_size && queue.size() > 0)
349 {
350 // including the queue size in the max_size check, since each element in the queue
351 // can still grow into a subtree
352
353 // by default, terminals are weighted (a weighted terminal counts as 3 nodes in program size).
354 // since every spot in the queue has the potential to be a terminal, we multiply
355 // its size by 3. We subtract one because this loop will
356 // always insert a non-terminal (which by default has weights off).
357 // this way, we can have PTC2 working properly.
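 // (illustrative arithmetic, not in the original file: with max_size=20, current
 //  size s=10 and 3 spots still in the queue, 3*3 + 10 = 19 < 20, so one more
 //  operator may still be sampled before the loop stops)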
358
359 auto [qspot, t, d] = RandomDequeue(queue);
360
361 if (d >= max_d || s >= max_size)
362 {
363 auto opt = sample_terminal(t);
364
365 // if it returned an empty optional, then there's nothing to sample based on weights.
366 // We'll force sampling again with uniform probabilities
367 if (!opt)
368 opt = sample_terminal(t, true);
369
370 assert (opt && "PTC2: failed to sample terminal to fill in empty leaves due to max depth or size");
371
372 // If we successfully get a terminal, use it
373 n = opt.value();
374
375 Tree.replace(qspot, n);
376 }
377 else
378 {
379 //choose a nonterminal of matching type
380 auto opt = sample_op(t);
381
382 if (!opt) { // there is no operator for this node. sample a terminal instead
383 opt = sample_terminal(t);
384
385 // didn't work the easy way, let's try the hard way
386 if (!opt)
387 opt = sample_terminal(t, true);
388 }
389
390 if (!opt) { // neither an operator nor a terminal was found. this should not happen.
391 auto msg = fmt::format("Failed to sample operator AND terminal of data type {} during PTC2.\n", DataTypeName[t]);
392 HANDLE_ERROR_THROW(msg);
393
394 // queue.push_back(make_tuple(qspot, t, d));
395 // continue;
396 }
397
398 n = opt.value();
399
400 auto newspot = Tree.replace(qspot, n);
401
402 // For each arg of n, add to queue
403 for (auto a : n.arg_types)
404 {
405 auto child_spot = Tree.append_child(newspot);
406
407 queue.push_back(make_tuple(child_spot, a, d+1));
408 }
409 }
410
411 // increment is different based on node weights
412 ++s;
412 ++s;
413
414 if ( Is<NodeType::SplitBest>(n.node_type) ){
415 // cout << "sampled split on node\n";
416 s += 2;
417 }
418 else if ( n.get_is_weighted()==true
419      &&   Isnt<DataType::ArrayB>(n.ret_type) )
420 s += 2;
421 }
422
423 while (queue.size() > 0)
424 {
425 if (queue.size() == 0)
426 break;
427
428 auto [qspot, t, d] = RandomDequeue(queue);
429
430 auto opt = sample_terminal(t);
431
432 if (!opt)
433 opt = sample_terminal(t, true);
434
435 assert (opt && "PTC2: failed to sample terminal to fill remaining spots in queue");
436
437 n = opt.value();
438
439 auto newspot = Tree.replace(qspot, n);
440 }
441 return Tree;
442};
443
444// TODO: stop using params as a default argument and actually pass it (also update tests)
445RegressorProgram SearchSpace::make_regressor(int max_d, int max_size, const Parameters& params)
446{
447 return make_program<RegressorProgram>(params, max_d, max_size);
448};
449
450ClassifierProgram SearchSpace::make_classifier(int max_d, int max_size, const Parameters& params)
451{
452 return make_program<ClassifierProgram>(params, max_d, max_size);
453};
454
455 MulticlassClassifierProgram SearchSpace::make_multiclass_classifier(
456 int max_d, int max_size, const Parameters& params)
457{
458 return make_program<MulticlassClassifierProgram>(params, max_d, max_size);
459};
460
461RepresenterProgram SearchSpace::make_representer(int max_d, int max_size, const Parameters& params)
462{
463 return make_program<RepresenterProgram>(params, max_d, max_size);
464};
465
466} //Brush
467
holds variable type data.
Definition data.h:51
bool classification
whether this is a classification problem
Definition data.h:83
std::map< string, State > features
dataset features, as key value pairs
Definition data.h:74
std::vector< DataType > unique_data_types
keeps track of the unique data types in the dataset.
Definition data.h:62
ArrayXf y
length N array, the target label
Definition data.h:80
#define HANDLE_ERROR_THROW(err)
Definition error.h:27
float slope(const ArrayXf &x, const ArrayXf &y)
slope of x/y
Definition utils.cpp:359
nsga2 selection operator for getting the front
Definition bandit.cpp:4
NodeType
Definition nodetype.h:31
Program< PT::Representer > RepresenterProgram
Definition types.h:81
auto Isnt(DataType dt) -> bool
Definition node.h:43
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition types.h:39
Program< PT::BinaryClassifier > ClassifierProgram
Definition types.h:79
DataType
data types.
Definition types.h:143
auto Is(NodeType nt) -> bool
Definition node.h:291
std::unordered_map< std::size_t, std::string > ArgsName
vector< Node > generate_terminals(const Dataset &d, const bool weights_init)
generate terminals from the dataset features and random constants.
T RandomDequeue(std::vector< T > &Q)
queue for make program
map< DataType, string > DataTypeName
Definition data.cpp:14
std::map< NodeType, std::string > NodeTypeName
Definition nodetype.cpp:81
Eigen::Array< int, Eigen::Dynamic, 1 > ArrayXi
Definition types.h:40
Program< PT::Regressor > RegressorProgram
Definition types.h:78
float calc_initial_weight(const ArrayXf &value, const ArrayXf &y)
Program< PT::MulticlassClassifier > MulticlassClassifierProgram
Definition types.h:80
static constexpr size_t Count
Definition nodetype.h:121
class holding the data for a node in a tree.
Definition node.h:84
std::vector< DataType > arg_types
argument data types
Definition node.h:94
NodeType node_type
the node type
Definition node.h:89
DataType ret_type
return data type
Definition node.h:92
bool get_is_weighted() const
Definition node.h:269
void set_prob_change(float w)
Definition node.h:255
void print() const
prints the search space map.
Map< Node > node_map
Maps return types to argument types to node types.
unordered_map< DataType, vector< Node > > terminal_map
Maps return types to terminals.
void init(const Dataset &d, const unordered_map< string, float > &user_ops={}, bool weights_init=true)
Called by the constructor to initialize the search space.
RegressorProgram make_regressor(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random regressor program. Convenience wrapper for make_program.
unordered_map< DataType, vector< float > > terminal_weights
A map of weights corresponding to elements in terminal_map, used to weight probabilities of each term...
vector< float > get_weights() const
get weights of the return types
vector< string > op_names
A vector storing the available operator names (used by bandits).
void GenerateNodeMap(const unordered_map< string, float > &user_ops, const vector< DataType > &unique_data_types, std::index_sequence< Is... >)
tree< Node > & PTC2(tree< Node > &Tree, tree< Node >::iterator root, int max_d, int max_size) const
std::optional< Node > sample_op(DataType ret) const
get an operator matching return type ret.
RepresenterProgram make_representer(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random representer program. Convenience wrapper for make_program.
Map< float > node_map_weights
A map of weights corresponding to elements in node_map, used to weight probabilities of each node bei...
MulticlassClassifierProgram make_multiclass_classifier(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random multiclass classifier program. Convenience wrapper for make_program.
std::optional< tree< Node > > sample_subtree(Node root, int max_d, int max_size) const
create a subtree with maximum size and depth restrictions and root of type root_type
PT make_program(const Parameters &params, int max_d=0, int max_size=0)
Makes a random program.
vector< DataType > terminal_types
A vector storing the available return types of terminals.
bool has_solution_space(Iter start, Iter end) const
Takes iterators to weight vectors and checks if they have a non-empty solution space....
ClassifierProgram make_classifier(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random classifier program. Convenience wrapper for make_program.
std::optional< Node > sample_terminal(bool force_return=false) const
Get a random terminal.
normalizes a matrix to unit variance, 0 mean centered.
Definition utils.h:313
void fit_normalize(MatrixXf &X, const vector< char > &dtypes)
Definition utils.cpp:137
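Usage sketch (not part of search_space.cpp): how a SearchSpace initialized by this file is typically driven, based only on the signatures listed above. It assumes a default-constructible SearchSpace, a Dataset `d` loaded elsewhere, and that the operator names "Add", "Mul" and "Sin" exist in NodeTypeName; the weights and size limits are illustrative.

#include "search_space.h"

using namespace Brush;

RegressorProgram build_random_regressor(const Dataset& d)
{
    SearchSpace ss;
    // weights_init=true derives terminal weights from the data, as in init() above
    ss.init(d, {{"Add", 1.0f}, {"Mul", 1.0f}, {"Sin", 0.5f}}, /*weights_init=*/true);

    // convenience wrapper around make_program<RegressorProgram>
    return ss.make_regressor(/*max_d=*/5, /*max_size=*/20, Parameters());
}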