Brush C++ API
A flexible interpretable machine learning framework
Loading...
Searching...
No Matches
search_space.cpp
Go to the documentation of this file.
1#include "search_space.h"
2#include "../program/program.h" // TODO: dont import this header here
3
4namespace Brush{
5
6
7float calc_initial_weight(const ArrayXf& value, const ArrayXf& y)
8{
9 // OBS: only for terminals!
10
11 // weights are initialized as the slope of the z-score of x and y.
12
13 // If y has different length from X, we get a core dump in this function.
14 // That is why Dataset makes a check for this
15 // TODO: need to make SS (or Datasaet) check for this when loading the data
16
17 vector<char> dtypes = {'f', 'f'};
18
19 MatrixXf data(value.size(), 2);
20
21 data.col(0) << value;
22 data.col(1) << y;
23
24 Normalizer n(true);
25 n.fit_normalize(data, dtypes); // normalize works row-wise
26
27 // In slope function, argument order matters (if not normalized with z-score)
28 // The feature should be the first value, and the true value the second
29 // (it will divide covar(arg1, arg2) by var(arg2)).
30 // Since z-score normalizes so mean=0 and std=1, then order doesnt matter
31 float prob_change = std::abs(slope(data.col(0).array() , // x=variable
32 data.col(1).array() )); // y=target
33
34 // having a minimum feature weight if it was not set to zero
35 if (std::abs(prob_change)<1e-2)
36 prob_change = 1e-2;
37
38 // prob_change will evaluate to nan if variance(x)==0. Features with
39 // zero variance should not be used (as they behave just like a constant).
40 if (std::isnan(prob_change))
41 prob_change = 0.0;
42
43 return prob_change;
44}
45
46
// Builds one terminal Node per dataset feature, plus one Constant node per
// data type and (for classification) a MeanLabel node. When weights_init is
// true and a target is present, each terminal's prob_change is seeded from
// calc_initial_weight (slope of z-scored feature vs. target).
// NOTE(review): this listing came from a Doxygen dump and two interior source
// lines were dropped by extraction (original lines 64 and 110) — see the
// markers below. Confirm against the repository before editing.
51vector<Node> generate_terminals(const Dataset& d, const bool weights_init)
52{
53 vector<Node> terminals;
// NOTE(review): `i` is incremented at the end of the loop but never read —
// dead counter candidate for removal (it is also shadowed below at line 89).
54 int i = 0;
55 for ( const auto &[feature_name, value]: d.features )
56 {
// Dispatch on the concrete array type held by the feature variant.
57 std::visit(
58 [&](auto&& arg) {
59 using T = std::decay_t<decltype(arg)>;
60 using Scalar = typename T::Scalar;
// Only float-valued terminals get a learnable weight attached.
61 constexpr bool weighted = std::is_same_v<Scalar, float>;
62 // if constexpr (T::Scalar == float)
63 auto n = Node(
// NOTE(review): original line 64 is missing from this extraction — it is
// presumably the NodeType argument (likely `NodeType::Terminal,`); confirm.
65 Signature<T()>{},
66 weighted,
67 feature_name
68 );
69
70 float prob_change = 1.0; // default value
71
// Only compute data-driven weights when a target exists and it was requested.
72 if (d.y.size()>0 && weights_init)
73 {
74 // if the value can be casted to float array, we can calculate slope
75 if (std::holds_alternative<ArrayXb>(value))
76 {
77 ArrayXf tmp = std::get<ArrayXb>(value).template cast<float>();
78 prob_change = calc_initial_weight(tmp, d.y);
79 }
80 else if (std::holds_alternative<ArrayXi>(value))
81 {
82 // for each variable we create a one-vs-all binary variable, then
83 // calculate slope. Final value will be the average of slopes
84
85 auto tmp = std::get<ArrayXi>(value);
86
87 //get number of unique values
// NOTE(review): a std::set<float> would express "unique values" more
// directly than a map-to-bool; also this inner `i` shadows the outer one.
88 std::map<float, bool> uniqueMap;
89 for(int i = 0; i < tmp.size(); i++)
90 uniqueMap[(float)tmp(i)] = true;
91
92 ArrayXf slopes = ArrayXf::Ones(uniqueMap.size());
93 int slopesIterator = 0;
94 for (const auto& pair : uniqueMap)
95 {
// One-vs-all indicator column for the current category value.
96 auto one_vs_all = ArrayXf::Ones(tmp.size()).array() * (tmp.array()==pair.first).cast<float>();
97
98 slopes[slopesIterator++] = calc_initial_weight(one_vs_all, d.y);
99 }
100
// Categorical weight = mean of the per-category one-vs-all slopes.
101 prob_change = slopes.mean();
102 }
103 else if (std::holds_alternative<ArrayXf>(value))
104 {
105 prob_change = calc_initial_weight(std::get<ArrayXf>(value), d.y);
106 }
107 else
108 {
// NOTE(review): typo "coudn't" lives in a runtime string, so it is left
// untouched here. Original line 110 is missing from this extraction —
// presumably `HANDLE_ERROR_THROW(msg);` (see error.h); as shown, `msg` is
// built and silently discarded. Confirm against the repository.
109 auto msg = fmt::format("Brush coudn't calculate the initial weight of variable {}\n",feature_name);
111 }
112 }
113
114 n.set_prob_change( prob_change );
115
116 terminals.push_back(n);
117 },
118 value
119 );
120 ++i;
// NOTE(review): stray `;` after the for-loop body — harmless empty statement.
121 };
122
123 // iterate through terminals and take the average of values of same signature
// Helper: average prob_change over terminals with the given return type;
// falls back to 1.0 when weights are disabled or no terminal matches.
124 auto signature_avg = [terminals, weights_init](DataType ret_type){
125 if (!weights_init)
126 return 1.0f;
127
128 float sum = 0.0;
129 int count = 0;
130
// Only terminals with a positive weight contribute to the average.
131 for (const auto& n : terminals) {
132 if (n.ret_type == ret_type && n.get_prob_change()>0) {
133 sum += n.get_prob_change();
134 count++;
135 }
136 }
137
138 if (count==0) // no terminals of datatype. return 1.0 for the constant
139 return 1.0f;
140
141 return sum / count;
142 };
143
144 // constants for each type -- floats, integers, boolean. This is useful
145 // to ensure the search space will always have something to fill up open spaces
146 // when building/modifying trees
147 auto cXf = Node(NodeType::Constant, Signature<ArrayXf()>{}, true, "Constant");
148 cXf.set_prob_change(signature_avg(cXf.ret_type));
149 terminals.push_back(cXf);
150
151 // Reminder: Integers are used to represent categorical variables
152 auto cXi = Node(NodeType::Constant, Signature<ArrayXi()>{}, true, "Constant");
153 cXi.set_prob_change(signature_avg(cXi.ret_type));
154 terminals.push_back(cXi);
155
// Boolean constant is unweighted (weighted=false), unlike the two above.
156 auto cXb = Node(NodeType::Constant, Signature<ArrayXb()>{}, false, "Constant");
157 cXb.set_prob_change(signature_avg(cXb.ret_type));
158 terminals.push_back(cXb);
159
160 // mean label node. Does not need to be part of symbols. works only for classification
161 if (d.classification)
162 {
163 auto meanlabel = Node(NodeType::MeanLabel, Signature<ArrayXb()>{}, true, "MeanLabel");
164 meanlabel.set_prob_change(1.0f); // always present in classification problems
165 terminals.push_back(meanlabel);
166 }
167
168 return terminals;
169};
170
171std::unordered_map<std::size_t, std::string> ArgsName;
172
173void SearchSpace::print() const {
174 std::cout << fmt::format("{}\n", *this) << std::flush;
175}
176
// Rebuilds the search space from a dataset: clears all maps, collects the
// operator names (all node types, or only the user-supplied ones), generates
// terminals, adds the fixed classification root ops (OffsetSum + Logistic or
// Softmax) when needed, and fills terminal_map / terminal_weights.
// NOTE(review): Doxygen extraction dropped original lines 202 and 216; line
// 216 was almost certainly `GenerateNodeMap(user_ops, d.unique_data_types,`
// (the head of the call whose continuation survives below, mirroring the
// intact call at lines 245-246). Confirm against the repository.
177void SearchSpace::init(const Dataset& d, const unordered_map<string,float>& user_ops,
178 bool weights_init)
179{
180 // fmt::print("constructing search space...\n");
// Reset all state so init() can be called more than once on the same object.
181 this->node_map.clear();
182 this->node_map_weights.clear();
183 this->terminal_map.clear();
184 this->terminal_types.clear();
185 this->terminal_weights.clear();
186 this->op_names.clear();
187
// Empty user_ops means "use every node type"; NodeType is a bit-flag enum,
// hence the 1UL << op_index walk over all flags.
188 bool use_all = user_ops.size() == 0;
189 if (use_all)
190 {
191 for (size_t op_index=0; op_index< NodeTypes::Count; op_index++){
192 op_names.push_back(
193 NodeTypeName.at(static_cast<NodeType>(1UL << op_index))
194 );
195 }
196 }
197
198 for (const auto& [op, weight] : user_ops)
199 op_names.push_back(op);
200
201 // create nodes based on data types
203
204 vector<Node> terminals = generate_terminals(d, weights_init);
205
206 // If it is a classification problem, we need to add the fixed root nodes
207 // (logistic for binary classification, softmax for multiclassification).
208 // Sometimes, the user may not specify these two nodes as candidates when
209 // sampling functions, so we check if they are already in the terminal set, and
210 // we add them with zero prob if they are not. They need to be in the func set
211 // when calling GenerateNodeMap, so the search_space will contain all the hashes
212 // and signatures for them (and they can be used only in program root).
213 // TODO: fix softmax and add it here
214
215 /* fmt::print("generate nodetype\n"); */
// NOTE(review): the line opening this call is missing from the extraction —
// see the header note; only the index-sequence argument survives here.
217 std::make_index_sequence<NodeTypes::OpCount>());
218
219 if (d.classification)
220 {
221 std::unordered_map<std::string, float> extended_user_ops;
222
223 // Convert ArrayXf to std::vector<float> for compatibility with std::set
224 std::vector<float> vec(d.y.data(), d.y.data() + d.y.size());
225
// Distinct label values decide binary (Logistic) vs. multiclass (Softmax).
226 std::set<float> unique_classes(vec.begin(), vec.end());
227
228 // We need some ops in the search space so we can have the logit and offset
229 if (user_ops.find("OffsetSum") == user_ops.end())
230 extended_user_ops.insert({"OffsetSum", -1.0f});
// NOTE(review): this push_back sits OUTSIDE the `if` above (unlike the
// Logistic/Softmax branches below), so "OffsetSum" appears to be appended
// even when the user already supplied it — possible duplicate entry in
// op_names. Confirm intent; indentation suggests it belongs inside the if.
231 op_names.push_back("OffsetSum");
232
233 if (unique_classes.size()==2 && (user_ops.find("Logistic") == user_ops.end())) {
234 extended_user_ops.insert({"Logistic", -1.0f});
235 op_names.push_back("Logistic");
236 }
237 else if (user_ops.find("Softmax") == user_ops.end()) {
238 extended_user_ops.insert({"Softmax", -1.0f});
239 op_names.push_back("Softmax");
240 }
241
// Extend the node map with the forced root ops (weight -1.0f marks them as
// auto-added rather than user-requested — TODO confirm that convention).
242 if (extended_user_ops.size() > 0)
243 {
244 // fmt::print("generate nodetype\n");
245 GenerateNodeMap(extended_user_ops, d.unique_data_types,
246 std::make_index_sequence<NodeTypes::OpCount>());
247 }
248 }
249
250 // map terminals
251 /* fmt::print("looping through terminals...\n"); */
// Bucket terminals (and their sampling weights) by return type.
252 for (const auto& term : terminals)
253 {
254 /* fmt::print("adding {} to search space...\n", term.get_name()); */
255 if (terminal_map.find(term.ret_type) == terminal_map.end())
256 terminal_map[term.ret_type] = vector<Node>();
257
258 /* fmt::print("terminal ret_type: {}\n", DataTypeName[term.ret_type]); */
259 terminal_map[term.ret_type].push_back(term);
260 terminal_weights[term.ret_type].push_back(term.get_prob_change());
261 }
262};
263
264std::optional<tree<Node>> SearchSpace::sample_subtree(Node root, int max_d, int max_size) const
265{
266 // public interface to use PTC2 algorithm
267
268 // PTC is designed to not fail (it will persistently try to find nodes with
269 // sampling functions). In pop initialization, this shoudnt be a problem, but
270 // during evolution, due to dynamic changes in node weights by the learners,
271 // it now may fail. We need to check, before calling it, that it has elements
272 // in search space to sample
273 auto ret_match = node_map.at(root.ret_type);
274
275 vector<float> args_w = get_weights(root.ret_type);
276
277 // at least one operator that matches the weight must have positive probability
278 if (!has_solution_space(args_w.begin(), args_w.end()))
279 return std::nullopt;
280
281 // it will always have a terminal (because we create constants).
282 // TODO: I guess I can remove this line below and it will still work
283 // if ( (terminal_map.find(root.ret_type) == terminal_map.end())
284 // || (!has_solution_space(terminal_weights.at(root.ret_type).begin(),
285 // terminal_weights.at(root.ret_type).end())) )
286 // return std::nullopt;
287
288 auto Tree = tree<Node>();
289 auto spot = Tree.insert(Tree.begin(), root);
290
291 // we should notice the difference between size of a PROGRAM and a TREE.
292 // program count weights in its size, while the TREE structure dont. Wenever
293 // using size of a program/tree, make sure you use the function from the correct class
294 PTC2(Tree, spot, max_d, max_size);
295
296 return Tree;
297};
298
// Grows a tree in place with the PTC2 algorithm: keeps a queue of open child
// slots, fills them with sampled operators until depth/size limits are hit,
// then closes every remaining slot with a terminal. Returns the same tree.
// NOTE(review): this Doxygen extraction dropped five interior lines (original
// 321, 329, 385, 410, 418). 321/410 are the `if` conditions guarding the
// `s += 3` size bumps (presumably testing for a Split node type, given the
// matching `else if (Is<NodeType::SplitOn>...)`); 329/418 are the missing
// second half of the `root.get_is_weighted()` / `n.get_is_weighted()`
// conditions; 385 is presumably `HANDLE_ERROR_THROW(msg);` since `msg` is
// otherwise built and discarded. Confirm against the repository.
299tree<Node>& SearchSpace::PTC2(tree<Node>& Tree,
300 tree<Node>::iterator spot, int max_d, int max_size) const
301{
302 // PTC2 is agnostic of program type
303
304 // A comment about PTC2 method:
305 // PTC2 can work with depth or size restriction, but it does not strictly
306 // satisfies these conditions all time. Given a `max_size` and `max_depth`
307 // parameters, the real maximum size that can occur is `max_size` plus the
308 // highest operator arity, and the real maximum depth is `max_depth` plus one.
309
310 // Queue of nodes that need children
311 vector<tuple<TreeIter, DataType, int>> queue;
312
313 // node depth
314 int d = 1;
315 // current tree size
316 int s = 1;
317
318 Node root = spot.node->data;
319
320 // updating size accordingly to root node
// NOTE(review): the `if (...)` line guarding this `s += 3` was dropped by
// the extraction (original line 321) — see the header note.
322 s += 3;
323 else if (Is<NodeType::SplitOn>(root.node_type)){
324 s += 2;
325 // cout << "sampled split on node\n";
326 }
327
// NOTE(review): the continuation of this condition (original line 329,
// presumably an `&& Isnt<...>(root.node_type)` clause) is missing here.
328 if ( root.get_is_weighted()==true
330 s += 2;
331
332 //For each argument position a of n, Enqueue(a; g)
333 for (auto a : root.arg_types)
334 {
335 auto child_spot = Tree.append_child(spot);
336 queue.push_back(make_tuple(child_spot, a, d));
337 }
338
// NOTE(review): max_arity is never read in the visible code — dead variable.
339 int max_arity = 4;
340
341 Node n;
342 // Now we actually start the PTC2 procedure to create the program tree
// NOTE(review): queue.size() is size_t while s/max_size are int — this mixed
// signed/unsigned comparison is safe only while s stays non-negative.
343 while ( queue.size() + s < max_size && queue.size() > 0)
344 {
345 // including the queue size in the max_size, since each element in queue
346 // can grow up exponentially
347
348 // by default, terminals are weighted (counts as 3 nodes in program size).
349 // since every spot in queue has potential to be a terminal, we multiply
350 // its size by 3. Subtracting one due to the fact that this loop will
351 // always insert a non terminal (which by default has weights off).
352 // this way, we can have PTC2 working properly.
353
// Pop a random open slot; `d` here shadows the outer depth variable.
354 auto [qspot, t, d] = RandomDequeue(queue);
355
// Depth or size budget exhausted: this slot must become a terminal.
356 if (d >= max_d || s >= max_size)
357 {
358 auto opt = sample_terminal(t);
359
360 // if it returned optional, then there's nothing to sample based on weights.
361 // We'll force sampling again with uniform probs
362 if (!opt)
363 opt = sample_terminal(t, true);
364
365 // If we successfully get a terminal, use it
// NOTE(review): opt.value() throws std::bad_optional_access if even the
// forced sampling failed — no explicit guard here, unlike the else branch.
366 n = opt.value();
367
368 Tree.replace(qspot, n);
369 }
370 else
371 {
372 //choose a nonterminal of matching type
373 auto opt = sample_op(t);
374
375 if (!opt) { // there is no operator for this node. sample a terminal instead
376 opt = sample_terminal(t);
377
378 // didnt work the easy way, lets try the hard way
379 if (!opt)
380 opt = sample_terminal(t, true);
381 }
382
383 if (!opt) { // no operator nor terminal. weird.
// NOTE(review): original line 385 is missing here — presumably
// `HANDLE_ERROR_THROW(msg);`; as shown, `msg` is built and discarded.
384 auto msg = fmt::format("Failed to sample operator AND terminal of data type {} during PTC2.\n", DataTypeName[t]);
386
387 // queue.push_back(make_tuple(qspot, t, d));
388 // continue;
389 }
390
391 n = opt.value();
392 // if (Is<NodeType::And>(n.node_type)){
393 // cout << "AND\n";
394 // }
395
396 auto newspot = Tree.replace(qspot, n);
397
398 // For each arg of n, add to queue
399 for (auto a : n.arg_types)
400 {
401 auto child_spot = Tree.append_child(newspot);
402
403 queue.push_back(make_tuple(child_spot, a, d+1));
404 }
405 }
406
407 // increment is different based on node weights
408 ++s;
409
// NOTE(review): the `if (...)` line guarding this `s += 3` was dropped by
// the extraction (original line 410) — mirrors the root-node bump above.
411 s += 3;
412 else if (Is<NodeType::SplitOn>(n.node_type)){
413 // cout << "sampled split on node/n";
414 s += 2;
415 }
416
// NOTE(review): continuation of this condition (original line 418) missing.
417 if ( n.get_is_weighted()==true
419 s += 2;
420 }
421
// Close out every remaining open slot with a terminal.
422 while (queue.size() > 0)
423 {
// NOTE(review): redundant — the loop condition already guarantees this.
424 if (queue.size() == 0)
425 break;
426
427 auto [qspot, t, d] = RandomDequeue(queue);
428
429 auto opt = sample_terminal(t);
430
431 if (!opt)
432 opt = sample_terminal(t, true);
433
434 n = opt.value();
435
// NOTE(review): `newspot` is unused in this loop, unlike the main loop above.
436 auto newspot = Tree.replace(qspot, n);
437 }
438 return Tree;
439};
440
441// TODO: stop using params as a default argument and actually pass it (also update tests)
442RegressorProgram SearchSpace::make_regressor(int max_d, int max_size, const Parameters& params)
443{
444 return make_program<RegressorProgram>(params, max_d, max_size);
445};
446
447ClassifierProgram SearchSpace::make_classifier(int max_d, int max_size, const Parameters& params)
448{
449 return make_program<ClassifierProgram>(params, max_d, max_size);
450};
451
453 int max_d, int max_size, const Parameters& params)
454{
455 return make_program<MulticlassClassifierProgram>(params, max_d, max_size);
456};
457
458RepresenterProgram SearchSpace::make_representer(int max_d, int max_size, const Parameters& params)
459{
460 return make_program<RepresenterProgram>(params, max_d, max_size);
461};
462
463} //Brush
464
holds variable type data.
Definition data.h:51
bool classification
whether this is a classification problem
Definition data.h:83
std::map< string, State > features
dataset features, as key value pairs
Definition data.h:74
std::vector< DataType > unique_data_types
keeps track of the unique data types in the dataset.
Definition data.h:62
ArrayXf y
length N array, the target label
Definition data.h:80
#define HANDLE_ERROR_THROW(err)
Definition error.h:27
float slope(const ArrayXf &x, const ArrayXf &y)
slope of x/y
Definition utils.cpp:359
< nsga2 selection operator for getting the front
Definition bandit.cpp:4
NodeType
Definition nodetype.h:31
Program< PT::Representer > RepresenterProgram
Definition types.h:81
auto Isnt(DataType dt) -> bool
Definition node.h:43
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition types.h:39
Program< PT::BinaryClassifier > ClassifierProgram
Definition types.h:79
DataType
data types.
Definition types.h:143
auto Is(NodeType nt) -> bool
Definition node.h:272
std::unordered_map< std::size_t, std::string > ArgsName
vector< Node > generate_terminals(const Dataset &d, const bool weights_init)
generate terminals from the dataset features and random constants.
T RandomDequeue(std::vector< T > &Q)
queue for make program
map< DataType, string > DataTypeName
Definition data.cpp:14
std::map< NodeType, std::string > NodeTypeName
Definition nodetype.cpp:81
Eigen::Array< int, Eigen::Dynamic, 1 > ArrayXi
Definition types.h:40
Program< PT::Regressor > RegressorProgram
Definition types.h:78
float calc_initial_weight(const ArrayXf &value, const ArrayXf &y)
Program< PT::MulticlassClassifier > MulticlassClassifierProgram
Definition types.h:80
static constexpr size_t Count
Definition nodetype.h:120
class holding the data for a node in a tree.
Definition node.h:84
std::vector< DataType > arg_types
argument data types
Definition node.h:103
NodeType node_type
the node type
Definition node.h:95
DataType ret_type
return data type
Definition node.h:101
bool get_is_weighted() const
Definition node.h:255
void set_prob_change(float w)
Definition node.h:246
void print() const
prints the search space map.
Map< Node > node_map
Maps return types to argument types to node types.
unordered_map< DataType, vector< Node > > terminal_map
Maps return types to terminals.
void init(const Dataset &d, const unordered_map< string, float > &user_ops={}, bool weights_init=true)
Called by the constructor to initialize the search space.
RegressorProgram make_regressor(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random regressor program. Convenience wrapper for make_program.
unordered_map< DataType, vector< float > > terminal_weights
A map of weights corresponding to elements in terminal_map, used to weight probabilities of each term...
vector< float > get_weights() const
get weights of the return types
vector< string > op_names
A vector storing the available operator names (used by bandits).
void GenerateNodeMap(const unordered_map< string, float > &user_ops, const vector< DataType > &unique_data_types, std::index_sequence< Is... >)
tree< Node > & PTC2(tree< Node > &Tree, tree< Node >::iterator root, int max_d, int max_size) const
std::optional< Node > sample_op(DataType ret) const
get an operator matching return type ret.
RepresenterProgram make_representer(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random representer program. Convenience wrapper for make_program.
Map< float > node_map_weights
A map of weights corresponding to elements in node_map, used to weight probabilities of each node bei...
MulticlassClassifierProgram make_multiclass_classifier(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random multiclass classifier program. Convenience wrapper for make_program.
std::optional< tree< Node > > sample_subtree(Node root, int max_d, int max_size) const
create a subtree with maximum size and depth restrictions and root of type root_type
PT make_program(const Parameters &params, int max_d=0, int max_size=0)
Makes a random program.
vector< DataType > terminal_types
A vector storing the available return types of terminals.
bool has_solution_space(Iter start, Iter end) const
Takes iterators to weight vectors and checks if they have a non-empty solution space....
ClassifierProgram make_classifier(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random classifier program. Convenience wrapper for make_program.
std::optional< Node > sample_terminal(bool force_return=false) const
Get a random terminal.
normalizes a matrix to unit variance, 0 mean centered.
Definition utils.h:313
void fit_normalize(MatrixXf &X, const vector< char > &dtypes)
Definition utils.cpp:137