Brush C++ API
A flexible interpretable machine learning framework
Loading...
Searching...
No Matches
search_space.cpp
Go to the documentation of this file.
1#include "search_space.h"
2#include "../program/program.h" // TODO: dont import this header here
3
4namespace Brush{
5
6
7float calc_initial_weight(const ArrayXf& value, const ArrayXf& y)
8{
9 // OBS: only for terminals!
10
11 // weights are initialized as the slope of the z-score of x and y.
12
13 // If y has different length from X, we get a core dump in this function.
14 // That is why Dataset makes a check for this
15 // TODO: need to make SS (or Datasaet) check for this when loading the data
16
17 vector<char> dtypes = {'f', 'f'};
18
19 MatrixXf data(value.size(), 2);
20
21 data.col(0) << value;
22 data.col(1) << y;
23
24 Normalizer n(true);
25 n.fit_normalize(data, dtypes); // normalize works row-wise
26
27 // In slope function, argument order matters (if not normalized with z-score)
28 // The feature should be the first value, and the true value the second
29 // (it will divide covar(arg1, arg2) by var(arg2)).
30 // Since z-score normalizes so mean=0 and std=1, then order doesnt matter
31 float prob_change = std::abs(slope(data.col(0).array() , // x=variable
32 data.col(1).array() )); // y=target
33
34 // having a minimum feature weight if it was not set to zero
35 if (std::abs(prob_change)<1e-4)
36 prob_change = 1e-1;
37
38 // prob_change will evaluate to nan if variance(x)==0. Features with
39 // zero variance should not be used (as they behave just like a constant).
40 if (std::isnan(prob_change))
41 prob_change = 0.0;
42
43 return prob_change;
44}
45
46
// Builds one terminal Node per dataset feature, then appends typed constant
// terminals (Cf/Ci/Cb) and a MeanLabel terminal. Initial sampling weights
// ("prob_change") come from calc_initial_weight when weights_init is true
// and the dataset has a target; otherwise they default to 1.0.
//
// NOTE(review): this listing is a Doxygen export and its embedded line numbers
// jump (63->65->68, 96->98, 109->111, 144->146, 148->150, 153->155). The Node
// constructor arguments, the per-class slope assignment inside the int-feature
// loop, the error throw for unknown types, and the constants' set_prob_change
// calls appear to be missing from this view — confirm against the repository.
51vector<Node> generate_terminals(const Dataset& d, const bool weights_init)
52{
53 vector<Node> terminals;
54 int i = 0;
// One terminal per feature; value is a variant over the supported array types.
55 for ( const auto &[feature_name, value]: d.features )
56 {
57 std::visit(
58 [&](auto&& arg) {
59 using T = std::decay_t<decltype(arg)>;
60 using Scalar = typename T::Scalar;
// Only float-valued terminals are created weighted.
61 constexpr bool weighted = std::is_same_v<Scalar, float>;
62 // if constexpr (T::Scalar == float)
63 auto n = Node(
65 Signature<T()>{},
68 );
69
70 float prob_change = 1.0; // default value
71
// Only compute data-driven weights when a target exists and it was requested.
72 if (d.y.size()>0 && weights_init)
73 {
74 // if the value can be casted to float array, we can calculate slope
75 if (std::holds_alternative<ArrayXf>(value) && d.y.size()>0)
76 {
77 prob_change = calc_initial_weight(std::get<ArrayXf>(value), d.y);
78 }
79 else if (std::holds_alternative<ArrayXi>(value))
80 {
81 // for each variable we create a one-vs-all binary variable, then
82 // calculate slope. Final value will be the average of slopes
83
84 auto tmp = std::get<ArrayXi>(value);
85
86 //get number of unique values
// std::map keyed on the casted value acts as an ordered unique-set here.
87 std::map<float, bool> uniqueMap;
88 for(int i = 0; i < tmp.size(); i++)
89 uniqueMap[(float)tmp(i)] = true;
90
91 ArrayXf slopes = ArrayXf::Ones(uniqueMap.size());
92 int slopesIterator = 0;
93 for (const auto& pair : uniqueMap)
94 {
// Binary one-vs-all encoding for the current class value.
95 auto one_vs_all = ArrayXf::Ones(tmp.size()).array() * (tmp.array()==pair.first).cast<float>();
96
// NOTE(review): the line assigning slopes(slopesIterator) is missing in this export.
98 }
99
100 prob_change = slopes.mean();
101 }
102 else if (std::holds_alternative<ArrayXb>(value))
103 {
// Booleans are cast to float so the same slope-based weight applies.
104 auto tmp = std::get<ArrayXb>(value).template cast<float>();
105 prob_change = calc_initial_weight(tmp, d.y);
106 }
107 else
108 {
109 auto msg = fmt::format("Brush coudn't calculate the initial weight of variable {}\n",feature_name);
// NOTE(review): the error-raising statement (presumably HANDLE_ERROR_THROW(msg)) is missing here.
111 }
112 }
113
114 n.set_prob_change( prob_change );
115
116 terminals.push_back(n);
117 },
118 value
119 );
120 ++i;
121 };
122
123 // iterate through terminals and take the average of values of same signature
// NOTE(review): if no terminal has the requested ret_type, count==0 and this
// divides by zero (NaN weight for the constant) — confirm whether that can occur.
124 auto signature_avg = [terminals](DataType ret_type){
125 float sum = 0.0;
126 int count = 0;
127
128 for (const auto& n : terminals) {
129 if (n.ret_type == ret_type) {
130 sum += n.get_prob_change();
131 count++;
132 }
133 }
134
135 return sum / count;
136 };
137
138 // constants for each type
139 auto cXf = Node(NodeType::Constant, Signature<ArrayXf()>{}, true, "Cf");
140 float floats_avg_weights = signature_avg(cXf.ret_type);
141 cXf.set_prob_change(floats_avg_weights);
142 terminals.push_back(cXf);
143
144 auto cXi = Node(NodeType::Constant, Signature<ArrayXi()>{}, true, "Ci");
// NOTE(review): cXi's set_prob_change call (embedded line 145) is missing in this export.
146 terminals.push_back(cXi);
147
148 auto cXb = Node(NodeType::Constant, Signature<ArrayXb()>{}, false, "Cb");
// NOTE(review): cXb's set_prob_change call (embedded line 149) is missing in this export.
150 terminals.push_back(cXb);
151
152 // mean label node
153 auto meanlabel = Node(NodeType::MeanLabel, Signature<ArrayXf()>{}, true, "MeanLabel");
// NOTE(review): meanlabel's set_prob_change call (embedded line 154) is missing in this export.
155 terminals.push_back(meanlabel);
156
157 return terminals;
158};
159
// Global map keyed by a std::size_t hash to a human-readable name string.
// NOTE(review): populated/read elsewhere — presumably maps signature hashes
// to printable argument-list names; confirm against the header.
160std::unordered_map<std::size_t, std::string> ArgsName;
161
162void SearchSpace::print() const {
163 std::cout << fmt::format("{}\n", *this) << std::flush;
164}
165
// Initializes the search space from the dataset: clears all node/terminal
// maps, registers terminal types, ensures classification root operators
// (OffsetSum + Logistic/Softmax) are present, generates the node map, and
// indexes terminals by return type with their sampling weights.
//
// NOTE(review): this listing is a Doxygen export; embedded line 184 is
// missing — presumably the statement that fills `terminals` (e.g. via
// generate_terminals(d, weights_init)) before the loop at the bottom. Confirm
// against the repository.
166void SearchSpace::init(const Dataset& d, const unordered_map<string,float>& user_ops,
167 bool weights_init)
168{
169 // fmt::print("constructing search space...\n");
// Reset all state so init() can be called more than once.
170 this->node_map.clear();
171 this->node_map_weights.clear();
172 this->terminal_map.clear();
173 this->terminal_types.clear();
174 this->terminal_weights.clear();
175
// Empty user_ops means "use every available operator".
// NOTE(review): use_all and op_names are computed here but not used in the
// visible portion of this function — confirm whether GenerateNodeMap needs them.
176 bool use_all = user_ops.size() == 0;
177 vector<string> op_names;
178 for (const auto& [op, weight] : user_ops)
179 op_names.push_back(op);
180
181 // create nodes based on data types
182 terminal_types = d.unique_data_types;
183
185
186 // If it is a classification problem, we need to add the fixed root nodes
187 // (logistic for binary classification, softmax for multiclassification).
188 // Sometimes, the user may not specify these two nodes as candidates when
189 // sampling functions, so we check if they are already in the terminal set, and
190 // we add them with zero prob if they are not. They need to be in the func set
191 // when calling GenerateNodeMap, so the search_space will contain all the hashes
192 // and signatures for them (and they can be used only in program root).
193 // TODO: fix softmax and add it here
194
195 // Copy the original map using the copy constructor
196 std::unordered_map<std::string, float> extended_user_ops(user_ops);
197
198 if (d.classification)
199 {
200 // Convert ArrayXf to std::vector<float> for compatibility with std::set
201 std::vector<float> vec(d.y.data(), d.y.data() + d.y.size());
202
// Unique labels determine binary vs multiclass.
203 std::set<float> unique_classes(vec.begin(), vec.end());
204
205 // We need some ops in the search space so we can have the logit and offset
// Weight 0.0 means these are never sampled internally — root use only.
206 if (user_ops.find("OffsetSum") == user_ops.end())
207 extended_user_ops.insert({"OffsetSum", 0.0f});
208
209 if (unique_classes.size()==2 && (user_ops.find("Logistic") == user_ops.end())) {
210 extended_user_ops.insert({"Logistic", 0.0f});
211 }
212 else if (user_ops.find("Softmax") == user_ops.end()) {
213 extended_user_ops.insert({"Softmax", 0.0f});
214 }
215 }
216
217 /* fmt::print("generate nodetype\n"); */
// Compile-time enumeration over every NodeType to fill node_map/node_map_weights.
218 GenerateNodeMap(extended_user_ops, d.unique_data_types,
219 std::make_index_sequence<NodeTypes::OpCount>());
220
221 // map terminals
222 /* fmt::print("looping through terminals...\n"); */
223 for (const auto& term : terminals)
224 {
225 /* fmt::print("adding {} to search space...\n", term.get_name()); */
226 if (terminal_map.find(term.ret_type) == terminal_map.end())
227 terminal_map[term.ret_type] = vector<Node>();
228
229 /* fmt::print("terminal ret_type: {}\n", DataTypeName[term.ret_type]); */
// terminal_weights[t][i] is the sampling weight of terminal_map[t][i].
230 terminal_map[term.ret_type].push_back(term);
231 terminal_weights[term.ret_type].push_back(term.get_prob_change());
232 }
233};
234
235std::optional<tree<Node>> SearchSpace::sample_subtree(Node root, int max_d, int max_size) const
236{
237 // public interface to use PTC2 algorithm
238
239 // PTC is designed to not fail (it will persistently try to find nodes with
240 // sampling functions). In pop initialization, this shoudnt be a problem, but
241 // during evolution, due to dynamic changes in node weights by the learners,
242 // it now may fail. We need to check, before calling it, that it has elements
243 // in search space to sample
244 auto ret_match = node_map.at(root.ret_type);
245
246 vector<float> args_w = get_weights(root.ret_type);
247
248 // at least one operator that matches the weight must have positive probability
249 if (!has_solution_space(args_w.begin(), args_w.end()))
250 return std::nullopt;
251
252 if ( (terminal_map.find(root.ret_type) == terminal_map.end())
253 || (!has_solution_space(terminal_weights.at(root.ret_type).begin(),
254 terminal_weights.at(root.ret_type).end())) )
255 return std::nullopt;
256
257 auto Tree = tree<Node>();
258 auto spot = Tree.insert(Tree.begin(), root);
259
260 // we should notice the difference between size of a PROGRAM and a TREE.
261 // program count weights in its size, while the TREE structure dont. Wenever
262 // using size of a program/tree, make sure you use the function from the correct class
263 PTC2(Tree, spot, max_d, max_size);
264
265 return Tree;
266};
267
// PTC2: grows a tree in place under `spot`, repeatedly replacing queued
// argument slots with sampled operators until the size/depth budget is spent,
// then filling all remaining slots with terminals.
//
// NOTE(review): this listing is a Doxygen export; the function's first
// signature line (embedded 268, presumably
// `tree<Node>& SearchSpace::PTC2(tree<Node>& Tree,`) and embedded lines
// 296, 346, 351 and 379 are missing from this view — confirm against the
// repository before editing.
269 tree<Node>::iterator spot, int max_d, int max_size) const
270{
271 // PTC2 is agnostic of program type
272
273 // A comment about PTC2 method:
274 // PTC2 can work with depth or size restriction, but it does not strictly
275 // satisfies these conditions all time. Given a `max_size` and `max_depth`
276 // parameters, the real maximum size that can occur is `max_size` plus the
277 // highest operator arity, and the real maximum depth is `max_depth` plus one.
278
279 // Queue of nodes that need children
// Each entry: (slot iterator, required return type, depth of that slot).
280 vector<tuple<TreeIter, DataType, int>> queue;
281
282 // node depth
283 int d = 1;
284 // current tree size
285 int s = 1;
286
287 Node root = spot.node->data;
288
289 // updating size accordingly to root node
// Split nodes count as extra program-size units.
290 if (Is<NodeType::SplitBest>(root.node_type))
291 s += 3;
292 else if (Is<NodeType::SplitOn>(root.node_type))
293 s += 2;
294
// NOTE(review): the remainder of this condition (embedded line 296) is missing here.
295 if ( root.get_is_weighted()==true
297 s += 2;
298
299 //For each argument position a of n, Enqueue(a; g)
300 for (auto a : root.arg_types)
301 {
302 // cout << "queing a node of type " << DataTypeName[a] << endl;
303 auto child_spot = Tree.append_child(spot);
304 queue.push_back(make_tuple(child_spot, a, d));
305 }
306
// NOTE(review): max_arity is set but not used in the visible code — confirm.
307 int max_arity = 4;
308
309 Node n;
310 // Now we actually start the PTC2 procedure to create the program tree
311 while ( queue.size() + s < max_size && queue.size() > 0)
312 {
313 // including the queue size in the max_size, since each element in queue
314 // can grow up exponentially
315
316 // by default, terminals are weighted (counts as 3 nodes in program size).
317 // since every spot in queue has potential to be a terminal, we multiply
318 // its size by 3. Subtracting one due to the fact that this loop will
319 // always insert a non terminal (which by default has weights off).
320 // this way, we can have PTC2 working properly.
321
322 // cout << "queue size: " << queue.size() << endl;
// Dequeue a random pending slot (shadows the outer `d` with the slot's depth).
323 auto [qspot, t, d] = RandomDequeue(queue);
324
325 // cout << "current depth: " << d << endl;
// Budget exhausted for this slot: it must become a terminal.
326 if (d >= max_d || s >= max_size)
327 {
328 auto opt = sample_terminal(t);
329
330 // if it returned optional, then there's nothing to sample based on weights.
331 // We'll force sampling again with uniform probs
332 if (!opt)
333 opt = sample_terminal(t, true);
334
335 // If we successfully get a terminal, use it
// NOTE(review): opt.value() throws if both samples failed — no fallback here.
336 n = opt.value();
337
338 Tree.replace(qspot, n);
339 }
340 else
341 {
342 //choose a nonterminal of matching type
343 auto opt = sample_op(t);
344
// NOTE(review): the terminal-fallback statement (embedded line 346) is missing here.
345 if (!opt) { // there is no operator for this node. sample a terminal instead
347 }
348
349 if (!opt) { // no operator nor terminal. weird.
350 auto msg = fmt::format("Failed to sample operator AND terminal of data type {} during PTC2.\n", DataTypeName[t]);
// NOTE(review): the error-raising statement (embedded line 351) is missing here.
352
353 // queue.push_back(make_tuple(qspot, t, d));
354 // continue;
355 }
356
357 n = opt.value();
358
359 auto newspot = Tree.replace(qspot, n);
360
361 // For each arg of n, add to queue
362 for (auto a : n.arg_types)
363 {
364 auto child_spot = Tree.append_child(newspot);
365
// Children sit one level deeper than the node just placed.
366 queue.push_back(make_tuple(child_spot, a, d+1));
367 }
368 }
369
370 // increment is different based on node weights
371 ++s;
372
373 if (Is<NodeType::SplitBest>(n.node_type))
374 s += 3;
375 else if (Is<NodeType::SplitOn>(n.node_type))
376 s += 2;
377
// NOTE(review): the remainder of this condition (embedded line 379) is missing here.
378 if ( n.get_is_weighted()==true
380 s += 2;
381 }
382
// Drain the queue: every remaining slot is filled with a terminal.
383 while (queue.size() > 0)
384 {
// NOTE(review): this guard is redundant with the loop condition above.
385 if (queue.size() == 0)
386 break;
387
388 auto [qspot, t, d] = RandomDequeue(queue);
389
390 auto opt = sample_terminal(t);
391 if (!opt)
392 opt = sample_terminal(t, true);
393
394 n = opt.value();
395
396 auto newspot = Tree.replace(qspot, n);
397 }
398 return Tree;
399};
400
// Convenience wrappers around make_program<...> for each program type.
//
// NOTE(review): this listing is a Doxygen export; the wrapper signatures
// (embedded lines 402, 407, 418) and the whole make_multiclass_classifier
// body (embedded lines 412-416) are missing from this view. Per the symbol
// index below, each takes (int max_d=0, int max_size=0, const Parameters&
// params=Parameters()) — confirm against the repository.
401// TODO: stop using params as a default argument and actually pass it (also update tests)
403{
404 return make_program<RegressorProgram>(params, max_d, max_size);
405};
406
408{
409 return make_program<ClassifierProgram>(params, max_d, max_size);
410};
411
417
419{
420 return make_program<RepresenterProgram>(params, max_d, max_size);
421};
422
423} //Brush
424
void bind_engine(py::module &m, string name)
holds variable type data.
Definition data.h:51
#define HANDLE_ERROR_THROW(err)
Definition error.h:27
float slope(const ArrayXf &x, const ArrayXf &y)
slope of x/y
Definition utils.cpp:359
nsga2 selection operator for getting the front
Definition data.cpp:12
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition types.h:39
DataType
data types.
Definition types.h:143
std::unordered_map< std::size_t, std::string > ArgsName
vector< Node > generate_terminals(const Dataset &d, const bool weights_init)
generate terminals from the dataset features and random constants.
T RandomDequeue(std::vector< T > &Q)
queue for make program
map< DataType, string > DataTypeName
Definition data.cpp:14
Eigen::Array< int, Eigen::Dynamic, 1 > ArrayXi
Definition types.h:40
float calc_initial_weight(const ArrayXf &value, const ArrayXf &y)
class holding the data for a node in a tree.
Definition node.h:84
void set_prob_change(float w)
Definition node.h:240
An individual program, a.k.a. model.
Definition program.h:50
void print() const
prints the search space map.
Map< Node > node_map
Maps return types to argument types to node types.
unordered_map< DataType, vector< Node > > terminal_map
Maps return types to terminals.
void init(const Dataset &d, const unordered_map< string, float > &user_ops={}, bool weights_init=true)
Called by the constructor to initialize the search space.
RegressorProgram make_regressor(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random regressor program. Convenience wrapper for make_program.
unordered_map< DataType, vector< float > > terminal_weights
A map of weights corresponding to elements in terminal_map, used to weight probabilities of each term...
vector< float > get_weights() const
get weights of the return types
void GenerateNodeMap(const unordered_map< string, float > &user_ops, const vector< DataType > &unique_data_types, std::index_sequence< Is... >)
tree< Node > & PTC2(tree< Node > &Tree, tree< Node >::iterator root, int max_d, int max_size) const
std::optional< Node > sample_op(DataType ret) const
get an operator matching return type ret.
RepresenterProgram make_representer(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random representer program. Convenience wrapper for make_program.
Map< float > node_map_weights
A map of weights corresponding to elements in node_map, used to weight probabilities of each node bei...
MulticlassClassifierProgram make_multiclass_classifier(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random multiclass classifier program. Convenience wrapper for make_program.
std::optional< tree< Node > > sample_subtree(Node root, int max_d, int max_size) const
create a subtree with maximum size and depth restrictions and root of type root_type
vector< DataType > terminal_types
A vector storing the available return types of terminals.
bool has_solution_space(Iter start, Iter end) const
Takes iterators to weight vectors and checks if they have a non-empty solution space....
ClassifierProgram make_classifier(int max_d=0, int max_size=0, const Parameters &params=Parameters())
Makes a random classifier program. Convenience wrapper for make_program.
std::optional< Node > sample_terminal(bool force_return=false) const
Get a random terminal.
normalizes a matrix to unit variance, 0 mean centered.
Definition utils.h:313