Feat C++ API
A feature engineering automation tool
fair_lexicase.cc
Go to the documentation of this file.
1 /* FEWTWO
2 copyright 2017 William La Cava
3 license: GNU/GPL v3
4 */
5 
6 #include "fair_lexicase.h"
7 
8 namespace FT{
9 
10 namespace Sel{
11 
13 {
14  name = "fair_lexicase";
15  survival = surv;
16 }
17 
19 
21  const Parameters& params, const Data& d)
22 {
33  //< number of samples
34  unsigned int N = pop.individuals.at(0).error.size();
35  //< number of individuals
36  unsigned int P = pop.individuals.size();
37 
38  vector<size_t> starting_pool;
39  for (int i = 0; i < pop.individuals.size(); ++i)
40  {
41  starting_pool.push_back(i);
42  }
43  assert(starting_pool.size() == P);
44 
45  // selected individuals
46  vector<size_t> selected(P,0);
47  #pragma omp parallel for
48  for (unsigned int i = 0; i<P; ++i) // selection loop
49  {
50  vector<size_t> pool = starting_pool; // initial pool
51  vector<size_t> winner; // winners
52 
53 
54  vector<size_t> shuff_idx; // used to shuffle cases
55  map<int,vector<float>> protect_levels;
56  vector<float> used_levels;
57  if (!d.cases.empty())
58  {
59  /* cout << "d.cases is not empty; using...\n"; */
60  shuff_idx.resize(d.cases.size());
61  std::iota(shuff_idx.begin(),shuff_idx.end(),0);
62  // shuffle cases
63  r.shuffle(shuff_idx.begin(),shuff_idx.end());
64  }
65  else
66  {
67  // store a local copy of protect levels to prevent repeats
68  if (d.protect_levels.size() == 0)
69  {
70  /* cout << "d.protect_levels empty"; */
71  }
72  protect_levels = d.protect_levels;
73  }
74 
75 
76  bool pass = true; // checks pool size and number of cases
77  unsigned int h = 0; // case count
78 
79  while(pass){ // main loop
80 
81  // **Fairness Subgroups**
82  // Here, a "case" is the mean fitness over a collection of
83  // samples sharing an intersection of levels of
84  // protected groups.
85 
86  // if the cases haven't been enumerated,
87  // we sample group intersections.
88  ArrayXb x_idx;
89  if (d.cases.empty())
90  {
91  /* cout << "d.cases is empty, so sampling group " */
92  /* << "intersections\n"; */
93  /* cout << "current protect_levels:\n"; */
94  /* for (auto pl : protect_levels) */
95  /* { */
96  /* cout << "\tfeature " << pl.first << ":" */
97  /* << pl.second.size() << " values; "; */
98  /* } */
99  /* cout << "\n"; */
100  vector<int> groups;
101  // push back current groups (keys)
102  for (auto pl : protect_levels)
103  {
104  groups.push_back(pl.first);
105  }
106  x_idx = ArrayXb::Constant(d.X.cols(),true);
107  /* cout << "x_idx sum: " << x_idx.count() << "\n"; */
108  // choose a random group
109  vector<size_t> choice_idxs(groups.size());
110  std::iota(choice_idxs.begin(),choice_idxs.end(),0);
111  size_t idx = r.random_choice(choice_idxs);
112  int g = groups.at(idx);
113  /* cout << "chosen group: " << g << "\n"; */
114  // choose a random level
115  vector<float> lvls = unique(VectorXf(d.X.row(g)));
116  // remove levels not in protect_levels[g]
117  for (int i = lvls.size()-1; i --> 0; )
118  {
119  // ^ that's a backwards loop alright!
120  if (!in(protect_levels.at(g),lvls.at(i)))
121  lvls.erase(lvls.begin() + i);
122  }
123  float level = r.random_choice(lvls);
124  /* cout << "chosen level: " << level << "\n"; */
125  // remove chosen level from protect_levels
126  for (int i = 0; i < protect_levels.at(g).size(); ++i)
127  {
128  if (protect_levels[g].at(i) == level)
129  {
130  protect_levels[g].erase(
131  protect_levels[g].begin() + i);
132  break;
133  }
134  }
135 
136  x_idx = (d.X.row(g).array() == level);
137  /* cout << "x_idx count: " << x_idx.count() << "\n"; */
138  }
139  const ArrayXb& case_idx = (d.cases.empty() ?
140  x_idx : d.cases.at(shuff_idx[h]));
141  // get fitness of everyone in the pool
142  ArrayXf fitness(pool.size());
143  for (size_t j = 0; j<pool.size(); ++j)
144  {
145  if (r() < 0.5)
146  {
147  // half the time use loss
148  fitness(j) = case_idx.select(
149  pop.individuals.at(pool[j]).error,
150  0).sum();
151  }
152  else
153  {
154  // half the time, look at fairness
155  fitness(j) = fabs(pop.individuals.at(pool[j]).error.sum()
156  - case_idx.select(
157  pop.individuals.at(pool[j]).error,
158  0).sum());
159  }
160  }
161  // get epsilon for the fitnesses
162  float epsilon = mad(fitness);
163  //
164  winner.resize(0); // winners
165  // minimum error on case
166  float minfit = std::numeric_limits<float>::max();
167 
168  // get minimum
169  for (size_t j = 0; j<pool.size(); ++j)
170  {
171  if (fitness(j) < minfit)
172  minfit = fitness(j);
173  }
174 
175  // select best
176  for (size_t j = 0; j<pool.size(); ++j)
177  {
178  if (fitness(j) <= minfit+epsilon)
179  winner.push_back(pool[j]);
180  }
181 
182  /* cout << "h: " << h << endl; */
183  /* cout << "winner size: " << winner.size() << "\n"; */
184  ++h; // next case
185  // only keep going if needed
186  pass = (winner.size()>1
187  && h<shuff_idx.size());
188 
189  if(winner.size() == 0)
190  {
191  if(h >= d.group_intersections)
192  winner.push_back(r.random_choice(pool));
193  else
194  pass = true;
195  }
196  else
197  {
198  // reduce pool to remaining individuals
199  pool = winner;
200  }
201 
202  }
203 
204  assert(winner.size()>0);
205  //if more than one winner, pick randomly
206  selected.at(i) = r.random_choice(winner);
207  }
208 
209  if (selected.size() != pop.individuals.size())
210  {
211  std::cout << "selected: " ;
212  for (auto s: selected) std::cout << s << " "; std::cout << "\n";
213  THROW_LENGTH_ERROR("Lexicase did not select correct number of \
214  parents");
215  }
216  return selected;
217 }
218 
220  const Parameters& params, const Data& d)
221 {
222  /* FairLexicase survival */
223  THROW_RUNTIME_ERROR("Lexicase survival not implemented");
224  return vector<size_t>();
225 }
226 
227 }
228 
229 }
data holding X, y, and Z data
Definition: data.h:42
vector< ArrayXb > cases
Definition: data.h:64
int group_intersections
Definition: data.h:63
map< int, vector< float > > protect_levels
Definition: data.h:61
MatrixXf & X
Definition: data.h:45
T random_choice(const vector< T > &v)
Definition: rnd.h:73
void shuffle(RandomAccessIterator first, RandomAccessIterator last)
Definition: rnd.h:53
Eigen::Array< bool, Eigen::Dynamic, 1 > ArrayXb
Definition: data.h:21
#define THROW_LENGTH_ERROR(err)
Definition: error.h:32
#define THROW_RUNTIME_ERROR(err)
Definition: error.h:30
T pop(vector< T > *v)
Definition: auto_backprop.h:49
float mad(const ArrayXf &x)
median absolute deviation
Definition: utils.cc:189
bool in(const vector< T > v, const T &i)
check if element is in vector.
Definition: utils.h:47
vector< T > unique(vector< T > w)
returns unique elements in vector
Definition: utils.h:336
static Rnd & r
Definition: rnd.h:135
main Feat namespace
Definition: data.cc:13
int i
Definition: params.cc:552
holds the hyperparameters for Feat.
Definition: params.h:25
Defines a population of programs and functions for constructing them.
Definition: population.h:28
vector< size_t > survive(Population &pop, const Parameters &params, const Data &d)
lexicase survival
vector< size_t > select(Population &pop, const Parameters &params, const Data &d)
function returns a set of selected indices