Feat C++ API
A feature engineering automation tool
io.cc
Go to the documentation of this file.
1 /* FEAT
2 copyright 2017 William La Cava
3 license: GNU/GPL v3
4 */
5 
6 #include "io.h"
7 #include "utils.h"
8 /* #include "rnd.h" */
9 #include <unordered_set>
10 
11 namespace FT{
12 
13 namespace Util{
14 
15 void printProgress (float percentage)
16 {
17  int val = (int) (percentage * 100);
18  int lpad = (int) (percentage * PBWIDTH);
19  int rpad = PBWIDTH - lpad;
20  printf ("\rCompleted %3d%% [%.*s%*s]", val, lpad, PBSTR.c_str(), rpad, "");
21  fflush (stdout);
22  if(val == 100)
23  cout << "\n";
24 }
25 
27 void load_csv (const std::string & path, MatrixXf& X, VectorXf& y,
28  vector<string>& names, vector<char> &dtypes, bool& binary_endpoint,
29  char sep)
30 {
31  std::ifstream indata;
32  indata.open(path);
33  if (!indata.good())
34  THROW_INVALID_ARGUMENT("Invalid input file " + path + "\n");
35 
36  std::string line;
37  std::vector<float> values, targets;
38  unsigned rows=0, col=0, target_col = 0;
39 
40  while (std::getline(indata, line))
41  {
42  std::stringstream lineStream(line);
43  std::string cell;
44 
45  while (std::getline(lineStream, cell, sep))
46  {
47  cell = trim(cell);
48 
49  if (rows==0) // read in header
50  {
51  if (!cell.compare("class") || !cell.compare("target")
52  || !cell.compare("label"))
53  target_col = col;
54  else
55  names.push_back(cell);
56  }
57  else if (col != target_col)
58  values.push_back(std::stod(cell));
59  else
60  targets.push_back(std::stod(cell));
61 
62  ++col;
63  }
64  ++rows;
65  col=0;
66  }
67 
68  X = Map<MatrixXf>(values.data(), values.size()/(rows-1), rows-1);
69  y = Map<VectorXf>(targets.data(), targets.size());
70 
71  if (X.cols() != y.size())
72  THROW_LENGTH_ERROR("different numbers of samples in X and y");
73  if (X.rows() != names.size())
74  {
75  string error_msg = "header missing or incorrect number of "
76  "feature names\n";
77  error_msg += "X size: " + to_string(X.rows()) + "x"
78  + to_string(X.cols()) +"\n";
79  error_msg += "feature names: ";
80  for (auto fn: names)
81  error_msg += fn + ",";
82  THROW_LENGTH_ERROR(error_msg);
83  }
84 
85  dtypes = find_dtypes(X);
86 
87  string print_dtypes = "dtypes: ";
88  for (unsigned i = 0; i < dtypes.size(); ++i)
89  print_dtypes += (names.at(i) + " (" + to_string(dtypes.at(i))
90  + "), ");
91  print_dtypes += "\n";
92  cout << print_dtypes;
93 
94  // check if endpoint is binary
95  binary_endpoint = (y.array() == 0 || y.array() == 1).all();
96 
97 }
98 
100 void load_longitudinal(const std::string & path,
101  std::map<string, std::pair<vector<ArrayXf>, vector<ArrayXf> > > &Z,
102  char sep)
103 {
104  std::map<string, std::map<int, std::pair<vector<float>, vector<float> > > > dataMap;
105  std::ifstream indata;
106  indata.open(path);
107  if (!indata.good())
108  THROW_INVALID_ARGUMENT("Invalid input file " + path + "\n");
109 
110  std::string line, firstKey = "";
111 
112  string header;
113  std::getline(indata, header);
114 
115  std::stringstream lineStream(header);
116 
117  std::map<string,int> head_to_col;
118  for (int i = 0; i<4; ++i)
119  {
120  string tmp;
121  std::getline(lineStream,tmp, sep);
122  head_to_col[tmp] = i;
123  }
124 
125  while (std::getline(indata, line))
126  {
127  std::stringstream lineStream(line);
128  std::string sampleNo, value, time, type;
129 
130  vector<string> cols(4);
131  std::getline(lineStream, cols.at(0), sep);
132  std::getline(lineStream, cols.at(1), sep);
133  std::getline(lineStream, cols.at(2), sep);
134  std::getline(lineStream, cols.at(3), sep);
135 
136  sampleNo = cols.at(head_to_col.at("id"));
137  time = cols.at(head_to_col.at("date"));
138  value = cols.at(head_to_col.at("value"));
139  type = cols.at(head_to_col.at("name"));
140 
141  type = trim(type);
142 
143  if(!firstKey.compare(""))
144  firstKey = type;
145  /* cout << "sampleNo: " << sampleNo << ", time: " << time << ", value: " << value */
146  /* << ", type: " << type << "\n"; */
147  dataMap[type][std::stoi(sampleNo)].first.push_back(std::stod(value));
148  dataMap[type][std::stoi(sampleNo)].second.push_back(std::stod(time));
149  }
150 
151  int numVars = dataMap.size();
152  int numSamples = dataMap.at(firstKey).size();
153  int x;
154 
155  for ( const auto &val: dataMap )
156  {
157  for(x = 0; x < numSamples; ++x)
158  {
159  ArrayXf arr1 = Map<ArrayXf>(dataMap.at(val.first).at(x).first.data(),
160  dataMap.at(val.first).at(x).first.size());
161  ArrayXf arr2 = Map<ArrayXf>(dataMap.at(val.first).at(x).second.data(),
162  dataMap.at(val.first).at(x).second.size());
163  Z[val.first].first.push_back(arr1);
164  Z[val.first].second.push_back(arr2);
165 
166  }
167 
168  }
169 
170 }
171 
// Loads the records of a longitudinal file whose sample id appears in idx.
//
// path: file to read; the first line is a header whose first four fields are
//       mapped to column positions, and "id", "date", "value" and "name" are
//       looked up in that mapping for every record.
// Z:    output; for each variable name, one array of values (first) and one
//       array of dates (second) is appended per X/y row index.
// sep:  field separator.
// idx:  idx[k] is the sample id associated with row k of X and y; an id may
//       occur more than once, in which case its records are stored for every
//       matching row.
//
// Raises THROW_INVALID_ARGUMENT if the file cannot be opened. Lookups done
// with map::at throw std::out_of_range when a key is absent.
175 void load_partial_longitudinal(const std::string & path,
176  std::map<string, std::pair<vector<ArrayXf>, vector<ArrayXf> > > &Z,
177  char sep, const vector<int>& idx)
178 {
179  /* loads data from the longitudinal file, with idx providing the id numbers of each
180  * row in the main data (X and y).
181  * I.e., idx[k] = the id of samples in Z associated with sample k in X and y
182  */
183  /* cout << "in load_partial_longitudinal\n"; */
184  /* cout << idx.size() << " indices\n"; */
185  /* for (unsigned i = 0; i<idx.size(); ++i) */
186  /* cout << i << "," << idx[i] << "\n"; */
187  std::unordered_set<int> idSet; //(idx.begin(), idx.end());
188 
189  std::map<int, vector<int>> idLoc; // maps IDs to X/y row index (i.e. Loc)
190  std::map<int, int> locID; // maps X/y row indices (i.e. loc) to sample IDs
191  unsigned i = 0;
192  for(const auto& id : idx)
193  {
194  auto tmp = idSet.insert(id);
 // The branch below is taken when the insert did not add a new element
 // (e.g. id is a duplicate) or the returned iterator does not point at id.
195  if (!tmp.second || *tmp.first != id)
196  {
197  if(idSet.find(id) == idSet.end())
198  {
199  cout << "failed to find " << id << " in idSet\n";
200  cout << "retrying..\n";
201  int blrg=0;
 // NOTE(review): the `auto tmp` inside the loop declares a new variable
 // that shadows the outer `tmp`, so the loop condition never sees the
 // result of the retried insert.
202  while (blrg<100 && (!tmp.second || *tmp.first != id) )
203  {
204  auto tmp = idSet.insert(id);
205  blrg++;
206  }
207  if (blrg == 100)
208  THROW_RUNTIME_ERROR("insert failed on i = "
209  + std::to_string(i) + " id = "
210  + std::to_string(id));
211  }
212  }
 // record both directions of the id <-> row-index association
213  idLoc[id].push_back(i);
214  locID[i] = id;
215  ++i;
216  }
217  /* cout << "idSet size: " << idSet.size() << "\n"; */
218  /* cout << "idx size: " << idx.size() << "\n"; */
219  /* if (idSet.size() != idx.size()) */
220  /* { */
221  /* THROW_RUNTIME_ERROR("Sample IDs must be unique"); */
222  /* } */
223  /* cout << "\n"; */
224  // dataMap maps from the variable name (string) to a map containing
225  // 1) the sample row index in X/y, and 2) a pair consisting of
226  // - the variable value (first) and
227  // - variable date (second)
228  std::map<string, std::map<int, std::pair<vector<float>, vector<float> > > > dataMap;
229  std::ifstream indata;
230  indata.open(path);
231  if (!indata.good())
232  THROW_INVALID_ARGUMENT("Invalid input file " + path + "\n");
233 
 // NOTE(review): firstKey is assigned in the read loop but not read
 // anywhere else in this function.
234  std::string line, firstKey = "";
235 
236  // get header
237  string header;
238  std::getline(indata, header);
239 
240  std::stringstream lineStream(header);
241 
 // map each of the first four (trimmed) header fields to its column position
242  std::map<string,int> head_to_col;
243  for (int i = 0; i<4; ++i)
244  {
245  string tmp;
246  std::getline(lineStream,tmp, sep);
247  tmp = trim(tmp);
248  head_to_col[tmp] = i;
249  }
 // NOTE(review): nl, nfound and nskip are only incremented; the statements
 // that print them are commented out.
250  int nl=0;
251  int nfound=0;
252  int nskip=0;
253  cout << "reading " << path << "...\n";
254  while (std::getline(indata, line))
255  {
256  std::stringstream lineStream(line);
257  std::string sampleNo, value, time, name;
258 
259  vector<string> cols(4);
260  std::getline(lineStream, cols.at(0), sep);
261  std::getline(lineStream, cols.at(1), sep);
262  std::getline(lineStream, cols.at(2), sep);
263  std::getline(lineStream, cols.at(3), sep);
264 
 // only the fourth field of the record is trimmed
265  cols.at(3) = trim(cols.at(3));
266 
 // pick the fields by header name rather than by fixed position
267  sampleNo = cols.at(head_to_col.at("id"));
268  time = cols.at(head_to_col.at("date"));
269  value = cols.at(head_to_col.at("value"));
270  name = cols.at(head_to_col.at("name"));
271 
272  if(!firstKey.compare(""))
273  firstKey = name;
274 
275  int sID = std::stol(sampleNo);
276  // if the sample ID is to be included, store it
277  if(idSet.find(sID) != idSet.end())
278  {
279  // dataMap[variable-name][row-idx].value=value
280  // dataMap[variable-name][row-idx].time=time
 // the record is stored once for every X/y row that uses this sample ID
281  for (const auto& loc : idLoc.at(sID))
282  {
283  dataMap[name][loc].first.push_back(std::stod(value));
284  dataMap[name][loc].second.push_back(std::stod(time));
285  }
286  /* } */
287  ++nfound;
288  }
289  else
290  {
291  ++nskip;
292  }
293  ++nl;
294  }
295  //cout << "read " << nl << " lines of " << path << "\n";
296  //cout << "stored " << nfound << " lines, skipped " << nskip << "\n";
297  // validate dataMap
298  // for each dataMap[name], there should be map names from 0 ... numSamples -1
299  for ( const auto &val: dataMap )
300  {
 // NOTE(review): pass is assigned below but never read.
301  bool pass = true;
302  int numSamples = val.second.size();
303  for (int x = 0; x<numSamples; ++x)
304  {
305  if (val.second.find(x) == val.second.end())
306  {
 // NOTE(review): the opening of the statement that builds this message is
 // not visible in this listing (the numbering jumps from 306 to 308);
 // confirm against the original source file.
308  + " not found (patient id = "
309  + std::to_string(locID.at(x)) + ") in " + val.first);
310  pass = false;
311  }
312  }
313  }
 // NOTE(review): numVars is not used in the code below.
314  int numVars = dataMap.size();
315  /* cout << "numVars= " << numVars << "\n"; */
316 
 // Copy the collected series into Z, one value array and one date array per
 // row index. map::at throws std::out_of_range if row index x has no entry
 // for the variable.
317  for ( const auto &val: dataMap )
318  {
319  /* cout << "storing " << val.first << "\n"; */
320  int numSamples = val.second.size();
321  /* cout << "numSamples= " << numSamples << "\n"; */
322  /* cout << "dataMap[val.first].size(): " << dataMap[val.first].size() << "\n"; */
323  /* cout << "x: "; */
324  for(int x = 0; x < numSamples; ++x)
325  {
326  /* cout << x << ","; */
327  ArrayXf arr1 = Map<ArrayXf>(dataMap.at(val.first).at(x).first.data(),
328  dataMap.at(val.first).at(x).first.size());
329  ArrayXf arr2 = Map<ArrayXf>(dataMap.at(val.first).at(x).second.data(),
330  dataMap.at(val.first).at(x).second.size());
331  Z[val.first].first.push_back(arr1);
332  Z[val.first].second.push_back(arr2);
333  }
334  /* cout << "\n"; */
335  }
336 }
337 } // Util
338 } // FT
339 
#define THROW_LENGTH_ERROR(err)
Definition: error.h:32
#define THROW_RUNTIME_ERROR(err)
Definition: error.h:30
#define THROW_INVALID_ARGUMENT(err)
Definition: error.h:31
std::string trim(std::string str, const std::string &chars)
Definition: utils.cc:43
void load_longitudinal(const std::string &path, std::map< string, std::pair< vector< ArrayXf >, vector< ArrayXf > > > &Z, char sep)
load longitudinal csv file into matrix.
Definition: io.cc:100
string PBSTR
Definition: utils.cc:14
void load_partial_longitudinal(const std::string &path, std::map< string, std::pair< vector< ArrayXf >, vector< ArrayXf > > > &Z, char sep, const vector< int > &idx)
load partial longitudinal csv file into matrix according to idx vector
Definition: io.cc:175
vector< char > find_dtypes(const MatrixXf &X)
determines data types of columns of matrix X.
Definition: utils.cc:49
int PBWIDTH
Definition: utils.cc:15
void printProgress(float percentage)
outputs a progress bar, filled according to the fraction of work completed (percentage).
Definition: io.cc:15
std::string to_string(const T &value)
template function to convert objects to string for logging
Definition: utils.h:422
void load_csv(const std::string &path, MatrixXf &X, VectorXf &y, vector< string > &names, vector< char > &dtypes, bool &binary_endpoint, char sep)
load csv file into matrix.
Definition: io.cc:27
main Feat namespace
Definition: data.cc:13
int i
Definition: params.cc:552