Darwin  1.10(beta)
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Modules Pages
drwnDataset.h
Go to the documentation of this file.
1 /******************************************************************************
2 ** DARWIN: A FRAMEWORK FOR MACHINE LEARNING RESEARCH AND DEVELOPMENT
3 ** Distributed under the terms of the BSD license (see the LICENSE file)
4 ** Copyright (c) 2007-2017, Stephen Gould
5 ** All rights reserved.
6 **
7 ******************************************************************************
8 ** FILENAME: drwnDataset.h
9 ** AUTHOR(S): Stephen Gould <stephen.gould@anu.edu.au>
10 **
11 *****************************************************************************/
12 
13 #pragma once
14 
15 #include <iostream>
16 #include <fstream>
17 #include <sstream>
18 #include <vector>
19 #include <map>
20 
21 #include "drwnBase.h"
22 
23 using namespace std;
24 
26 
27 // drwnDataset --------------------------------------------------------------
41 
//! Dataset of feature vectors with one target per sample and, optionally,
//! one weight and one external index per sample. Datasets can be written
//! to and read from a binary file.
//!
//! \tparam XType type of a single feature value
//! \tparam YType type of a target (label)
//! \tparam WType type of a sample weight
template <typename XType, typename YType, typename WType>
class drwnDataset {
 public:
    std::vector<std::vector<XType> > features; //!< feature vectors (one per sample)
    std::vector<YType> targets;                //!< target labels (one per sample)
    std::vector<WType> weights;                //!< sample weights (optional, empty if unused)
    std::vector<int> indexes;                  //!< external indices (optional, empty if unused)

 public:
    //! default constructor (creates an empty dataset)
    drwnDataset();
    //! copy constructor
    drwnDataset(const drwnDataset<XType, YType, WType>& d);
    //! constructs a dataset by reading \p filename
    drwnDataset(const char *filename);
    ~drwnDataset();

    // dataset properties
    //! returns true if the dataset holds no feature vectors
    inline bool empty() const { return features.empty(); }
    //! returns the number of samples (feature vectors) in the dataset
    inline int size() const { return (int)features.size(); }
    //! returns true if the dataset contains sample weights
    inline bool hasWeights() const { return !weights.empty(); }
    //! returns true if the dataset contains external indices
    inline bool hasIndexes() const { return !indexes.empty(); }
    //! returns true if all feature vectors have the same length and the
    //! targets, weights (if any) and indexes (if any) match in number
    inline bool valid() const;

    //! returns the number of samples whose target equals \p label
    int count(const YType& label) const;
    //! pre-allocates storage for \p reserveSize samples
    void reserve(int reserveSize);
    //! returns the length of the first feature vector (0 if empty)
    inline int numFeatures() const;
    //! returns the smallest target value ((YType)0 if there are no targets)
    inline YType minTarget() const;
    //! returns the largest target value ((YType)0 if there are no targets)
    inline YType maxTarget() const;

    // stored dataset properties (without loading)
    //! returns the number of samples stored in \p filename
    static int size(const char *filename);
    //! returns the number of features per sample stored in \p filename
    static int numFeatures(const char *filename);
    //! returns true if the dataset stored in \p filename has weights
    static bool hasWeights(const char *filename);
    //! returns true if the dataset stored in \p filename has indexes
    static bool hasIndexes(const char *filename);

    // i/o
    //! removes all samples from the dataset
    void clear();
    //! writes the whole dataset to \p filename (optionally appending to an
    //! existing file) and returns the number of records in the file
    int write(const char *filename, bool bAppend = false) const;
    //! writes samples \p startIndx to \p endIndx (inclusive) to \p filename
    int write(const char *filename, int startIndx, int endIndx, bool bAppend = false) const;
    //! reads all records from \p filename (optionally appending to the
    //! current samples) and returns the resulting dataset size
    int read(const char *filename, bool bAppend = false);
    //! reads records \p startIndx to \p endIndx (inclusive) from \p filename
    int read(const char *filename, int startIndx, int endIndx, bool bAppend = false);

    // modification
    //! appends all samples of \p d and returns the new dataset size
    int append(const drwnDataset<XType, YType, WType>& d);
    //! appends an unweighted sample and returns the new dataset size
    int append(const std::vector<XType>& x, const YType& y);
    //! appends a weighted sample and returns the new dataset size
    int append(const std::vector<XType>& x, const YType& y, const WType& w);
    //! appends a weighted sample with an external index
    int append(const std::vector<XType>& x, const YType& y, const WType& w, int indx);

    //! randomly keeps about one in \p sampleRate samples; if \p bBalanced
    //! is true the number of samples kept per target value is capped at
    //! the size of the largest target group divided by \p sampleRate
    //! (rounded up). Returns the number of samples kept.
    int subSample(int sampleRate, bool bBalanced = false);
};
120 
121 // standard datasets --------------------------------------------------------
122 
127 
128 // implementation -----------------------------------------------------------
129 
130 template <typename XType, typename YType, typename WType>
132 {
133  // do nothing
134 }
135 
136 template <typename XType, typename YType, typename WType>
138  features(d.features), targets(d.targets), weights(d.weights), indexes(d.indexes) {
139  // do nothing
140 }
141 
142 template <typename XType, typename YType, typename WType>
144 {
145  read(filename);
146 }
147 
148 template <typename XType, typename YType, typename WType>
150 {
151  // do nothing
152 }
153 
154 template <typename XType, typename YType, typename WType>
156 {
157  size_t nFeatures = (size_t)numFeatures();
158  for (typename vector<vector<XType> >::const_iterator it = features.begin();
159  it != features.end(); it++) {
160  if (it->size() != nFeatures) return false;
161  }
162 
163  return (features.size() == targets.size()) &&
164  (weights.empty() || (weights.size() == targets.size())) &&
165  (indexes.empty() || (indexes.size() == targets.size()));
166 }
167 
168 template <typename XType, typename YType, typename WType>
169 int drwnDataset<XType, YType, WType>::count(const YType& label) const
170 {
171  int c = 0;
172  for (typename vector<YType>::const_iterator it = targets.begin();
173  it != targets.end(); it++) {
174  if (*it == label)
175  c += 1;
176  }
177 
178  return c;
179 }
180 
181 template <typename XType, typename YType, typename WType>
183 {
184  features.reserve(reserveSize);
185  targets.reserve(reserveSize);
186  weights.reserve(reserveSize);
187  indexes.reserve(reserveSize);
188 }
189 
190 template <typename XType, typename YType, typename WType>
192 {
193  return features.empty() ? 0 : (int)features[0].size();
194 }
195 
196 template <typename XType, typename YType, typename WType>
198 {
199  return targets.empty() ? (YType)0 :
200  *std::min_element(targets.begin(), targets.end());
201 }
202 
203 template <typename XType, typename YType, typename WType>
205 {
206  return targets.empty() ? (YType)0 :
207  *std::max_element(targets.begin(), targets.end());
208 }
209 
210 // stored dataset properties (without loading)
211 template <typename XType, typename YType, typename WType>
212 int drwnDataset<XType, YType, WType>::size(const char *filename)
213 {
214  DRWN_ASSERT(filename != NULL);
215 
216  // open file and read header
217  ifstream ifs(filename, ifstream::in | ifstream::binary);
218  if (ifs.fail()) {
219  DRWN_LOG_ERROR("could not find file " << filename);
220  return 0;
221  }
222 
223  if (ifs.eof()) {
224  DRWN_LOG_WARNING("empty file " << filename);
225  return 0;
226  }
227 
228  unsigned flags;
229  ifs.read((char *)&flags, sizeof(unsigned));
230  DRWN_ASSERT_MSG((flags & 0xffff0000) == 0x00010000, "unrecognized file version");
231 
232  int nFeatures;
233  ifs.read((char *)&nFeatures, sizeof(int));
234 
235  ifs.seekg(0, ios::end);
236  int len = (int)ifs.tellg() - 2 * sizeof(int);
237  ifs.close();
238 
239  // determine number of records
240  int bytesPerRecord = sizeof(YType) + nFeatures * sizeof(XType);
241  if ((flags & 0x00000001) == 0x00000001) bytesPerRecord += sizeof(WType);
242  if ((flags & 0x00000002) == 0x00000002) bytesPerRecord += sizeof(int);
243 
244  DRWN_ASSERT_MSG(len % bytesPerRecord == 0, "corrupt file " << filename
245  << " (len: " << len << ", bytes/record = " << bytesPerRecord << ")");
246  return (int)(len / bytesPerRecord);
247 }
248 
249 template <typename XType, typename YType, typename WType>
251 {
252  DRWN_ASSERT(filename != NULL);
253 
254  // open file and read header
255  ifstream ifs(filename, ifstream::in | ifstream::binary);
256  if (ifs.fail()) {
257  DRWN_LOG_ERROR("could not open file " << filename);
258  return 0;
259  }
260 
261  if (ifs.eof()) {
262  DRWN_LOG_WARNING("empty file " << filename);
263  return 0;
264  }
265 
266  unsigned flags;
267  ifs.read((char *)&flags, sizeof(unsigned));
268  DRWN_ASSERT_MSG((flags & 0xffff0000) == 0x00010000, "unrecognized file version");
269 
270  int nFeatures;
271  ifs.read((char *)&nFeatures, sizeof(int));
272  ifs.close();
273 
274  return nFeatures;
275 }
276 
277 template <typename XType, typename YType, typename WType>
279 {
280  DRWN_ASSERT(filename != NULL);
281 
282  // open file and read header
283  ifstream ifs(filename, ifstream::in | ifstream::binary);
284  if (ifs.fail()) {
285  DRWN_LOG_ERROR("could not open file " << filename);
286  return false;
287  }
288 
289  if (ifs.eof()) {
290  DRWN_LOG_WARNING("empty file " << filename);
291  return false;
292  }
293 
294  unsigned flags;
295  ifs.read((char *)&flags, sizeof(unsigned));
296  DRWN_ASSERT_MSG((flags & 0xffff0000) == 0x00010000, "unrecognized file version");
297 
298  return ((flags & 0x00000001) == 0x00000001);
299 }
300 
301 template <typename XType, typename YType, typename WType>
303 {
304  DRWN_ASSERT(filename != NULL);
305 
306  // open file and read header
307  ifstream ifs(filename, ifstream::in | ifstream::binary);
308  if (ifs.fail()) {
309  DRWN_LOG_ERROR("could not open file " << filename);
310  return false;
311  }
312 
313  if (ifs.eof()) {
314  DRWN_LOG_WARNING("empty file " << filename);
315  return false;
316  }
317 
318  unsigned flags;
319  ifs.read((char *)&flags, sizeof(unsigned));
320  DRWN_ASSERT_MSG((flags & 0xffff0000) == 0x00010000, "unrecognized file version");
321 
322  return ((flags & 0x00000002) == 0x00000002);
323 }
324 
325 // i/o
326 
327 template <typename XType, typename YType, typename WType>
329 {
330  features.clear();
331  targets.clear();
332  weights.clear();
333  indexes.clear();
334 }
335 
336 template <typename XType, typename YType, typename WType>
337 int drwnDataset<XType, YType, WType>::write(const char *filename, bool bAppend) const
338 {
339  if (this->empty()) return 0;
340  return write(filename, 0, this->size() - 1, bAppend);
341 }
342 
343 template <typename XType, typename YType, typename WType>
344 int drwnDataset<XType, YType, WType>::write(const char *filename, int startIndx, int endIndx, bool bAppend) const
345 {
346  DRWN_ASSERT(filename != NULL);
347  DRWN_ASSERT(this->valid());
348  DRWN_ASSERT_MSG((startIndx >= 0) && (endIndx < this->size()) && (startIndx <= endIndx),
349  "startIndx = " << startIndx << ", endIndx = " << endIndx << ", size() = " << this->size());
350 
351  // open file
352  unsigned flags = 0x00010000;
353  if (hasWeights()) flags |= 0x00000001;
354  if (hasIndexes()) flags |= 0x00000002;
355 
356  int nFeatures = numFeatures();
357  fstream ofs;
358  if (bAppend && drwnFileExists(filename)) {
359  unsigned fileFlags;
360  int fileNumFeatures;
361 
362  ofs.open(filename, ios::in | ios::out | ios::binary);
363  ofs.seekg(0, ios::beg);
364  ofs.read((char *)&fileFlags, sizeof(unsigned));
365  DRWN_ASSERT(fileFlags == flags);
366  ofs.read((char *)&fileNumFeatures, sizeof(int));
367  DRWN_ASSERT(fileNumFeatures == nFeatures);
368 
369  ofs.seekp(0, ios::end);
370  } else {
371  ofs.open(filename, ios::out | ios::binary);
372  ofs.write((char *)&flags, sizeof(unsigned));
373  ofs.write((char *)&nFeatures, sizeof(int));
374  }
375  DRWN_ASSERT_MSG(!ofs.fail(), filename);
376 
377  // write data
378  for (int i = startIndx; i <= endIndx ; i++) {
379  ofs.write((char *)&targets[i], sizeof(YType));
380  ofs.write((char *)&features[i][0], nFeatures * sizeof(XType));
381  if (!weights.empty()) {
382  ofs.write((char *)&weights[i], sizeof(WType));
383  }
384  if (!indexes.empty()) {
385  ofs.write((char *)&indexes[i], sizeof(int));
386  }
387  }
388 
389  int len = (int)ofs.tellp() - 2 * sizeof(int);
390  ofs.close();
391 
392  int bytesPerRecord = sizeof(YType) + nFeatures * sizeof(XType);
393  if (hasWeights()) bytesPerRecord += sizeof(WType);
394  if (hasIndexes()) bytesPerRecord += sizeof(int);
395 
396  DRWN_ASSERT_MSG(len % bytesPerRecord == 0, "corrupt file " << filename
397  << " (len: " << len << ", bytes/record = " << bytesPerRecord << ")");
398  return (int)(len / bytesPerRecord);
399 }
400 
401 template <typename XType, typename YType, typename WType>
402 int drwnDataset<XType, YType, WType>::read(const char *filename, bool bAppend)
403 {
404  return read(filename, 0, numeric_limits<int>::max(), bAppend);
405 }
406 
407 template <typename XType, typename YType, typename WType>
408 int drwnDataset<XType, YType, WType>::read(const char *filename, int startIndx, int endIndx, bool bAppend)
409 {
410  DRWN_ASSERT(filename != NULL);
411  DRWN_ASSERT((startIndx >= 0) && (endIndx >= startIndx));
412  if (!bAppend) clear();
413 
414  // open file
415  ifstream ifs(filename, ifstream::in | ifstream::binary);
416  if (ifs.fail()) {
417  DRWN_LOG_ERROR("could not find file " << filename);
418  return size();
419  }
420 
421  if (ifs.eof()) {
422  DRWN_LOG_WARNING("empty file " << filename);
423  return size();
424  }
425 
426  unsigned flags;
427  ifs.read((char *)&flags, sizeof(unsigned));
428  DRWN_ASSERT_MSG((flags & 0xffff0000) == 0x00010000, "unrecognized file version: " << flags);
429  DRWN_ASSERT(empty() || ((flags & 0x00000001) == (hasWeights() ? 0x00000001 : 0x00000000)));
430  DRWN_ASSERT(empty() || ((flags & 0x00000002) == (hasIndexes() ? 0x00000002 : 0x00000000)));
431 
432  int nFeatures;
433  ifs.read((char *)&nFeatures, sizeof(int));
434  DRWN_ASSERT_MSG(empty() || (nFeatures == numFeatures()), nFeatures << " != " << numFeatures());
435 
436  int bytesPerRecord = sizeof(YType) + nFeatures * sizeof(XType);
437  if ((flags & 0x00000001) == 0x00000001) bytesPerRecord += sizeof(WType);
438  if ((flags & 0x00000002) == 0x00000002) bytesPerRecord += sizeof(int);
439 
440  // goto start index
441  ifs.seekg(startIndx * bytesPerRecord, ios::cur);
442  if (ifs.fail()) {
443  ifs.close();
444  DRWN_LOG_WARNING("less than " << startIndx << " record in file " << filename);
445  return size();
446  }
447 
448  // read until end of file or end index
449  YType y;
450  vector<XType> x(nFeatures);
451  WType w;
452  int index;
453 
454  int recordCount = startIndx;
455  while (recordCount <= endIndx) {
456  ifs.read((char *)&y, sizeof(YType));
457  ifs.read((char *)&x[0], nFeatures * sizeof(XType));
458  if ((flags & 0x00000001) == 0x00000001) {
459  ifs.read((char *)&w, sizeof(WType));
460  }
461  if ((flags & 0x00000002) == 0x00000002) {
462  ifs.read((char *)&index, sizeof(int));
463  }
464 
465  if (ifs.fail()) break;
466  targets.push_back(y);
467  features.push_back(x);
468  if ((flags & 0x00000001) == 0x00000001) {
469  weights.push_back(w);
470  }
471  if ((flags & 0x00000002) == 0x00000002) {
472  indexes.push_back(index);
473  }
474 
475  recordCount += 1;
476  }
477 
478  // close file
479  ifs.close();
480 
481  return size();
482 }
483 
484 // modification
485 template <typename XType, typename YType, typename WType>
487 {
488  if (d.empty()) return size();
489  if (empty()) {
490  features = d.features();
491  targets = d.targets();
492  weights = d.weights();
493  indexes = d.indexes();
494  return size();
495  }
496 
497  DRWN_ASSERT(d.numFeatures() == numFeatures());
498  DRWN_ASSERT(d.hasWeights() == hasWeights());
499  DRWN_ASSERT(d.hasIndexes() == hasIndexes());
500 
501  features.insert(features.end(), d.features.begin(), d.features.end());
502  targets.insert(targets.end(), d.targets.begin(), d.targets.end());
503  if (hasWeights()) {
504  weights.insert(weights.end(), d.weights.begin(), d.weights.end());
505  }
506  if (hasIndexes()) {
507  indexes.insert(indexes.end(), d.indexes.begin(), d.indexes.end());
508  }
509 
510  return size();
511 }
512 
513 template <typename XType, typename YType, typename WType>
514 int drwnDataset<XType, YType, WType>::append(const vector<XType>& x, const YType& y)
515 {
516  if (!empty()) {
517  DRWN_ASSERT(!hasWeights() && !hasIndexes());
518  DRWN_ASSERT((int)x.size() == numFeatures());
519  }
520 
521  features.push_back(x);
522  targets.push_back(y);
523 
524  return size();
525 }
526 
527 template <typename XType, typename YType, typename WType>
528 int drwnDataset<XType, YType, WType>::append(const vector<XType>& x, const YType& y, const WType& w)
529 {
530  if (!empty()) {
531  DRWN_ASSERT(hasWeights() && !hasIndexes());
532  DRWN_ASSERT((int)x.size() == numFeatures());
533  }
534 
535  features.push_back(x);
536  targets.push_back(y);
537  weights.push_back(w);
538 
539  return size();
540 }
541 
542 
543 template <typename XType, typename YType, typename WType>
544 int drwnDataset<XType, YType, WType>::append(const vector<XType>& x, const YType& y, const WType& w, int indx)
545 {
546  if (!empty()) {
547  DRWN_ASSERT(hasWeights() && hasIndexes());
548  DRWN_ASSERT((int)x.size() == numFeatures());
549  }
550 
551  features.push_back(x);
552  targets.push_back(y);
553  weights.push_back(w);
554  indexes.push_back(indx);
555 
556  return size();
557 }
558 
559 template <typename XType, typename YType, typename WType>
560 int drwnDataset<XType, YType, WType>::subSample(int sampleRate, bool bBalanced)
561 {
562  DRWN_ASSERT_MSG(sampleRate > 0, "sampleRate must be greater than one");
565 
566  // construct random permutation of indices
567  vector<int> indx = drwn::randomPermutation(features.size());
568 
569  if (bBalanced) {
570  map<YType, vector<int> > stratified;
571  for (size_t i = 0; i < indx.size(); i++) {
572  typename map<YType, vector<int> >::iterator it = stratified.find(targets[indx[i]]);
573  if (it == stratified.end()) {
574  stratified.insert(make_pair(targets[indx[i]], vector<int>(1, indx[i])));
575  } else {
576  it->second.push_back(indx[i]);
577  }
578  }
579 
580  size_t maxSamplesPerTarget = 1;
581  for (typename map<YType, vector<int> >::const_iterator it = stratified.begin();
582  it != stratified.end(); ++it) {
583  maxSamplesPerTarget = std::max(maxSamplesPerTarget, it->second.size());
584  }
585  maxSamplesPerTarget = (maxSamplesPerTarget + sampleRate - 1) / sampleRate;
586 
587  // reconstruct indx vector
588  indx.clear();
589  for (typename map<YType, vector<int> >::iterator it = stratified.begin();
590  it != stratified.end(); ++it) {
591  if (it->second.size() > maxSamplesPerTarget) {
592  it->second.resize(maxSamplesPerTarget);
593  }
594  indx.insert(indx.end(), it->second.begin(), it->second.end());
595  }
596 
597  } else {
598  // resize indx vector to first n samples
599  indx.resize((features.size() + sampleRate - 1) / sampleRate);
600  }
601 
602  // construct new samples according to permutation
603  vector<vector<XType> > nfeatures(indx.size());
604  vector<YType> ntargets(nfeatures.size());
605  vector<WType> nweights(hasWeights() ? nfeatures.size() : 0);
606  vector<int> nindexes(hasIndexes() ? nfeatures.size() : 0);
607 
608  for (size_t i = 0; i < nfeatures.size(); i++) {
609  std::swap(nfeatures[i], features[indx[i]]);
610  std::swap(ntargets[i], targets[indx[i]]);
611  if (!nweights.empty()) {
612  std::swap(nweights[i], weights[indx[i]]);
613  }
614  if (!nindexes.empty()) {
615  nindexes[i] = indexes[indx[i]];
616  }
617  }
618 
619  std::swap(features, nfeatures);
620  std::swap(targets, ntargets);
621  std::swap(weights, nweights);
622  std::swap(indexes, nindexes);
623 
624  return (int)indx.size();
625 }
vector< WType > weights
weights (optional)
Definition: drwnDataset.h:47
int read(const char *filename, bool bAppend=false)
reads a dataset from disk (optionally appending to the current dataset)
Definition: drwnDataset.h:402
YType minTarget() const
returns the minimum target value in the dataset
Definition: drwnDataset.h:197
drwnDataset< double, double, double > drwnRegressionDataset
standard dataset for supervised regression algorithms
Definition: drwnDataset.h:126
int write(const char *filename, bool bAppend=false) const
writes the current dataset to disk (optionally appending to an existing dataset)
Definition: drwnDataset.h:337
void reserve(int reserveSize)
pre-allocate memory for storing samples (feature vectors and targets)
Definition: drwnDataset.h:182
void clear()
clears all data in the dataset
Definition: drwnDataset.h:328
vector< YType > targets
target labels
Definition: drwnDataset.h:46
vector< vector< XType > > features
feature vectors
Definition: drwnDataset.h:45
bool drwnFileExists(const char *filename)
checks if a file exists
Definition: drwnFileUtils.cpp:323
YType maxTarget() const
returns the maximum target value in the dataset
Definition: drwnDataset.h:204
int count(const YType &label) const
returns the number of samples with a given target label
Definition: drwnDataset.h:169
drwnDataset()
default constructor
Definition: drwnDataset.h:131
int append(const drwnDataset< XType, YType, WType > &d)
appends the samples from another dataset to this dataset
Definition: drwnDataset.h:486
vector< int > indexes
external indices (optional)
Definition: drwnDataset.h:48
int size() const
return the number of samples in the dataset
Definition: drwnDataset.h:63
int numFeatures() const
returns the number of features in the feature vector
Definition: drwnDataset.h:191
drwnDataset< double, int, double > drwnClassifierDataset
standard dataset for supervised classification algorithms
Definition: drwnDataset.h:124
bool empty() const
return true if the dataset is empty
Definition: drwnDataset.h:61
int subSample(int sampleRate, bool bBalanced=false)
subsample a dataset (balanced is only valid for discrete target types) if bBalanced is true then samp...
Definition: drwnDataset.h:560
bool valid() const
return true if the dataset is valid (e.g., number of targets equals number of feature vectors) ...
Definition: drwnDataset.h:155
Implements a cacheable dataset containing feature vectors, labels and optional weights.
Definition: drwnDataset.h:43
bool hasIndexes() const
return true if the dataset has external indices associated with each sample
Definition: drwnDataset.h:67
bool hasWeights() const
return true if the dataset contains weighted samples
Definition: drwnDataset.h:65