Darwin  1.10(beta)
drwnStatsUtils.h
Go to the documentation of this file.
1 /*****************************************************************************
2 ** DARWIN: A FRAMEWORK FOR MACHINE LEARNING RESEARCH AND DEVELOPMENT
3 ** Distributed under the terms of the BSD license (see the LICENSE file)
4 ** Copyright (c) 2007-2017, Stephen Gould
5 ** All rights reserved.
6 **
7 ******************************************************************************
8 ** FILENAME: drwnStatsUtils.h
9 ** AUTHOR(S): Stephen Gould <stephen.gould@anu.edu.au>
10 **
11 *****************************************************************************/
12 
19 #pragma once
20 
21 #include <cassert>
22 #include <vector>
23 #include <set>
24 #include <map>
25 #include <algorithm>
26 #include <limits>
27 #include <math.h>
28 
29 #include "Eigen/Core"
30 using namespace Eigen;
31 
32 #if defined(_WIN32)||defined(WIN32)||defined(__WIN32__)
33 #undef min
34 #undef max
35 #endif
36 
37 using namespace std;
38 
41 void drwnInitializeRand();
42 
43 namespace drwn {
44 
46  template <typename T>
47  inline T minElem(const vector<T>& v);
48 
50  template <typename T>
51  inline T maxElem(const vector<T>& v);
52 
54  template <typename T>
55  T mean(const vector<T>& v);
56 
58  template <typename T>
59  T median(const vector<T>& v);
60 
63  template <typename T>
64  T destructive_median(vector<T>& w);
65 
67  template <typename T>
68  T mode(const vector<T>& v);
69 
71  template <typename T>
72  T variance(const vector<T>& v);
73 
75  template <typename T>
76  T stdev(const vector<T>& v);
77 
79  template <typename T>
80  int argmin(const vector<T>& v);
81 
83  int argmin(const VectorXd &v);
84 
86  template <typename T>
87  vector<int> argmins(const vector<vector<T> >& v);
88 
90  template <typename T>
91  int argmax(const vector<T>& v);
92 
94  int argmax(const VectorXd &v);
95 
97  template <typename T>
98  vector<int> argmaxs(const vector<vector<T> >& v);
99 
102  int argrand(const vector<double>& v);
105  int argrand(const VectorXd &v);
106 
108  template <typename T>
109  T excessKurtosis(const vector<T> &v);
110 
111  template <typename T>
112  vector<float> percentiles(const vector<T> & v);
113 
115  template <typename T>
116  pair<T, T> range(const vector<T>& v);
117 
119  template <typename T>
120  pair<T, T> range(const vector<vector<T> >& v);
121 
123  template <typename T>
124  vector<T> extractSubVector(const vector<T>& v, const vector<int>& indx);
125 
127  template <typename T>
128  vector<T> removeOutliers(const vector<T>& v,
129  const vector<double>& scores, int keepSize);
130 
132  template <typename T>
133  set<set<T> > powerset(const set<T>& s);
134 
136  int roundUp(int n, int d);
137 
139  bool containsInvalidEntries(const vector<double> &v);
140 
142  double logistic(const vector<double>& theta, const vector<double>& data);
144  double logistic(const double *theta, const double *data, int n);
145 
147  double entropy(const std::vector<double>& p);
149  double entropy(const std::vector<int>& counts);
150 
152  double gini(const std::vector<double>& p);
154  double gini(const std::vector<int>& p);
155 
157  double expAndNormalize(std::vector<double>& v);
159  double expAndNormalize(VectorXd& v);
160 
162  inline double fastexp(double x);
163 
165  vector<int> randomPermutation(int n);
166 
168  template <typename T>
169  void shuffle(vector<T>& v);
170 
172  template <typename T>
173  vector<T> subSample(const vector<T>& v, size_t n);
174 
176  vector<double> linSpaceVector(double startValue, double endValue, unsigned n = 10);
178  vector<double> logSpaceVector(double startValue, double endValue, unsigned n = 10);
179 
183  void predecessor(std::vector<int>& array, int limit);
187  void successor(std::vector<int>& array, int limit);
191  void predecessor(std::vector<int>& array, const std::vector<int>& limits);
195  void successor(std::vector<int>& array, const std::vector<int>& limits);
196 
199  inline double huberFunction(double x, double m = 1.0);
201  inline double huberDerivative(double x, double m = 1.0);
203  inline double huberFunctionAndDerivative(double x, double *df, double m = 1.0);
204 
207  double bhattacharyyaDistance(std::vector<double>& p, std::vector<double>& q);
210  double euclideanDistanceSq(std::vector<double>& p, std::vector<double>& q);
211 
213  double sum(const vector<double> &v);
215  double sum(const double *v, size_t length);
216 
218  double dot(const double *x, const double *y, size_t length);
220  double dot(const vector<double>& x, const vector<double>& y);
221 
223  bool eq(const double x, const double y);
225  bool lt(const double x, const double y);
226 };
227 
228 // Implementation -----------------------------------------------------------
229 
230 template <typename T>
231 T drwn::minElem(const vector<T> & v)
232 {
233  switch (v.size()) {
234  case 0: DRWN_LOG_FATAL("invalid size"); break;
235  case 1: return v.front(); break;
236  case 2: return std::min(v.front(), v.back()); break;
237  }
238 
239  T minObj(v.front());
240  for (typename vector<T>::const_iterator i = v.begin() + 1; i != v.end(); ++i) {
241  minObj = std::min(minObj, *i);
242  }
243 
244  return minObj;
245 }
246 
247 template <typename T>
248 T drwn::maxElem(const vector<T> & v)
249 {
250  switch (v.size()) {
251  case 0: DRWN_LOG_FATAL("invalid size"); break;
252  case 1: return v.front(); break;
253  case 2: return std::max(v.front(), v.back()); break;
254  }
255 
256  T maxObj(v.front());
257  for (typename vector<T>::const_iterator i = v.begin() + 1; i != v.end(); ++i) {
258  maxObj = std::max(maxObj, *i);
259  }
260 
261  return maxObj;
262 }
263 
264 template <typename T>
265 T drwn::mean(const vector<T>& v)
266 {
267  DRWN_ASSERT(v.size() > 0);
268 
269  T sum(0);
270 
271  for (typename vector<T>::const_iterator i = v.begin(); i != v.end(); ++i) {
272  sum += *i;
273  }
274 
275  return sum / T(v.size());
276 }
277 
278 template <typename T>
279 T drwn::median(const vector<T>& v)
280 {
281  DRWN_ASSERT(v.size() > 0);
282 
283  vector<T> w(v);
284  if (w.size() % 2 == 1) {
285  int ix = w.size() / 2;
286  nth_element(w.begin(), w.begin()+ix, w.end());
287  return w[ix];
288  } else {
289  // Get superior and inferior middle elements.
290  int ix_sup = w.size()/2;
291  nth_element(w.begin(), w.begin() + ix_sup, w.end());
292  nth_element(w.begin(), w.begin() + ix_sup - 1, w.begin()+ ix_sup);
293  return T(0.5 * ( w[ix_sup] + w[ix_sup-1] ));
294  }
295 }
296 
297 template <typename T>
299 {
300  DRWN_ASSERT(w.size() > 0);
301  if (w.size() % 2 == 1) {
302  int ix = w.size() / 2;
303  nth_element(w.begin(), w.begin()+ix, w.end());
304  return w[ix];
305  } else {
306  // Get superior and inferior middle elements.
307  int ix_sup = w.size()/2;
308  nth_element(w.begin(), w.begin() + ix_sup, w.end());
309  nth_element(w.begin(), w.begin() + ix_sup - 1, w.begin()+ ix_sup);
310  return T(0.5 * ( w[ix_sup] + w[ix_sup-1] ));
311  }
312 }
313 
314 template <typename T>
315 T drwn::mode (const vector<T>& v)
316 {
317  DRWN_ASSERT(v.size() > 0);
318  map<T, int> w;
319  int maxCount = -1;
320  typename vector<T>::const_iterator modeElement = v.begin();
321  for (typename vector<T>::const_iterator it = v.begin(); it != v.end(); it++) {
322  typename map<T, int>::iterator jt = w.find(*it);
323  if (jt == w.end()) {
324  jt = w.insert(w.end(), make_pair(*it, 0));
325  } else {
326  jt->second += 1;
327  }
328 
329  if (jt->second > maxCount) {
330  modeElement = it;
331  }
332  }
333 
334  return *modeElement;
335 }
336 
337 template <typename T>
338 T drwn::variance(const vector<T> & v)
339 {
340  DRWN_ASSERT(v.size() > 0);
341 
342  T mu = mean(v);
343  T sum(0);
344 
345  for (typename vector<T>::const_iterator i = v.begin(), last = v.end(); i != last; ++i) {
346  double dev = *i - mu;
347  sum += dev * dev;
348  }
349 
350  return sum / T(v.size());
351 }
352 
353 template <typename T>
354 T drwn::stdev(const vector<T> &v)
355 {
356  T std2 = variance(v);
357  return (std2 > 0.0 ? sqrt(std2) : 0.0);
358 }
359 
360 template <typename T>
361 int drwn::argmin(const vector<T> & v)
362 {
363  int minIndx;
364 
365  switch (v.size()) {
366  case 0: minIndx = -1; break;
367  case 1: minIndx = 0; break;
368  case 2: minIndx = (v[0] <= v[1]) ? 0 : 1; break;
369  default:
370  {
371  minIndx = 0;
372  for (int i = 1; i < (int)v.size(); i++) {
373  if (v[i] < v[minIndx]) {
374  minIndx = i;
375  }
376  }
377  }
378  }
379 
380  return minIndx;
381 }
382 
383 template <typename T>
384 vector<int> drwn::argmins(const vector<vector<T> >& v)
385 {
386  vector<int> minIndx(v.size(), -1);
387  for (int i = 0; i < (int)v.size(); i++) {
388  minIndx[i] = argmin(v[i]);
389  }
390 
391  return minIndx;
392 }
393 
394 template <typename T>
395 int drwn::argmax(const vector<T> & v)
396 {
397  int maxIndx;
398 
399  switch (v.size()) {
400  case 0: maxIndx = -1; break;
401  case 1: maxIndx = 0; break;
402  case 2: maxIndx = (v[0] >= v[1]) ? 0 : 1; break;
403  default:
404  {
405  maxIndx = 0;
406  for (int i = 1; i < (int)v.size(); i++) {
407  if (v[i] > v[maxIndx]) {
408  maxIndx = i;
409  }
410  }
411  }
412  }
413 
414  return maxIndx;
415 }
416 
417 template <typename T>
418 vector<int> drwn::argmaxs(const vector<vector<T> >& v)
419 {
420  vector<int> maxIndx(v.size(), -1);
421  for (int i = 0; i < (int)v.size(); i++) {
422  maxIndx[i] = argmax(v[i]);
423  }
424 
425  return maxIndx;
426 }
427 
428 template <typename T>
429 T drwn::excessKurtosis(const vector<T> & v)
430 {
431  DRWN_ASSERT(!v.empty());
432 
433  T mu = mean(v);
434  T sigma_squared = variance(v);
435 
436  T sum(0);
437  for (typename vector<T>::const_iterator i = v.begin(), last = v.end(); i != last; ++i) {
438  double dev = *i - mu;
439  double sqDev = dev * dev;
440  sum += sqDev * sqDev;
441  }
442 
443  return sum / ( T(v.size() * sigma_squared * sigma_squared)) - 3.0;
444 }
445 
446 template <typename T>
447 vector<float> drwn::percentiles(const vector<T> &v)
448 {
450  vector<float> rval;
451  for (int i = 0; i < v.size(); i++) {
452  int sum = 0;
453  for (int j = 0; j < v.size(); j++) {
454  if (v[j] < v[i])
455  sum++;
456  }
457  rval.push_back(float(sum)/float(v.size()));
458  }
459  return rval;
460 }
461 
462 template <typename T>
463 pair<T, T> drwn::range(const vector<T>& v)
464 {
465  DRWN_ASSERT(v.size() > 0);
466 
467  typename vector<T>::const_iterator minObj(v.begin());
468  typename vector<T>::const_iterator maxObj(v.begin());
469  for (typename vector<T>::const_iterator i = v.begin() + 1;
470  i != v.end(); ++i) {
471  if (*i < *minObj) minObj = i;
472  if (*i > *maxObj) maxObj = i;
473  }
474 
475  return make_pair(*minObj, *maxObj);
476 }
477 
478 template <typename T>
479 pair<T, T> drwn::range(const vector<vector<T> >& v)
480 {
481  DRWN_ASSERT(v.size() > 0);
482 
483  pair<T, T> r = range(*v.begin());
484  for (typename vector<vector<T> >::const_iterator i = v.begin() + 1;
485  i != v.end(); ++i) {
486  pair<T, T> ri = range(*i);
487  if (ri.first < r.first)
488  r.first = ri.first;
489  if (ri.second > r.second)
490  r.second = ri.second;
491  }
492 
493  return r;
494 }
495 
496 template <typename T>
497 vector<T> drwn::extractSubVector(const vector<T>& v, const vector<int>& indx)
498 {
499  vector<T> w;
500 
501  w.reserve(indx.size());
502  for (vector<int>::const_iterator it = indx.begin(); it != indx.end(); ++it) {
503  w.push_back(v[*it]);
504  }
505 
506  return w;
507 }
508 
509 template <typename T>
510 vector<T> drwn::removeOutliers(const vector<T>& v,
511  const vector<double>& scores, int keepSize)
512 {
513  DRWN_ASSERT(scores.size() == v.size());
514  if (keepSize >= (int)v.size()) {
515  return v;
516  }
517 
518  // sort scores
519  vector<pair<double, int> > indx(v.size());
520  for (unsigned i = 0; i < v.size(); i++) {
521  indx[i] = make_pair(scores[i], i);
522  }
523  sort(indx.begin(), indx.end());
524 
525  vector<T> w(keepSize);
526  unsigned startIndx = (v.size() - keepSize) / 2;
527  unsigned endIndx = startIndx + keepSize;
528  for (unsigned i = startIndx; i < endIndx; i++) {
529  w[i - startIndx] = v[indx[i].second];
530  }
531 
532  return w;
533 }
534 
535 template <typename T>
536 set<set<T> > drwn::powerset(const set<T>& s)
537 {
538  set<set<T> > result;
539 
540  if (s.empty()) {
541  result.insert(set<T>());
542  } else {
543  for (typename set<T>::const_iterator it = s.begin(); it != s.end(); ++it) {
544  T elem = *it;
545 
546  // copy the original set, and delete one element from it
547  set<T> smallS(s);
548  smallS.erase(elem);
549 
550  // compute the power set of this smaller set
551  set<set<T> > smallP = powerset(smallS);
552  result.insert(smallP.begin(), smallP.end());
553 
554  // add the deleted element to each member of this power set,
555  // and insert each new set
556  for (typename set<set<T> >::const_iterator jt = smallP.begin();
557  jt != smallP.end(); ++jt) {
558  set<T> next = *jt;
559  next.insert(elem);
560  result.insert(next);
561  }
562  }
563  }
564 
565  return result;
566 }
567 
568 // fastexp -------------------------------------------------------------------
569 // Nicol N. Schraudolph, "A Fast, Compact Approximation of the Exponential Function",
570 // in Neural Computation, 11,853-862 (1999).
571 
572 #define EXP_A (1048576.0 / M_LN2)
573 #define EXP_C 60801
574 
575 #ifndef M_LN2
576 #define M_LN2 0.69314718055994530942
577 #endif
578 
579 inline double drwn::fastexp(double y)
580 {
581  if (y < -700.0) return 0.0;
582 
583  union
584  {
585  double d;
586 #ifdef LITTLE_ENDIAN
587  struct { int j, i; } n;
588 #else
589  struct { int i, j; } n;
590 #endif
591  } _eco;
592  _eco.n.i = (int)(EXP_A * (y)) + (1072693248 - EXP_C);
593  _eco.n.j = 0;
594  return _eco.d;
595 }
596 
597 template <typename T>
598 void drwn::shuffle(vector<T>& v)
599 {
600  const size_t n = v.size();
601  if (n < 2) return;
602  for (size_t i = 0; i < n - 1; i++) {
603  size_t j = rand() % (n - i);
604  std::swap(v[i], v[i + j]);
605  }
606 }
607 
608 template <typename T>
609 vector<T> drwn::subSample(const vector<T>& v, size_t n)
610 {
611  if (n >= v.size()) return v;
612  if (n == 0) return vector<T>();
613 
614  // make a copy of v
615  vector<T> w(v);
616 
617  // randomly swap entries
618  for (size_t i = 0; i < n; i++) {
619  size_t j = rand() % (w.size() - i);
620  std::swap(w[i], w[i + j]);
621  }
622 
623  // resize w to only keep first n and return
624  w.resize(n);
625  return w;
626 }
627 
628 inline double drwn::huberFunction(double x, double m)
629 {
630  if (x < -m) return (m * (-2.0 * x - m));
631  if (x > m) return (m * (2.0 * x - m));
632 
633  return x * x;
634 }
635 
636 inline double drwn::huberDerivative(double x, double m)
637 {
638  if (x < -m) return -2.0 * m;
639  if (x > m) return 2.0 * m;
640 
641  return 2.0 * x;
642 }
643 
644 inline double drwn::huberFunctionAndDerivative(double x, double *df, double m)
645 {
646  if (x < -m) {
647  *df = -2.0 * m;
648  return (m * (-2.0 * x - m));
649  } else if (x > m) {
650  *df = 2.0 * m;
651  return (m * (2.0 * x - m));
652  } else {
653  *df = 2.0 * x;
654  return x * x;
655  }
656 }
657 
658 inline int drwn::roundUp(int n, int d) {
659  return (n % d == 0) ? n : n + d - (n % d);
660 }
double logistic(const vector< double > &theta, const vector< double > &data)
logistic function
Definition: drwnStatsUtils.cpp:61
vector< T > removeOutliers(const vector< T > &v, const vector< double > &scores, int keepSize)
removes (v.size() - keepSize)/2 minimum and maximum entries
Definition: drwnStatsUtils.h:510
int argmin(const VectorXd &v)
returns the index of the smallest element in a vector of objects
Definition: drwnStatsUtils.cpp:399
T stdev(const vector< T > &v)
returns the standard deviation of all elements in a vector of objects
Definition: drwnStatsUtils.h:354
pair< T, T > range(const vector< T > &v)
returns the minimum and maximum values in a vector of objects
Definition: drwnStatsUtils.h:463
vector< T > extractSubVector(const vector< T > &v, const vector< int > &indx)
select an ordered subvector from a vector
Definition: drwnStatsUtils.h:497
T destructive_median(vector< T > &w)
returns the median element in a vector of objects (but may modify the vector&#39;s contents) ...
Definition: drwnStatsUtils.h:298
T mean(const vector< T > &v)
returns the mean of all elements in a vector of objects
Definition: drwnStatsUtils.h:265
vector< int > argmaxs(const vector< vector< T > > &v)
returns the index for the largest element in each of vector of vector of objects
Definition: drwnStatsUtils.h:418
set< set< T > > powerset(const set< T > &s)
generate powerset of a set
Definition: drwnStatsUtils.h:536
pair< T, T > range(const vector< vector< T > > &v)
returns the minimum and maximum values in a vector of vector of objects
Definition: drwnStatsUtils.h:479
Definition: acknowledgments.dox:1
vector< float > percentiles(const vector< T > &v)
Definition: drwnStatsUtils.h:447
int argmax(const VectorXd &v)
returns the index of the largest element in a vector of objects
Definition: drwnStatsUtils.cpp:413
vector< double > linSpaceVector(double startValue, double endValue, unsigned n=10)
generate a vector of linearly-spaced values from startValue to endValue
Definition: drwnStatsUtils.cpp:209
T excessKurtosis(const vector< T > &v)
returns the kurtosis for a vector of objects
Definition: drwnStatsUtils.h:429
int argmax(const vector< T > &v)
returns the index of the largest element in a vector of objects
Definition: drwnStatsUtils.h:395
double euclideanDistanceSq(std::vector< double > &p, std::vector< double > &q)
Computes the Euclidean norm between two discrete probability distributions. The distributions do not ...
Definition: drwnStatsUtils.cpp:320
double huberFunction(double x, double m=1.0)
huber penalty function, for and otherwise
Definition: drwnStatsUtils.h:628
double expAndNormalize(std::vector< double > &v)
exponentiates and normalizes a vector in-place; returns log of the normalization constant ...
Definition: drwnStatsUtils.cpp:159
T variance(const vector< T > &v)
returns the variance (second moment about the mean) of all elements in a vector of objects ...
Definition: drwnStatsUtils.h:338
vector< int > argmins(const vector< vector< T > > &v)
returns the index for the smallest element in each of vector of vector of objects ...
Definition: drwnStatsUtils.h:384
vector< double > logSpaceVector(double startValue, double endValue, unsigned n=10)
generate a vector of logarithmically-spaced values from startValue to endValue
Definition: drwnStatsUtils.cpp:227
double dot(const double *x, const double *y, size_t length)
dot product between elements in two vectors
Definition: drwnStatsUtils.cpp:346
void successor(std::vector< int > &array, int limit)
Computes the successor of a discrete vector, for example, successor([1 0 0], 2) produces [0 1 0]...
Definition: drwnStatsUtils.cpp:260
T minElem(const vector< T > &v)
returns the minimum element in a vector of objects
Definition: drwnStatsUtils.h:231
vector< T > subSample(const vector< T > &v, size_t n)
extract a subsample from a vector of size n
Definition: drwnStatsUtils.h:609
vector< int > randomPermutation(int n)
compute a random permutation of the numbers [0..n-1]
Definition: drwnStatsUtils.cpp:188
double sum(const vector< double > &v)
sum the elements in a vector
Definition: drwnStatsUtils.cpp:330
void predecessor(std::vector< int > &array, int limit)
Computes the predecessor of a discrete vector, for example, predecessor([1 0 0], 2) produces [0 0 0]...
Definition: drwnStatsUtils.cpp:246
bool eq(const double x, const double y)
whether two numbers are equal
Definition: drwnStatsUtils.cpp:447
double huberFunctionAndDerivative(double x, double *df, double m=1.0)
huber penalty function and derivative at x
Definition: drwnStatsUtils.h:644
double bhattacharyyaDistance(std::vector< double > &p, std::vector< double > &q)
Computes the Bhattacharyya distance between two discrete probability distributions. The distributions do not need to be normalized.
Definition: drwnStatsUtils.cpp:302
double fastexp(double x)
fast exponentiation
Definition: drwnStatsUtils.h:579
double huberDerivative(double x, double m=1.0)
derivative of huberFunction at x
Definition: drwnStatsUtils.h:636
void shuffle(vector< T > &v)
randomly permutes the entries of a vector inline
Definition: drwnStatsUtils.h:598
bool lt(const double x, const double y)
whether first number is less than second number
Definition: drwnStatsUtils.cpp:452
T mode(const vector< T > &v)
returns the most frequent element in a vector of objects
Definition: drwnStatsUtils.h:315
T maxElem(const vector< T > &v)
returns the maximum element in a vector of objects
Definition: drwnStatsUtils.h:248
void drwnInitializeRand()
initialize the standard C library random number generator with a time-of-day seed ...
Definition: drwnStatsUtils.cpp:34
int argmin(const vector< T > &v)
returns the index of the smallest element in a vector of objects
Definition: drwnStatsUtils.h:361
bool containsInvalidEntries(const vector< double > &v)
returns true if the vector contains NaN or Inf values
Definition: drwnStatsUtils.cpp:45
T median(const vector< T > &v)
returns the median element in a vector of objects
Definition: drwnStatsUtils.h:279
int roundUp(int n, int d)
rounds (away from zero) to nearest discretization
Definition: drwnStatsUtils.h:658
double entropy(const std::vector< double > &p)
computes the entropy of a possibly unnormalized distribution
Definition: drwnStatsUtils.cpp:86
int argrand(const vector< double > &v)
returns the index for a random element sampled in proportion to the size of the element from a vector...
Definition: drwnStatsUtils.cpp:427
double gini(const std::vector< double > &p)
computes the gini impurity of a possibly unnormalized distribution
Definition: drwnStatsUtils.cpp:115