NPL
Neurological Programs and Libraries
statistics.h
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright 2014 Micah C Chambers (micahc.vt@gmail.com)
3  *
4  * NPL is free software: you can redistribute it and/or modify it under the
5  * terms of the BSD 2-Clause License available in LICENSE or at
6  * http://opensource.org/licenses/BSD-2-Clause
7  *
8  * @file statistics.h Tools for analyzing data, including PCA, ICA and
9  * general linear modeling.
10  *
11  *****************************************************************************/
12 
13 #ifndef STATISTICS_H
14 #define STATISTICS_H
15 
16 #include <Eigen/Dense>
17 #include "npltypes.h"
18 #include "mrimage.h"
19 
20 namespace npl {
21 
34 double sample_var(const Ref<const VectorXd> vec);
35 
52 inline
53 void regressOutLS(VectorXd& signal, const MatrixXd& X, const MatrixXd& covInv)
54 {
55  VectorXd beta = covInv*X.transpose()*signal;
56  signal -= X*beta;
57 }
58 
59 template <typename T>
60 void fillGaussian(Ref<T> m)
61 {
62  static std::random_device rd;
63  static std::default_random_engine rng(rd());
64  std::normal_distribution<double> rdist(0,1);
65 
66  for(size_t cc=0; cc<m.cols(); cc++) {
67  for(size_t rr=0; rr<m.rows(); rr++) {
68  m(rr, cc) = rdist(rng);
69  }
70  }
71 }
72 
73 
74 /******************************************
75  * \defgroup Basic Stats Functions
76  * @{
77  *****************************************/
78 
88 double gaussianPDF(double mean, double sd, double x);
89 
99 double gaussianCDF(double mean, double sd, double x);
100 
122 double gammaPDF_MS(double mean, double sd, double x);
123 
135 double mutualInformation(size_t len, double* a, double* b, size_t mbin);
136 
147 double correlation(size_t len, double* a, double* b);
148 
160 inline
161 double sample_var(int count, double sum, double sumsqr)
162 {
163  return (sumsqr-sum*sum/count)/(count-1);
164 }
165 
178 inline
179 double sample_corr(int count, double sum1, double sum2,
180  double sumsq1, double sumsq2, double s1s2)
181 {
182  return (count*s1s2-sum1*sum2)/
183  sqrt((count*sumsq1-sum1*sum1)*(count*sumsq2-sum2*sum2));
184 }
185 
187 {
191  VectorXd yhat;
192 
196  VectorXd bhat;
197 
201  double ssres;
202 
206  double sigmahat;
207 
211  double rsqr;
212 
217  double adj_rsqr;
218 
222  VectorXd std_err;
223 
227  double dof;
228 
232  VectorXd t;
233 
237  VectorXd p;
238 };
239 
245 {
246 public:
255  StudentsT(int dof = 2, double dt = 0.1, double tmax = 20);
256 
263  void setDOF(double dof);
264 
272  void setStepT(double dt);
273 
282  void setMaxT(double tmax);
283 
291  double cumulative(double t) const;
292 
300  double cdf(double t) const { return cumulative(t); };
301 
309  double density(double t) const;
310 
318  double pdf(double t) const { return density(t); };
319 
328  double icdf(double t) const;
329 
338  double tthresh(double p) const { return icdf(p); };
339 
340 private:
341  void init();
342 
343  double m_dt;
344  double m_tmax;
345  int m_dof;
346  std::vector<double> m_cdf;
347  std::vector<double> m_pdf;
348  std::vector<double> m_tvals;
349 };
350 
351 
370 void regress(RegrResult* out,
371  const Ref<const VectorXd> y,
372  const Ref<const MatrixXd> X,
373  const Ref<const VectorXd> covInv,
374  const Ref<const MatrixXd> Xinv,
375  const StudentsT& distrib);
376 
390 void regress(RegrResult* out,
391  const Ref<const VectorXd> y,
392  const Ref<const MatrixXd> X);
393 
394 /******************************************
395  * @}
396  * \defgroup Matrix Decompositions
397  * @{
398  *****************************************/
399 
410 void randomizePowerIterationSVD(const Ref<const MatrixXd> A,
411  size_t subsize, size_t poweriters, MatrixXd& U, VectorXd& E,
412  MatrixXd& V);
413 
414 void randomizePowerIterationSVD(const Ref<const MatrixXd> A,
415  double tol, size_t startrank, size_t maxrank, size_t poweriters,
416  MatrixXd& U, VectorXd& E, MatrixXd& V);
417 
438 MatrixXd pca(const Ref<const MatrixXd> X, double varth = 1, int odim = -1);
439 
461 MatrixXd rpiPCA(const Ref<const MatrixXd> X, double varth, int odim);
462 
492 MatrixXd symICA(const Ref<const MatrixXd> Xin, MatrixXd* unmix = NULL);
493 
524 MatrixXd asymICA(const Ref<const MatrixXd> Xin, MatrixXd* unmix = NULL);
525 
533 MatrixXd pseudoInverse(const Ref<const MatrixXd> X);
534 
554 VectorXd shootingRegr(const Ref<const MatrixXd> X,
555  const Ref<const VectorXd> y, double gamma);
556 
576 VectorXd activeShootingRegr(const Ref<const MatrixXd> X,
577  const Ref<const VectorXd> y, double gamma);
578 
579 
592 MatrixXd pcacov(const Ref<const MatrixXd> cov, double varth);
593 
594 /* @}
595  * \defgroup Clustering algorithms
596  * @{
597  */
598 
615 void expMax1D(const Ref<const VectorXd> data,
616  vector<std::function<double(double,double,double)>> pdfs,
617  Ref<VectorXd> mean, Ref<VectorXd> sd, Ref<VectorXd> prior,
618  std::string plotfile = "");
619 
635 void gaussGammaMixtureModel(const Ref<const VectorXd> data,
636  Ref<VectorXd> mu, Ref<VectorXd> sd, Ref<VectorXd> prior,
637  std::string plotfile);
638 
650 void approxKMeans(const Ref<const MatrixXd> samples,
651  size_t nclass, MatrixXd& means);
652 
657 {
658 public:
664  Classifier(size_t rank) : ndim(rank), maxit(-1), m_valid(false) {};
665 
674  virtual
675  Eigen::VectorXi classify(const Ref<const MatrixXd> samples) = 0;
676 
687  virtual
688  size_t classify(const Ref<const MatrixXd> samples, Ref<VectorXi> oclass) = 0;
689 
702  virtual
703  int update(const Ref<const MatrixXd> samples, bool reinit = false) = 0;
704 
712  inline
713  void compute(const Ref<const MatrixXd> samples)
714  {
715  update(samples, true);
716  };
717 
722  const int ndim;
723 
727  int maxit;
728 protected:
732  bool m_valid;
733 
734 };
735 
739 class KMeans : public Classifier
740 {
741 public:
748  KMeans(size_t rank, size_t k = 2);
749 
756  void setk(size_t ngroups);
757 
764  void updateMeans(const Ref<const MatrixXd> newmeans);
765 
774  void updateMeans(const Ref<const MatrixXd> samples,
775  const Ref<const Eigen::VectorXi> classes);
776 
785  VectorXi classify(const Ref<const MatrixXd> samples);
786 
795  size_t classify(const Ref<const MatrixXd> samples, Ref<VectorXi> oclass);
796 
809  int update(const Ref<const MatrixXd> samples, bool reinit = false);
810 
816  const MatrixXd& getMeans() { return m_mu; };
817 
818 private:
822  size_t m_k;
823 
827  MatrixXd m_mu;
828 };
829 
833 class ExpMax : public Classifier
834 {
835 public:
842  ExpMax(size_t rank, size_t k = 2);
843 
850  void setk(size_t ngroups);
851 
864  void updateMeanCovTau(const Ref<const MatrixXd> newmeans, const Ref<const MatrixXd> newcovs,
865  const Ref<const VectorXd> tau);
866 
874  void updateMeanCovTau(const Ref<const MatrixXd> samples, Ref<MatrixXd> prob);
875 
886  double expectation(const Ref<const MatrixXd> samples, Ref<MatrixXd> prob);
887 
896  Eigen::VectorXi classify(const Ref<const MatrixXd> samples);
897 
906  size_t classify(const Ref<const MatrixXd> samples, Ref<VectorXi> oclass);
907 
920  int update(const Ref<const MatrixXd> samples, bool reinit = false);
921 
922 
928  const MatrixXd& getMeans() { return m_mu; };
929 
936  const MatrixXd& getCovs() { return m_cov; };
937 private:
941  size_t m_k;
942 
946  MatrixXd m_mu;
947 
953  MatrixXd m_cov;
954 
958  MatrixXd m_covinv;
959 
964  VectorXd m_tau;
965 
969  double m_ll;
970 };
971 
989 int fastSearchFindDP(const Eigen::MatrixXf& samples, double thresh,
990  double outthresh, Eigen::VectorXi& classes, bool brute = false);
991 
1005 int findDensityPeaks(const Eigen::MatrixXf& samples, double thresh,
1006  Eigen::VectorXf& rho, Eigen::VectorXf& delta,
1007  Eigen::VectorXi& parent);
1008 
1022 int findDensityPeaks_brute(const Eigen::MatrixXf& samples, double thresh,
1023  Eigen::VectorXf& rho, Eigen::VectorXf& delta,
1024  Eigen::VectorXi& parent);
1025 
1030 }
1031 
1032 #endif // STATISTICS_H
MatrixXd pca(const Ref< const MatrixXd > X, double varth=1, int odim=-1)
Computes the Principal Components of input matrix X.
void setk(size_t ngroups)
Update the number of groups. Note that this invalidates any current information.
Definition: accessors.h:29
double sigmahat
sigma hat - estimate standard deviation of noise
Definition: statistics.h:206
Classifier(size_t rank)
Initializes the classifier.
Definition: statistics.h:664
void setMaxT(double tmax)
Set the maximum t for numerical integration, and recompute the cdf/pdf caches.
ExpMax(size_t rank, size_t k=2)
Constructor for k-means class.
double cumulative(double t) const
Get the cumulative probability at some t value.
double sample_var(const Ref< const VectorXd > vec)
Computes the statistical variance of a column vector.
VectorXd p
Significance of each of the regressors.
Definition: statistics.h:237
double gammaPDF_MS(double mean, double sd, double x)
PDF for the gamma distribution, if mean is negative then it is assumed that x should be negated as we...
MatrixXd asymICA(const Ref< const MatrixXd > Xin, MatrixXd *unmix=NULL)
Computes the Independent Components of input matrix X using sequential component extraction. Note that you should run PCA on X before running ICA.
void regress(RegrResult *out, const Ref< const VectorXd > y, const Ref< const MatrixXd > X, const Ref< const VectorXd > covInv, const Ref< const MatrixXd > Xinv, const StudentsT &distrib)
Computes the Ordinary Least Square predictors, beta for.
double pdf(double t) const
Get the probability density at some t value.
Definition: statistics.h:318
void updateMeanCovTau(const Ref< const MatrixXd > newmeans, const Ref< const MatrixXd > newcovs, const Ref< const VectorXd > tau)
Sets the mean matrix. Each row of the matrix is a ND-mean, where N is the number of columns...
double tthresh(double p) const
Get the T-score that corresponds to a particular p-value. Alias for icdf.
Definition: statistics.h:338
double density(double t) const
Get the probability density at some t value.
double icdf(double t) const
Get the T-score that corresponds to a particular p-value. Alias for tthresh.
int findDensityPeaks(const Eigen::MatrixXf &samples, double thresh, Eigen::VectorXf &rho, Eigen::VectorXf &delta, Eigen::VectorXi &parent)
Computes Density and Peak computation for Fast Search and Find of Density Peaks algorithm.
int fastSearchFindDP(const Eigen::MatrixXf &samples, double thresh, double outthresh, Eigen::VectorXi &classes, bool brute=false)
Algorithm of unsupervised learning (clustering) based on density.
double adj_rsqr
Coefficient of determination, corrected for the number of regressors.
Definition: statistics.h:217
Student's T-distribution. A cache of the Probability Density Function and cumulative density function...
Definition: statistics.h:244
virtual int update(const Ref< const MatrixXd > samples, bool reinit=false)=0
Updates the classifier with new samples, if reinit is true then no prior information will be used...
Expectation Maximization With Gaussian Mixture Model.
Definition: statistics.h:833
void expMax1D(const Ref< const VectorXd > data, vector< std::function< double(double, double, double)>> pdfs, Ref< VectorXd > mean, Ref< VectorXd > sd, Ref< VectorXd > prior, std::string plotfile="")
Computes the mean and standard deviation of multiple distributions based on 1D data. The probability distribution functions should be passed in through a vector of function objects (pdfs) taking mu/sd/x.
KMeans(size_t rank, size_t k=2)
Constructor for k-means class.
double gaussianCDF(double mean, double sd, double x)
1D Gaussian cumulative distribution function
void setDOF(double dof)
Change the degress of freedom, update cache.
double ssres
Sum of square of the residuals.
Definition: statistics.h:201
int findDensityPeaks_brute(const Eigen::MatrixXf &samples, double thresh, Eigen::VectorXf &rho, Eigen::VectorXf &delta, Eigen::VectorXi &parent)
Computes Density and Peak computation for Fast Search and Find of Density Peaks algorithm. This is a slower, non-bin based version.
void setStepT(double dt)
Step in t to use for computing the CDF, smaller means more precision although in reality the distribu...
VectorXd shootingRegr(const Ref< const MatrixXd > X, const Ref< const VectorXd > y, double gamma)
Performs LASSO regression using the 'shooting' algorithm of.
bool m_valid
Whether the classifier has been initialized yet.
Definition: statistics.h:732
VectorXd std_err
Standard errors for each of the regressors.
Definition: statistics.h:222
virtual Eigen::VectorXi classify(const Ref< const MatrixXd > samples)=0
Given a matrix of samples (Samples x Dims, sample on each row), apply the classifier to each sample a...
int update(const Ref< const MatrixXd > samples, bool reinit=false)
Updates the classifier with new samples, if reinit is true then no prior information will be used...
void regressOutLS(VectorXd &signal, const MatrixXd &X, const MatrixXd &covInv)
Removes the effects of X from signal (y). Note that this takes both X and the pseudoinverse of X beca...
Definition: statistics.h:53
double gaussianPDF(double mean, double sd, double x)
1D Gaussian distribution function
double cdf(double t) const
Get the cumulative probability at some t value.
Definition: statistics.h:300
double rsqr
Coefficient of determination (Rsqr)
Definition: statistics.h:211
Eigen::VectorXi classify(const Ref< const MatrixXd > samples)
Given a matrix of samples (Samples x Dims, sample on each row), apply the classifier to each sample a...
void updateMeans(const Ref< const MatrixXd > newmeans)
Sets the mean matrix. Each row of the matrix is a ND-mean, where N is the number of columns...
Base class for all ND classifiers.
Definition: statistics.h:656
void randomizePowerIterationSVD(const Ref< const MatrixXd > A, size_t subsize, size_t poweriters, MatrixXd &U, VectorXd &E, MatrixXd &V)
double mutualInformation(size_t len, double *a, double *b, size_t mbin)
Computes mutual information between signal a and signal b which are of length len. Marginal bins used is mbin.
StudentsT(int dof=2, double dt=0.1, double tmax=20)
Defualt constructor takes the degrees of freedom (Nu), step size for numerical CDF computation and Ma...
double sample_corr(int count, double sum1, double sum2, double sumsq1, double sumsq2, double s1s2)
Computes the sample correlation.
Definition: statistics.h:179
MatrixXd pcacov(const Ref< const MatrixXd > cov, double varth)
Computes the Principal Components of input matrix X using the covariance matrix.
VectorXi classify(const Ref< const MatrixXd > samples)
Given a matrix of samples (Samples x Dims, sample on each row), apply the classifier to each sample a...
VectorXd t
Students t score of each of the regressors.
Definition: statistics.h:232
void compute(const Ref< const MatrixXd > samples)
Alias for updateClasses with reinit = true. This will perform a classification scheme on all the inpu...
Definition: statistics.h:713
double correlation(size_t len, double *a, double *b)
Computes correlation between signal a and signal b which are of length len.
VectorXd bhat
Estimated Beta.
Definition: statistics.h:196
K-means classifier.
Definition: statistics.h:739
const int ndim
Number of dimensions, must be set at construction. This is the number of columns in input samples...
Definition: statistics.h:716
void setk(size_t ngroups)
Update the number of groups. Note that this invalidates any current information.
MatrixXd symICA(const Ref< const MatrixXd > Xin, MatrixXd *unmix=NULL)
Computes the Independent Components of input matrix X using symmetric decorrlation. Note that you should run PCA on X before running ICA.
int update(const Ref< const MatrixXd > samples, bool reinit=false)
Updates the classifier with new samples, if reinit is true then no prior information will be used...
int maxit
Maximum number of iterations. Set below 0 for infinite.
Definition: statistics.h:727
double expectation(const Ref< const MatrixXd > samples, Ref< MatrixXd > prob)
Given a matrix of samples (Samples x Dims, sample on each row), apply the classifier to each sample a...
MatrixXd pseudoInverse(const Ref< const MatrixXd > X)
Computes the pseudoinverse of the input matrix.
void gaussGammaMixtureModel(const Ref< const VectorXd > data, Ref< VectorXd > mu, Ref< VectorXd > sd, Ref< VectorXd > prior, std::string plotfile)
Computes the mean and standard deviation of multiple distributions based on 1D data. This is a special version in which a negative gamma, a gaussian and positive gamma. The means of the negative and positive gamma are relative to the center of the gaussian.
void fillGaussian(ptr< NDArray > inout)
Fills image with the linear index at each pixel.
const MatrixXd & getCovs()
Returns the current mean matrix.
Definition: statistics.h:936
void approxKMeans(const Ref< const MatrixXd > samples, size_t nclass, MatrixXd &means)
Approximates k-means using the algorithm of:
MatrixXd rpiPCA(const Ref< const MatrixXd > X, double varth, int odim)
Computes the Principal Components of input matrix X using the randomized power iteration SVD algorith...
const MatrixXd & getMeans()
Returns the current mean matrix.
Definition: statistics.h:928
VectorXd activeShootingRegr(const Ref< const MatrixXd > X, const Ref< const VectorXd > y, double gamma)
Performs LASSO regression using the 'activeShooting' algorithm of.
const MatrixXd & getMeans()
Returns the current mean matrix.
Definition: statistics.h:816
double dof
Degrees of freedom in the regression.
Definition: statistics.h:227
VectorXd yhat
Predicted y values, based on estimate of Beta.
Definition: statistics.h:191