~mchamber/npl/statistics_8h_source.html

 /******************************************************************************

  * Copyright 2014 Micah C Chambers (micahc.vt@gmail.com)

  *

  * NPL is free software: you can redistribute it and/or modify it under the

  * terms of the BSD 2-Clause License available in LICENSE or at

  * http://opensource.org/licenses/BSD-2-Clause

  *

  * @file statistics.h Tools for analyzing data, including PCA, ICA and

  * general linear modeling.

  *

  *****************************************************************************/


 #ifndef STATISTICS_H

 #define STATISTICS_H


 #include <Eigen/Dense>

 #include "npltypes.h"

 #include "mrimage.h"


 namespace npl {


 /**
  * @brief Computes the statistical variance of a column vector.
  *
  * @param vec Vector of samples.
  *
  * @return Variance of the values in vec.
  */
 double sample_var(const Ref<const VectorXd> vec);


 /**
  * @brief Removes the effects of X from signal, in place.
  *
  * Computes beta = covInv * X^T * signal and then subtracts X*beta from
  * signal, so that signal holds the residual on return.
  *
  * @param signal Response vector; overwritten with the residual.
  * @param X Regressor matrix; beta has one entry per column of X.
  * @param covInv Matrix applied to X^T*signal to produce beta. Presumably the
  * precomputed inverse of X^T*X -- TODO confirm against callers.
  */
 inline
 void regressOutLS(VectorXd& signal, const MatrixXd& X, const MatrixXd& covInv)
 {
     // Group the product so that only matrix-vector products are formed.
     // Evaluated left to right, covInv*X^T would be built as a full matrix
     // on every call before being applied to signal.
     const VectorXd beta = covInv*(X.transpose()*signal);

     signal -= X*beta;
 }


 /**
  * @brief Fills every element of m with a draw from a normal distribution
  * with mean 0 and standard deviation 1.
  *
  * Elements are written column by column. The random engine is a
  * function-local static seeded once from std::random_device, so every call
  * for a given T shares the same engine; no synchronization is done here.
  *
  * @tparam T Matrix/vector type wrapped by Ref.
  * @param m Matrix (or vector) whose elements are overwritten.
  */
 template <typename T>
 void fillGaussian(Ref<T> m)
 {
     static std::random_device rd;
     static std::default_random_engine rng(rd());
     std::normal_distribution<double> rdist(0,1);

     // Iterate with the matrix's own index type: comparing a size_t counter
     // against the value returned by rows()/cols() mixes signed and unsigned.
     typedef decltype(m.rows()) index_t;
     const index_t nrows = m.rows();
     const index_t ncols = m.cols();

     for(index_t cc=0; cc<ncols; cc++) {
         for(index_t rr=0; rr<nrows; rr++) {
             m(rr, cc) = rdist(rng);
         }
     }
 }


 /******************************************

  * \defgroup Basic Stats Functions

  * @{

  *****************************************/


 /**
  * @brief 1D Gaussian distribution function.
  *
  * @param mean Mean of the distribution.
  * @param sd Standard deviation of the distribution.
  * @param x Point at which to evaluate.
  */
 double gaussianPDF(double mean, double sd, double x);


 /**
  * @brief 1D Gaussian cumulative distribution function.
  *
  * @param mean Mean of the distribution.
  * @param sd Standard deviation of the distribution.
  * @param x Point at which to evaluate.
  */
 double gaussianCDF(double mean, double sd, double x);


 /**
  * @brief PDF for the gamma distribution, given a mean and standard
  * deviation.
  *
  * NOTE(review): a negative mean is reportedly treated specially (x is
  * negated) -- confirm against the implementation.
  *
  * @param mean Mean of the distribution.
  * @param sd Standard deviation of the distribution.
  * @param x Point at which to evaluate.
  */
 double gammaPDF_MS(double mean, double sd, double x);


 /**
  * @brief Computes mutual information between signal a and signal b, which
  * are of length len.
  *
  * @param len Length of both a and b.
  * @param a First signal.
  * @param b Second signal.
  * @param mbin Number of marginal bins to use.
  */
 double mutualInformation(size_t len, double* a, double* b, size_t mbin);


 /**
  * @brief Computes correlation between signal a and signal b, which are of
  * length len.
  *
  * @param len Length of both a and b.
  * @param a First signal.
  * @param b Second signal.
  */
 double correlation(size_t len, double* a, double* b);


 /**
  * @brief Sample variance computed from running sums.
  *
  * Evaluates (sumsqr - sum^2/count)/(count - 1).
  *
  * @param count Number of samples.
  * @param sum Sum of the samples.
  * @param sumsqr Sum of the squared samples.
  *
  * @return The sample variance.
  */
 inline
 double sample_var(int count, double sum, double sumsqr)
 {
     const double meanTerm = sum*sum/count;
     const int denom = count-1;
     return (sumsqr-meanTerm)/denom;
 }


 /**
  * @brief Computes the sample correlation from running sums.
  *
  * @param count Number of samples.
  * @param sum1 Sum of the first variable.
  * @param sum2 Sum of the second variable.
  * @param sumsq1 Sum of squares of the first variable.
  * @param sumsq2 Sum of squares of the second variable.
  * @param s1s2 Sum of the products of the two variables.
  *
  * @return (count*s1s2 - sum1*sum2) divided by the square root of
  * (count*sumsq1 - sum1^2)*(count*sumsq2 - sum2^2).
  */
 inline
 double sample_corr(int count, double sum1, double sum2,
         double sumsq1, double sumsq2, double s1s2)
 {
     const double crossTerm = count*s1s2-sum1*sum2;
     const double spread1 = count*sumsq1-sum1*sum1;
     const double spread2 = count*sumsq2-sum2*sum2;
     return crossTerm/sqrt(spread1*spread2);
 }


 /**
  * @brief Holds the outputs of a regression (filled in by regress()).
  */
 struct RegrResult

 {

     /** Predicted y values, based on estimate of Beta. */
     VectorXd yhat;


     /** Estimated Beta. */
     VectorXd bhat;


     /** Sum of square of the residuals. */
     double ssres;


     /** Sigma hat: estimated standard deviation of the noise. */
     double sigmahat;


     /** Coefficient of determination (Rsqr). */
     double rsqr;


     /** Coefficient of determination, corrected for the number of regressors. */
     double adj_rsqr;


     /** Standard errors for each of the regressors. */
     VectorXd std_err;


     /** Degrees of freedom in the regression. */
     double dof;


     /** Student's t score of each of the regressors. */
     VectorXd t;


     /** Significance of each of the regressors. */
     VectorXd p;

 };


 /**
  * @brief Student's T-distribution, with member vectors holding tabulated
  * pdf, cdf and t values.
  *
  * cdf(), pdf() and tthresh() are inline aliases that forward to
  * cumulative(), density() and icdf() respectively.
  */
 class StudentsT
 {
 public:
     /**
      * @brief Constructor.
      *
      * @param dof Degrees of freedom.
      * @param dt Step size in t.
      * @param tmax Maximum t.
      */
     StudentsT(int dof = 2, double dt = 0.1, double tmax = 20);

     /** @brief Change the degrees of freedom. */
     void setDOF(double dof);

     /** @brief Change the step size in t. */
     void setStepT(double dt);

     /** @brief Change the maximum t. */
     void setMaxT(double tmax);

     /** @brief Get the cumulative probability at some t value. */
     double cumulative(double t) const;

     /** @brief Alias for cumulative(). */
     double cdf(double t) const
     {
         return cumulative(t);
     }

     /** @brief Get the probability density at some t value. */
     double density(double t) const;

     /** @brief Alias for density(). */
     double pdf(double t) const
     {
         return density(t);
     }

     /** @brief Get the T-score that corresponds to a particular p-value. */
     double icdf(double t) const;

     /** @brief Alias for icdf(). */
     double tthresh(double p) const
     {
         return icdf(p);
     }

 private:
     /** @brief Private helper; defined outside this header. */
     void init();

     double m_dt;    //!< step size in t
     double m_tmax;  //!< maximum t
     int m_dof;      //!< degrees of freedom

     std::vector<double> m_cdf;
     std::vector<double> m_pdf;
     std::vector<double> m_tvals;
 };


 /**
  * @brief Computes the Ordinary Least Square predictors (beta) for y given
  * the regressors in X, using quantities precomputed by the caller.
  *
  * @param out Receives the regression results.
  * @param y Response variable.
  * @param X Regressor (design) matrix.
  * @param covInv Precomputed term supplied by the caller.
  * NOTE(review): declared as a vector here, whereas regressOutLS takes a
  * matrix under the same name -- confirm the intended contents.
  * @param Xinv Presumably the pseudoinverse of X -- TODO confirm.
  * @param distrib Student's T-distribution object.
  */
 void regress(RegrResult* out,

         const Ref<const VectorXd> y,

         const Ref<const MatrixXd> X,

         const Ref<const VectorXd> covInv,

         const Ref<const MatrixXd> Xinv,

         const StudentsT& distrib);


 /**
  * @brief Overload of regress() that takes only the response and the
  * regressors.
  *
  * @param out Receives the regression results.
  * @param y Response variable.
  * @param X Regressor (design) matrix.
  */
 void regress(RegrResult* out,

         const Ref<const VectorXd> y,

         const Ref<const MatrixXd> X);


 /******************************************

  * @}

  * \defgroup Matrix Decompositions

  * @{

  *****************************************/


 /**
  * @brief Randomized power-iteration SVD of A (per the function name), with
  * caller-supplied subsize and poweriters.
  *
  * @param A Input matrix.
  * @param subsize Presumably the size of the random subspace -- TODO confirm.
  * @param poweriters Presumably the number of power iterations -- TODO
  * confirm.
  * @param U Output matrix, passed by non-const reference.
  * @param E Output vector, passed by non-const reference.
  * @param V Output matrix, passed by non-const reference.
  */
 void randomizePowerIterationSVD(const Ref<const MatrixXd> A,

         size_t subsize, size_t poweriters, MatrixXd& U, VectorXd& E,

         MatrixXd& V);


 /**
  * @brief Randomized power-iteration SVD of A (per the function name),
  * overload taking a tolerance and rank bounds instead of a subsize.
  *
  * @param A Input matrix.
  * @param tol Tolerance; presumably the stopping criterion -- TODO confirm.
  * @param startrank Presumably the initial rank to try -- TODO confirm.
  * @param maxrank Presumably the upper bound on the rank -- TODO confirm.
  * @param poweriters Presumably the number of power iterations -- TODO
  * confirm.
  * @param U Output matrix, passed by non-const reference.
  * @param E Output vector, passed by non-const reference.
  * @param V Output matrix, passed by non-const reference.
  */
 void randomizePowerIterationSVD(const Ref<const MatrixXd> A,

         double tol, size_t startrank, size_t maxrank, size_t poweriters,

         MatrixXd& U, VectorXd& E, MatrixXd& V);


 /**
  * @brief Computes the Principal Components of input matrix X.
  *
  * @param X Input matrix.
  * @param varth Presumably a variance threshold used to pick the number of
  * components -- TODO confirm. Defaults to 1.
  * @param odim Presumably the number of output dimensions -- TODO confirm.
  * Defaults to -1.
  */
 MatrixXd pca(const Ref<const MatrixXd> X, double varth = 1, int odim = -1);


 /**
  * @brief Computes the Principal Components of input matrix X using the
  * randomized power iteration SVD algorithm.
  *
  * @param X Input matrix.
  * @param varth Presumably a variance threshold -- TODO confirm.
  * @param odim Presumably the number of output dimensions -- TODO confirm.
  */
 MatrixXd rpiPCA(const Ref<const MatrixXd> X, double varth, int odim);


 /**
  * @brief Computes the Independent Components of input matrix X using
  * symmetric decorrelation. Note that you should run PCA on X before running
  * ICA.
  *
  * @param Xin Input matrix.
  * @param unmix Optional output pointer, NULL by default; presumably
  * receives the unmixing matrix -- TODO confirm.
  */
 MatrixXd symICA(const Ref<const MatrixXd> Xin, MatrixXd* unmix = NULL);


 /**
  * @brief Computes the Independent Components of input matrix X using
  * sequential component extraction. Note that you should run PCA on X before
  * running ICA.
  *
  * @param Xin Input matrix.
  * @param unmix Optional output pointer, NULL by default; presumably
  * receives the unmixing matrix -- TODO confirm.
  */
 MatrixXd asymICA(const Ref<const MatrixXd> Xin, MatrixXd* unmix = NULL);


 /**
  * @brief Computes the pseudoinverse of the input matrix.
  *
  * @param X Input matrix.
  */
 MatrixXd pseudoInverse(const Ref<const MatrixXd> X);


 /**
  * @brief Performs LASSO regression using the 'shooting' algorithm.
  *
  * @param X Regressor matrix.
  * @param y Response vector.
  * @param gamma Presumably the regularization weight -- TODO confirm.
  */
 VectorXd shootingRegr(const Ref<const MatrixXd> X,

         const Ref<const VectorXd> y, double gamma);


 /**
  * @brief Performs LASSO regression using the 'activeShooting' algorithm.
  *
  * @param X Regressor matrix.
  * @param y Response vector.
  * @param gamma Presumably the regularization weight -- TODO confirm.
  */
 VectorXd activeShootingRegr(const Ref<const MatrixXd> X,

         const Ref<const VectorXd> y, double gamma);


 /**
  * @brief Computes the Principal Components of input matrix X using the
  * covariance matrix.
  *
  * @param cov Covariance matrix.
  * @param varth Presumably a variance threshold -- TODO confirm.
  */
 MatrixXd pcacov(const Ref<const MatrixXd> cov, double varth);


 /* @}

  * \defgroup Clustering algorithms

  * @{

  */


 /**
  * @brief Computes the mean and standard deviation of multiple distributions
  * based on 1D data.
  *
  * The probability distribution functions are passed in through a vector of
  * function objects (pdfs), each taking mu/sd/x.
  *
  * @param data 1D samples.
  * @param pdfs One density function per distribution, called as
  * pdf(mu, sd, x).
  * @param mean Means of the distributions (writable reference).
  * @param sd Standard deviations of the distributions (writable reference).
  * @param prior Priors of the distributions (writable reference).
  * @param plotfile Optional file name, empty by default; presumably a plot
  * is written there when non-empty -- TODO confirm.
  */
 void expMax1D(const Ref<const VectorXd> data,

         vector<std::function<double(double,double,double)>> pdfs,

         Ref<VectorXd> mean, Ref<VectorXd> sd, Ref<VectorXd> prior,

         std::string plotfile = "");


 /**
  * @brief Computes the mean and standard deviation of multiple distributions
  * based on 1D data, for the special case of a mixture of a negative gamma,
  * a gaussian and a positive gamma.
  *
  * The means of the negative and positive gamma are relative to the center
  * of the gaussian.
  *
  * @param data 1D samples.
  * @param mu Means of the distributions (writable reference).
  * @param sd Standard deviations of the distributions (writable reference).
  * @param prior Priors of the distributions (writable reference).
  * @param plotfile File name; presumably a plot is written there -- TODO
  * confirm.
  */
 void gaussGammaMixtureModel(const Ref<const VectorXd> data,

         Ref<VectorXd> mu, Ref<VectorXd> sd, Ref<VectorXd> prior,

         std::string plotfile);


 /**
  * @brief Approximates k-means.
  *
  * @param samples Input samples.
  * @param nclass Number of classes.
  * @param means Output matrix, passed by non-const reference.
  */
 void approxKMeans(const Ref<const MatrixXd> samples,

         size_t nclass, MatrixXd& means);


 /**
  * @brief Base class for all ND classifiers.
  *
  * Declares the pure virtual classify()/update() interface. compute() is a
  * convenience wrapper that calls update() with reinit = true.
  */
 class Classifier
 {
 public:
     /**
      * @brief Initializes the classifier.
      *
      * Sets maxit to -1 and marks the classifier as not yet valid.
      *
      * @param rank Number of dimensions; stored in ndim.
      */
     Classifier(size_t rank) : ndim(rank), maxit(-1), m_valid(false) {}

     /**
      * @brief Virtual destructor, so that a derived classifier can be safely
      * destroyed through a pointer to Classifier.
      */
     virtual ~Classifier() {}

     /**
      * @brief Given a matrix of samples (Samples x Dims, sample on each
      * row), apply the classifier to each sample.
      *
      * @param samples Samples to classify, one per row.
      *
      * @return Vector of classes.
      */
     virtual
     Eigen::VectorXi classify(const Ref<const MatrixXd> samples) = 0;

     /**
      * @brief Variant of classify() that writes the classes into a
      * caller-provided vector.
      *
      * @param samples Samples to classify, one per row.
      * @param oclass Vector that receives the classes.
      *
      * @return A count. NOTE(review): presumably the number of samples
      * whose class changed -- confirm against the implementations.
      */
     virtual
     size_t classify(const Ref<const MatrixXd> samples, Ref<VectorXi> oclass) = 0;

     /**
      * @brief Updates the classifier with new samples; if reinit is true
      * then no prior information will be used.
      *
      * @param samples Samples, one per row.
      * @param reinit Whether to discard prior information.
      *
      * @return Status code defined by the implementation.
      */
     virtual
     int update(const Ref<const MatrixXd> samples, bool reinit = false) = 0;

     /**
      * @brief Alias for update() with reinit = true.
      *
      * @param samples Samples, one per row.
      */
     inline
     void compute(const Ref<const MatrixXd> samples)
     {
         update(samples, true);
     }

     /** Number of dimensions, must be set at construction. */
     const int ndim;

     /** Maximum number of iterations. Set below 0 for infinite. */
     int maxit;

 protected:
     /** Whether the classifier has been initialized yet. */
     bool m_valid;
 };


 class KMeans : public Classifier

 {

 public:

     KMeans(size_t rank, size_t k = 2);


     void setk(size_t ngroups);


     void updateMeans(const Ref<const MatrixXd> newmeans);


     void updateMeans(const Ref<const MatrixXd> samples,

             const Ref<const Eigen::VectorXi> classes);


     VectorXi classify(const Ref<const MatrixXd> samples);


     size_t classify(const Ref<const MatrixXd> samples, Ref<VectorXi> oclass);


     int update(const Ref<const MatrixXd> samples, bool reinit = false);


     const MatrixXd& getMeans() { return m_mu; };


 private:

     size_t m_k;


     MatrixXd m_mu;

 };


 class ExpMax : public Classifier

 {

 public:

     ExpMax(size_t rank, size_t k = 2);


     void setk(size_t ngroups);


     void updateMeanCovTau(const Ref<const MatrixXd> newmeans, const Ref<const MatrixXd> newcovs,

             const Ref<const VectorXd> tau);


     void updateMeanCovTau(const Ref<const MatrixXd> samples, Ref<MatrixXd> prob);


     double expectation(const Ref<const MatrixXd> samples, Ref<MatrixXd> prob);


     Eigen::VectorXi classify(const Ref<const MatrixXd> samples);


     size_t classify(const Ref<const MatrixXd> samples, Ref<VectorXi> oclass);


     int update(const Ref<const MatrixXd> samples, bool reinit = false);


     const MatrixXd& getMeans() { return m_mu; };


     const MatrixXd& getCovs() { return m_cov; };

 private:

     size_t m_k;


     MatrixXd m_mu;


     MatrixXd m_cov;


     MatrixXd m_covinv;


     VectorXd m_tau;


     double m_ll;

 };


 /**
  * @brief Algorithm of unsupervised learning (clustering) based on density.
  *
  * @param samples Samples to cluster.
  * @param thresh Threshold; presumably the distance threshold used for
  * density -- TODO confirm.
  * @param outthresh Second threshold; presumably for outliers -- TODO
  * confirm.
  * @param classes Output classes, passed by non-const reference.
  * @param brute Defaults to false; presumably selects the brute-force
  * (non-bin based) density computation -- TODO confirm.
  *
  * @return Status code defined by the implementation.
  */
 int fastSearchFindDP(const Eigen::MatrixXf& samples, double thresh,

         double outthresh, Eigen::VectorXi& classes, bool brute = false);


 /**
  * @brief Computes Density and Peak computation for Fast Search and Find of
  * Density Peaks algorithm.
  *
  * @param samples Samples.
  * @param thresh Threshold; presumably the distance threshold used for
  * density -- TODO confirm.
  * @param rho Output vector (non-const reference); presumably density.
  * @param delta Output vector (non-const reference).
  * @param parent Output vector (non-const reference).
  *
  * @return Status code defined by the implementation.
  */
 int findDensityPeaks(const Eigen::MatrixXf& samples, double thresh,

         Eigen::VectorXf& rho, Eigen::VectorXf& delta,

         Eigen::VectorXi& parent);


 /**
  * @brief Computes Density and Peak computation for Fast Search and Find of
  * Density Peaks algorithm. This is a slower, non-bin based version.
  *
  * @param samples Samples.
  * @param thresh Threshold; presumably the distance threshold used for
  * density -- TODO confirm.
  * @param rho Output vector (non-const reference); presumably density.
  * @param delta Output vector (non-const reference).
  * @param parent Output vector (non-const reference).
  *
  * @return Status code defined by the implementation.
  */
 int findDensityPeaks_brute(const Eigen::MatrixXf& samples, double thresh,

         Eigen::VectorXf& rho, Eigen::VectorXf& delta,

         Eigen::VectorXi& parent);


 }


 #endif // STATISTICS_H

npl::pca
MatrixXd pca(const Ref< const MatrixXd > X, double varth=1, int odim=-1)
Computes the Principal Components of input matrix X.

npl::KMeans::setk
void setk(size_t ngroups)
Update the number of groups. Note that this invalidates any current information.

npl
Definition: accessors.h:29

npl::RegrResult::sigmahat
double sigmahat
sigma hat - estimate standard deviation of noise
Definition: statistics.h:206

npl::Classifier::Classifier
Classifier(size_t rank)
Initializes the classifier.
Definition: statistics.h:664

npl::StudentsT::setMaxT
void setMaxT(double tmax)
Set the maximum t for numerical integration, and recompute the cdf/pdf caches.

npl::ExpMax::ExpMax
ExpMax(size_t rank, size_t k=2)
Constructor for the expectation-maximization (Gaussian mixture model) class.

npl::StudentsT::cumulative
double cumulative(double t) const
Get the cumulative probability at some t value.

npl::sample_var
double sample_var(const Ref< const VectorXd > vec)
Computes the statistical variance of a column vector.

npl::RegrResult::p
VectorXd p
Significance of each of the regressors.
Definition: statistics.h:237

npl::gammaPDF_MS
double gammaPDF_MS(double mean, double sd, double x)
PDF for the gamma distribution, if mean is negative then it is assumed that x should be negated as we...

npl::asymICA
MatrixXd asymICA(const Ref< const MatrixXd > Xin, MatrixXd *unmix=NULL)
Computes the Independent Components of input matrix X using sequential component extraction. Note that you should run PCA on X before running ICA.

npl::regress
void regress(RegrResult *out, const Ref< const VectorXd > y, const Ref< const MatrixXd > X, const Ref< const VectorXd > covInv, const Ref< const MatrixXd > Xinv, const StudentsT &distrib)
Computes the Ordinary Least Square predictors, beta for.

npl::StudentsT::pdf
double pdf(double t) const
Get the probability density at some t value.
Definition: statistics.h:318

npl::ExpMax::updateMeanCovTau
void updateMeanCovTau(const Ref< const MatrixXd > newmeans, const Ref< const MatrixXd > newcovs, const Ref< const VectorXd > tau)
Sets the mean matrix. Each row of the matrix is a ND-mean, where N is the number of columns...

npl::StudentsT::tthresh
double tthresh(double p) const
Get the T-score that corresponds to a particular p-value. Alias for icdf.
Definition: statistics.h:338

npl::StudentsT::density
double density(double t) const
Get the probability density at some t value.

npl::StudentsT::icdf
double icdf(double t) const
Get the T-score that corresponds to a particular p-value. Alias for tthresh.

npl::findDensityPeaks
int findDensityPeaks(const Eigen::MatrixXf &samples, double thresh, Eigen::VectorXf &rho, Eigen::VectorXf &delta, Eigen::VectorXi &parent)
Computes Density and Peak computation for Fast Search and Find of Density Peaks algorithm.

npl::fastSearchFindDP
int fastSearchFindDP(const Eigen::MatrixXf &samples, double thresh, double outthresh, Eigen::VectorXi &classes, bool brute=false)
Algorithm of unsupervised learning (clustering) based on density.

npl::RegrResult::adj_rsqr
double adj_rsqr
Coefficient of determination, corrected for the number of regressors.
Definition: statistics.h:217

npl::StudentsT
Student's T-distribution. A cache of the Probability Density Function and cumulative density function...
Definition: statistics.h:244

npl::Classifier::update
virtual int update(const Ref< const MatrixXd > samples, bool reinit=false)=0
Updates the classifier with new samples, if reinit is true then no prior information will be used...

npl::ExpMax
Expectation Maximization With Gaussian Mixture Model.
Definition: statistics.h:833

npl::expMax1D
void expMax1D(const Ref< const VectorXd > data, vector< std::function< double(double, double, double)>> pdfs, Ref< VectorXd > mean, Ref< VectorXd > sd, Ref< VectorXd > prior, std::string plotfile="")
Computes the mean and standard deviation of multiple distributions based on 1D data. The probability distribution functions should be passed in through a vector of function objects (pdfs) taking mu/sd/x.

npl::KMeans::KMeans
KMeans(size_t rank, size_t k=2)
Constructor for k-means class.

npl::gaussianCDF
double gaussianCDF(double mean, double sd, double x)
1D Gaussian cumulative distribution function

npl::StudentsT::setDOF
void setDOF(double dof)
Change the degrees of freedom, update cache.

npl::RegrResult::ssres
double ssres
Sum of square of the residuals.
Definition: statistics.h:201

npl::findDensityPeaks_brute
int findDensityPeaks_brute(const Eigen::MatrixXf &samples, double thresh, Eigen::VectorXf &rho, Eigen::VectorXf &delta, Eigen::VectorXi &parent)
Computes Density and Peak computation for Fast Search and Find of Density Peaks algorithm. This is a slower, non-bin based version.

npl::StudentsT::setStepT
void setStepT(double dt)
Step in t to use for computing the CDF, smaller means more precision although in reality the distribu...

npl::shootingRegr
VectorXd shootingRegr(const Ref< const MatrixXd > X, const Ref< const VectorXd > y, double gamma)
Performs LASSO regression using the 'shooting' algorithm of.

npl::Classifier::m_valid
bool m_valid
Whether the classifier has been initialized yet.
Definition: statistics.h:732

npl::RegrResult::std_err
VectorXd std_err
Standard errors for each of the regressors.
Definition: statistics.h:222

npl::Classifier::classify
virtual Eigen::VectorXi classify(const Ref< const MatrixXd > samples)=0
Given a matrix of samples (Samples x Dims, sample on each row), apply the classifier to each sample a...

npl::KMeans::update
int update(const Ref< const MatrixXd > samples, bool reinit=false)
Updates the classifier with new samples, if reinit is true then no prior information will be used...

npl::regressOutLS
void regressOutLS(VectorXd &signal, const MatrixXd &X, const MatrixXd &covInv)
Removes the effects of X from signal (y). Note that this takes both X and the pseudoinverse of X beca...
Definition: statistics.h:53

npl::gaussianPDF
double gaussianPDF(double mean, double sd, double x)
1D Gaussian distribution function

npl::StudentsT::cdf
double cdf(double t) const
Get the cumulative probability at some t value.
Definition: statistics.h:300

npl::RegrResult::rsqr
double rsqr
Coefficient of determination (Rsqr)
Definition: statistics.h:211

npl::RegrResult
Definition: statistics.h:186

npltypes.h

npl::ExpMax::classify
Eigen::VectorXi classify(const Ref< const MatrixXd > samples)
Given a matrix of samples (Samples x Dims, sample on each row), apply the classifier to each sample a...

npl::KMeans::updateMeans
void updateMeans(const Ref< const MatrixXd > newmeans)
Sets the mean matrix. Each row of the matrix is a ND-mean, where N is the number of columns...

npl::Classifier
Base class for all ND classifiers.
Definition: statistics.h:656

npl::randomizePowerIterationSVD
void randomizePowerIterationSVD(const Ref< const MatrixXd > A, size_t subsize, size_t poweriters, MatrixXd &U, VectorXd &E, MatrixXd &V)

npl::mutualInformation
double mutualInformation(size_t len, double *a, double *b, size_t mbin)
Computes mutual information between signal a and signal b which are of length len. Marginal bins used is mbin.

npl::StudentsT::StudentsT
StudentsT(int dof=2, double dt=0.1, double tmax=20)
Default constructor takes the degrees of freedom (Nu), step size for numerical CDF computation and Ma...

npl::sample_corr
double sample_corr(int count, double sum1, double sum2, double sumsq1, double sumsq2, double s1s2)
Computes the sample correlation.
Definition: statistics.h:179

npl::pcacov
MatrixXd pcacov(const Ref< const MatrixXd > cov, double varth)
Computes the Principal Components of input matrix X using the covariance matrix.

npl::KMeans::classify
VectorXi classify(const Ref< const MatrixXd > samples)
Given a matrix of samples (Samples x Dims, sample on each row), apply the classifier to each sample a...

npl::RegrResult::t
VectorXd t
Students t score of each of the regressors.
Definition: statistics.h:232

npl::Classifier::compute
void compute(const Ref< const MatrixXd > samples)
Alias for update with reinit = true. This will perform a classification scheme on all the inpu...
Definition: statistics.h:713

npl::correlation
double correlation(size_t len, double *a, double *b)
Computes correlation between signal a and signal b which are of length len.

mrimage.h

npl::RegrResult::bhat
VectorXd bhat
Estimated Beta.
Definition: statistics.h:196

npl::KMeans
K-means classifier.
Definition: statistics.h:739

npl::Classifier::ndim
const int ndim
Number of dimensions, must be set at construction. This is the number of columns in input samples...
Definition: statistics.h:716

npl::ExpMax::setk
void setk(size_t ngroups)
Update the number of groups. Note that this invalidates any current information.

npl::symICA
MatrixXd symICA(const Ref< const MatrixXd > Xin, MatrixXd *unmix=NULL)
Computes the Independent Components of input matrix X using symmetric decorrelation. Note that you should run PCA on X before running ICA.

npl::ExpMax::update
int update(const Ref< const MatrixXd > samples, bool reinit=false)
Updates the classifier with new samples, if reinit is true then no prior information will be used...

npl::Classifier::maxit
int maxit
Maximum number of iterations. Set below 0 for infinite.
Definition: statistics.h:727

npl::ExpMax::expectation
double expectation(const Ref< const MatrixXd > samples, Ref< MatrixXd > prob)
Given a matrix of samples (Samples x Dims, sample on each row), apply the classifier to each sample a...

npl::pseudoInverse
MatrixXd pseudoInverse(const Ref< const MatrixXd > X)
Computes the pseudoinverse of the input matrix.

npl::gaussGammaMixtureModel
void gaussGammaMixtureModel(const Ref< const VectorXd > data, Ref< VectorXd > mu, Ref< VectorXd > sd, Ref< VectorXd > prior, std::string plotfile)
Computes the mean and standard deviation of multiple distributions based on 1D data. This is a special version that mixes a negative gamma, a gaussian and a positive gamma. The means of the negative and positive gamma are relative to the center of the gaussian.

npl::fillGaussian
void fillGaussian(ptr< NDArray > inout)
Fills image with the linear index at each pixel.

npl::ExpMax::getCovs
const MatrixXd & getCovs()
Returns the current covariance matrix.
Definition: statistics.h:936

npl::approxKMeans
void approxKMeans(const Ref< const MatrixXd > samples, size_t nclass, MatrixXd &means)
Approximates k-means using the algorithm of:

npl::rpiPCA
MatrixXd rpiPCA(const Ref< const MatrixXd > X, double varth, int odim)
Computes the Principal Components of input matrix X using the randomized power iteration SVD algorith...

npl::ExpMax::getMeans
const MatrixXd & getMeans()
Returns the current mean matrix.
Definition: statistics.h:928

npl::activeShootingRegr
VectorXd activeShootingRegr(const Ref< const MatrixXd > X, const Ref< const VectorXd > y, double gamma)
Performs LASSO regression using the 'activeShooting' algorithm of.

npl::KMeans::getMeans
const MatrixXd & getMeans()
Returns the current mean matrix.
Definition: statistics.h:816

npl::RegrResult::dof
double dof
Degrees of freedom in the regression.
Definition: statistics.h:227

npl::RegrResult::yhat
VectorXd yhat
Predicted y values, based on estimate of Beta.
Definition: statistics.h:191