Chapter 38 Principal component analysis
38.1 Basic statistics
Standard deviation (SD) and variance (\(s^2\)) are measures of the spread of the data in a data set.
Standard deviation (SD):
\[s = \sqrt{\frac{\sum_{i=1}^{n} (X_i - \overline{X})^2}{(n-1)}}\]
Variance (\(s^2, var(X)\)): \[s^2 = \frac{\sum_{i=1}^{n} (X_i - \overline{X})^2}{(n-1)} = \frac{\sum_{i=1}^{n} (X_i - \overline{X})(X_i - \overline{X})}{(n-1)}\]
Covariance (\(cov(X,Y)\)):
\[cov(X,Y) = \frac{\sum_{i=1}^{n} (X_i - \overline{X})(Y_i - \overline{Y})}{(n-1)}\]
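A quick numeric check of these formulas against R's built-ins (the toy vectors X and Y below are made up for illustration):
X <- c(1, 2, 4, 6, 12)
Y <- c(2, 4, 4, 8, 10)
n <- length(X)
sqrt(sum((X - mean(X))^2) / (n - 1))          # matches sd(X)
sum((X - mean(X))^2) / (n - 1)                # matches var(X)
sum((X - mean(X)) * (Y - mean(Y))) / (n - 1)  # matches cov(X, Y)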
Covariance matrix for a set of data with n dimensions:
\[C^{n \times n} = (c_{i,j}), \quad c_{i,j} = cov(Dim_{i}, Dim_{j}),\]
where \(C^{n \times n}\) is a matrix with \(n\) rows and \(n\) columns, and \(Dim_x\) is the \(x\)th dimension.
For an n-dimensional data set, the matrix has n rows and n columns, and each entry is the covariance between two of the dimensions.
E.g., the entry in row 2, column 3 is the covariance between the 2nd and the 3rd dimension.
Example for a 3-dimensional data set with dimensions \(x\), \(y\) and \(z\):
\[
C =
\begin{pmatrix}
cov(x,x) & cov(x,y) & cov(x,z)\\
cov(y,x) & cov(y,y) & cov(y,z)\\
cov(z,x) & cov(z,y) & cov(z,z)
\end{pmatrix}
\]
The entries on the main diagonal are variances, since \(cov(a,a) = var(a)\).
The matrix is symmetric about the main diagonal, since \(cov(a,b) = cov(b,a)\).
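Both properties are easy to confirm in R, where cov() on a data frame returns exactly this matrix (the data below are random, purely for illustration):
set.seed(1)
d <- data.frame(x = rnorm(10), y = rnorm(10), z = rnorm(10))
C <- cov(d)                         # the 3 x 3 covariance matrix shown above
all.equal(diag(C), sapply(d, var))  # diagonal entries are the variances
isSymmetric(C)                      # cov(a,b) == cov(b,a)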
38.2 Basic linear algebra (matrices)
Example of a non-eigenvector (the product is not a multiple of the original vector):
\[ \begin{pmatrix} 2 & 3\\ 2 & 1\\ \end{pmatrix} \times \begin{pmatrix} 1\\ 3 \end{pmatrix} = \begin{pmatrix} 2\cdot1+3\cdot3\\ 2\cdot1+1\cdot3 \end{pmatrix} = \begin{pmatrix} 11\\ 5 \end{pmatrix} \]
An eigenvector (the product is the original vector scaled by the eigenvalue, here 4):
\[ \begin{pmatrix} 2 & 3\\ 2 & 1\\ \end{pmatrix} \times \begin{pmatrix} 3\\ 2 \end{pmatrix} = \begin{pmatrix} 12\\ 8 \end{pmatrix} = 4 \times \begin{pmatrix} 3\\ 2 \end{pmatrix} \]
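R's eigen() confirms the hand-worked example; \((3, 2)^T\) is an eigenvector of the matrix with eigenvalue 4:
A <- matrix(c(2, 2, 3, 1), nrow = 2)  # column-major: rows are (2 3) and (2 1)
eigen(A)$values                       # 4 and -1
A %*% c(3, 2)                         # = 4 * c(3, 2)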
### Principal Component Analysis
# Source: https://www.datacamp.com/community/tutorials/pca-analysis-r
# PCA makes it possible to visualize data sets with many variables
# PCA is a linear transformation that condenses many variables into a smaller number of "Principal Components".
### Example 1 "mtcars"
mtcars.pca <- prcomp(mtcars[, c(1:7, 10, 11)], center = TRUE, scale. = TRUE)
summary(mtcars.pca)
str(mtcars.pca)
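# Not shown in the source tutorial: the proportion of variance explained per
# component can be read straight from summary()'s importance matrix
summary(mtcars.pca)$importance["Proportion of Variance", ]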
library(devtools)
install_github("vqv/ggbiplot")
library(ggbiplot)
ggbiplot(mtcars.pca)
ggbiplot(mtcars.pca, labels=rownames(mtcars))
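# Note: mtcars.country is used below but never defined in this snippet; the
# DataCamp tutorial builds it as a country-of-origin label for each car. A
# hypothetical reconstruction from the row names (groupings illustrative only):
mtcars.country <- ifelse(grepl("Toyota|Honda|Datsun|Mazda", rownames(mtcars)), "Japan",
                         ifelse(grepl("Merc|Fiat|Porsche|Ferrari|Volvo|Lotus|Maserati",
                                      rownames(mtcars)), "Europe", "US"))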
ggbiplot(mtcars.pca,ellipse=TRUE,choices=c(3,4), labels=rownames(mtcars), groups=mtcars.country)
ggbiplot(mtcars.pca,ellipse=TRUE,circle=TRUE, labels=rownames(mtcars), groups=mtcars.country)
ggbiplot(mtcars.pca,ellipse=TRUE,obs.scale = 1, var.scale = 1, labels=rownames(mtcars), groups=mtcars.country)
ggbiplot(mtcars.pca,ellipse=TRUE,obs.scale = 1, var.scale = 1,var.axes=FALSE, labels=rownames(mtcars), groups=mtcars.country)
### Example 2 "USArrests"
# Violent Crime Rates by US State
?USArrests
dim(USArrests)
dimnames(USArrests)
apply(USArrests, 2, mean) # mean of each variable (column)
# the variances differ hugely across variables, so standardize before PCA
pca.out <- prcomp(USArrests, scale = TRUE)
pca.out
summary(pca.out)
names(pca.out)
biplot(pca.out,scale = 0, cex=0.65)
# Principal component analysis
states = row.names(USArrests)
states
names(USArrests)
apply(USArrests, 2, mean)
apply(USArrests, 2, var)
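# The columns are on very different scales; the ratio of the largest to the
# smallest column variance shows why scale=TRUE is used below (a quick check,
# not part of the original lab)
max(apply(USArrests, 2, var)) / min(apply(USArrests, 2, var))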
pr.out = prcomp(USArrests, scale=TRUE)
names(pr.out)
pr.out$center
pr.out$scale
pr.out$rotation
dim(pr.out$x)
biplot(pr.out, scale=0)
pr.out$rotation = -pr.out$rotation
pr.out$x = -pr.out$x
biplot(pr.out, scale=0)
pr.out$sdev
pr.var = pr.out$sdev^2
pr.var
pve = pr.var/sum(pr.var)
pve
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained", ylim=c(0,1), type='b')
plot(cumsum(pve), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained", ylim=c(0,1),type='b')
a = c(1,2,8,-3)
cumsum(a)
# Lab 2 for Chapter 10: Clustering
# K-means clustering
set.seed(2)
x = matrix(rnorm(50*2), ncol=2)
x[1:25,1] = x[1:25,1]+3
x[1:25,2] = x[1:25,2]-4
km.out = kmeans(x, 2, nstart=20)
km.out$cluster
plot(x, col=(km.out$cluster+1), main="K-Means Clustering Results with K=2", xlab="", ylab="", pch=20, cex=2)
set.seed(4)
km.out = kmeans(x, 3, nstart=20)
km.out
plot(x, col=(km.out$cluster+1), main="K-Means Clustering Results with K=3", xlab="", ylab="", pch=20, cex=2)
set.seed(3)
km.out = kmeans(x, 3, nstart=1)
km.out$tot.withinss
km.out = kmeans(x, 3, nstart=20)
km.out$tot.withinss
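# A common follow-up not in the source lab: scan K and compare the total
# within-cluster sum of squares (a minimal "elbow" sketch)
wss <- sapply(1:6, function(k) kmeans(x, k, nstart = 20)$tot.withinss)
plot(1:6, wss, type = "b", xlab = "K", ylab = "Total within-cluster SS")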
# Hierarchical clustering
hc.complete = hclust(dist(x), method="complete")
hc.average = hclust(dist(x), method="average")
hc.single = hclust(dist(x), method="single")
par(mfrow=c(1,3))
plot(hc.complete,main="Complete Linkage", xlab="", sub="", cex=.9)
plot(hc.average, main="Average Linkage", xlab="", sub="", cex=.9)
plot(hc.single, main="Single Linkage", xlab="", sub="", cex=.9)
cutree(hc.complete, 2)
cutree(hc.average, 2)
cutree(hc.single, 2)
cutree(hc.single, 4)
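# Not in the source lab: cross-tabulate two cuts to see how far the linkages agree
table(cutree(hc.complete, 2), cutree(hc.average, 2))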
xsc = scale(x)
plot(hclust(dist(xsc), method="complete"), main="Hierarchical Clustering with Scaled Features")
x = matrix(rnorm(30*3), ncol=3)
dd = as.dist(1-cor(t(x)))
plot(hclust(dd, method="complete"), main="Complete Linkage with Correlation-Based Distance", xlab="", sub="")
# Lab 3 for Chapter 10: Analysis of the NCI60 data
# The NCI60 data
library(ISLR)
nci.labs = NCI60$labs
nci.data = NCI60$data
dim(nci.data)
nci.labs[1:4]
table(nci.labs)
# PCA applied to the NCI60 data
pr.out = prcomp(nci.data, scale=TRUE)
Cols = function(vec){
  cols = rainbow(length(unique(vec)))
  return(cols[as.numeric(as.factor(vec))])
}
par(mfrow=c(1,2))
plot(pr.out$x[,1:2], col=Cols(nci.labs), pch=19,xlab="Z1",ylab="Z2")
plot(pr.out$x[,c(1,3)], col=Cols(nci.labs), pch=19,xlab="Z1",ylab="Z3")
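# A hypothetical legend for the score plots; Cols() assigns rainbow colours by
# factor level, so the same mapping can be rebuilt for the legend
legend("topright", legend = levels(as.factor(nci.labs)),
       col = rainbow(length(unique(nci.labs))), pch = 19, cex = 0.5)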
summary(pr.out)
plot(pr.out)
pve = 100*pr.out$sdev^2/sum(pr.out$sdev^2)
par(mfrow=c(1,2))
plot(pve, type="o", ylab="PVE", xlab="Principal Component", col="blue")
plot(cumsum(pve), type="o", ylab="Cumulative PVE", xlab="Principal Component", col="brown3")
# Clustering the observations of the NCI60 data set
sd.data = scale(nci.data)
par(mfrow=c(1,3))
data.dist = dist(sd.data)
plot(hclust(data.dist), labels=nci.labs, main="Complete Linkage", xlab="", sub="", ylab="")
plot(hclust(data.dist, method="average"), labels=nci.labs, main="Average Linkage", xlab="", sub="",ylab="")
plot(hclust(data.dist, method="single"), labels=nci.labs, main="Single Linkage", xlab="", sub="",ylab="")
hc.out = hclust(dist(sd.data))
hc.clusters = cutree(hc.out, 4)
table(hc.clusters, nci.labs)
par(mfrow=c(1,1))
plot(hc.out, labels=nci.labs)
abline(h=139, col="red")
hc.out
set.seed(2)
km.out = kmeans(sd.data, 4, nstart=20)
km.clusters = km.out$cluster
table(km.clusters, hc.clusters)
hc.out = hclust(dist(pr.out$x[,1:5]))
plot(hc.out, labels=nci.labs, main="Hier. Clust. on First Five Score Vectors")
table(cutree(hc.out,4), nci.labs)
38.2.1 t-SNE (t-distributed Stochastic Neighbor Embedding)
library(Rtsne)
# data loading
train <- read.csv("~/github/dar/012_t-SNE/DATA/train.csv")
head(train)[1:20]
dim(train)
## get labels as factors
labels <- train$label
train$label <- as.factor(train$label)
## for plotting
colors = rainbow(length(unique(train$label)))
names(colors) = unique(train$label)
## Executing the algorithm on curated data
tsne <- Rtsne(train[,-1], dims = 2, perplexity = 30, verbose = TRUE, max_iter = 500)
## Execution time
exeTimeTsne <- system.time(Rtsne(train[,-1], dims = 2, perplexity = 30, verbose = TRUE, max_iter = 500))
## Plotting
plot(tsne$Y, t='n', main="tsne")
text(tsne$Y, labels=train$label, col=colors[train$label])
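## Two practical notes not in the source tutorial: Rtsne() refuses exact
## duplicate rows by default (check_duplicates = TRUE), and the embedding is
## stochastic, so fix the RNG seed for a reproducible layout. A minimal sketch:
train.unique <- train[!duplicated(train[, -1]), ]
set.seed(42)
tsne <- Rtsne(as.matrix(train.unique[, -1]), dims = 2, perplexity = 30, verbose = TRUE, max_iter = 500)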
Sources
https://www.analyticsvidhya.com/blog/2017/01/t-sne-implementation-r-python/