Chapter 38 Principal component analysis

38.1 Basic statistics

Standard deviation (SD) and variance (\(s^2\)) are measures of the spread of the data in a data set.

Standard deviation (SD):
\[s = \sqrt{\frac{\sum_{i=1}^{n} (X_i - \overline{X})^2}{(n-1)}}\]

Variance (\(s^2, var(X)\)): \[s^2 = \frac{\sum_{i=1}^{n} (X_i - \overline{X})^2}{(n-1)} = \frac{\sum_{i=1}^{n} (X_i - \overline{X})(X_i - \overline{X})}{(n-1)}\]
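A quick numerical check of both formulas in R (the sample vector \(X\) below is made up purely for illustration):

X <- c(2, 4, 4, 4, 5, 5, 7, 9)           # made-up sample
n <- length(X)
s2 <- sum((X - mean(X))^2) / (n - 1)     # variance by the formula above
c(s2, var(X))                            # identical
c(sqrt(s2), sd(X))                       # identical: the SD is the square root of the variance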

Covariance (\(cov(X,Y)\)): \[cov(X,Y) = \frac{\sum_{i=1}^{n} (X_i - \overline{X})(Y_i - \overline{Y})}{(n-1)}\]
Covariance matrix for a set of data with n dimensions: \[C^{n \times n} = (c_{i,j}), \quad c_{i,j} = cov(Dim_{i}, Dim_{j}),\]
where \(C^{n \times n}\) is a matrix with \(n\) rows and \(n\) columns, and \(Dim_x\) is the \(x\)th dimension.
For an \(n\)-dimensional data set, the matrix has \(n\) rows and \(n\) columns, and each entry is the covariance between two of the dimensions. E.g. the entry in row 2, column 3 is the covariance between the 2nd and the 3rd dimension.
Example for a 3-dimensional data set, using dimensions \(x\), \(y\) and \(z\):

\[ C = \begin{pmatrix} cov(x,x) & cov(x,y) & cov(x,z)\\ cov(y,x) & cov(y,y) & cov(y,z)\\ cov(z,x) & cov(z,y) & cov(z,z) \end{pmatrix} \] The entries on the main diagonal are variances: \(cov(a,a) = var(a)\).
The matrix is symmetrical about the main diagonal since \(cov(a,b) = cov(b,a)\).
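The same construction in R, as a minimal sketch on made-up 3-dimensional data (cov() applied to a data frame builds the whole matrix at once):

set.seed(1)
dat <- data.frame(x = rnorm(10), y = rnorm(10), z = rnorm(10))  # made-up data
C <- cov(dat)      # the 3 x 3 covariance matrix
C                  # symmetric: cov(x,y) equals cov(y,x)
diag(C)            # the main diagonal ...
sapply(dat, var)   # ... holds the variances: cov(a,a) = var(a)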

38.2 Basic linear algebra (matrices)

A non-zero vector \(v\) is an eigenvector of a square matrix \(A\) if \(Av = \lambda v\) for some scalar \(\lambda\), called the eigenvalue. Example of a non-eigenvector (the result is not a multiple of the input vector): \[ \begin{pmatrix} 2 & 3\\ 2 & 1\\ \end{pmatrix} \times \begin{pmatrix} 1\\ 3 \end{pmatrix} = \begin{pmatrix} 2\cdot1+3\cdot3\\ 2\cdot1+1\cdot3 \end{pmatrix} = \begin{pmatrix} 11\\ 5 \end{pmatrix} \] Eigenvector, with eigenvalue 4: \[ \begin{pmatrix} 2 & 3\\ 2 & 1\\ \end{pmatrix} \times \begin{pmatrix} 3\\ 2 \end{pmatrix} = \begin{pmatrix} 12\\ 8 \end{pmatrix} = 4 \times \begin{pmatrix} 3\\ 2 \end{pmatrix} \]
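R's eigen() recovers the same pair, as a quick check of the example above:

A <- matrix(c(2, 3, 2, 1), nrow = 2, byrow = TRUE)
e <- eigen(A)
e$values               # 4 and -1; 4 is the eigenvalue from the example
e$vectors[, 1]         # proportional to (3, 2): eigen() scales eigenvectors to unit length
A %*% e$vectors[, 1] - e$values[1] * e$vectors[, 1]   # (numerically) the zero vector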

### Principal Component Analysis

# Source: https://www.datacamp.com/community/tutorials/pca-analysis-r

# PCA makes it possible to visualize data sets with many variables
# PCA is a linear transformation that reduces many variables to a smaller number of "principal components".

### Example 1 "mtcars"
mtcars.pca <- prcomp(mtcars[, c(1:7, 10, 11)], center = TRUE, scale. = TRUE)   # drop the binary columns vs and am

summary(mtcars.pca)
str(mtcars.pca)
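# Tying sections 38.1 and 38.2 together: with scale. = TRUE the principal
# components are the eigenvectors of the correlation matrix of the data, and
# the component standard deviations are the square roots of its eigenvalues.
# A quick check (eigenvectors are defined only up to sign, hence abs()):
e <- eigen(cor(mtcars[, c(1:7, 10, 11)]))
round(abs(e$vectors) - abs(mtcars.pca$rotation), 10)   # all (near) zero
rbind(sqrt(e$values), mtcars.pca$sdev)                 # identical rows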

library(devtools)
# ggbiplot is not on CRAN; install it from GitHub once
install_github("vqv/ggbiplot")
library(ggbiplot)

ggbiplot(mtcars.pca)
ggbiplot(mtcars.pca, labels=rownames(mtcars))

# groups= below needs a grouping variable that mtcars itself does not contain;
# following the DataCamp tutorial, assign each car a country of origin:
mtcars.country <- c(rep("Japan", 3), rep("US", 4), rep("Europe", 7), rep("US", 3), "Europe", rep("Japan", 3), rep("US", 4), rep("Europe", 3), "US", rep("Europe", 3))

ggbiplot(mtcars.pca, ellipse = TRUE, choices = c(3, 4), labels = rownames(mtcars), groups = mtcars.country)   # plot PC3 vs PC4
ggbiplot(mtcars.pca, ellipse = TRUE, circle = TRUE, labels = rownames(mtcars), groups = mtcars.country)
ggbiplot(mtcars.pca, ellipse = TRUE, obs.scale = 1, var.scale = 1, labels = rownames(mtcars), groups = mtcars.country)
ggbiplot(mtcars.pca, ellipse = TRUE, obs.scale = 1, var.scale = 1, var.axes = FALSE, labels = rownames(mtcars), groups = mtcars.country)

### Example 2 "USArrests"

?USArrests               # Violent Crime Rates by US State
dim(USArrests)
dimnames(USArrests)

apply(USArrests, 2, mean)   # column means of all variables
# the variables have very different variances, so standardize before PCA
pca.out <- prcomp(USArrests, scale = TRUE)
pca.out
summary(pca.out)
names(pca.out)
biplot(pca.out, scale = 0, cex = 0.65)
# Lab 1 for Chapter 10: Principal component analysis

states=row.names(USArrests)
states
names(USArrests)
apply(USArrests, 2, mean)
apply(USArrests, 2, var)
pr.out=prcomp(USArrests, scale=TRUE)
names(pr.out)
pr.out$center
pr.out$scale
pr.out$rotation
dim(pr.out$x)
biplot(pr.out, scale=0)
# flip the signs: principal components are unique only up to sign
pr.out$rotation=-pr.out$rotation
pr.out$x=-pr.out$x
biplot(pr.out, scale=0)
pr.out$sdev
pr.var=pr.out$sdev^2       # variance explained by each component
pr.var
pve=pr.var/sum(pr.var)     # proportion of variance explained (PVE)
pve
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained", ylim=c(0,1),type='b')
plot(cumsum(pve), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained", ylim=c(0,1),type='b')
a=c(1,2,8,-3)
cumsum(a)   # cumsum() returns running totals, as used for the cumulative PVE plot above


# Lab 2 for Chapter 10: Clustering

# K-means clustering

set.seed(2)
x=matrix(rnorm(50*2), ncol=2)
x[1:25,1]=x[1:25,1]+3   # shift the first 25 observations so the
x[1:25,2]=x[1:25,2]-4   # data contain two true clusters
km.out=kmeans(x,2,nstart=20)
km.out$cluster
plot(x, col=(km.out$cluster+1), main="K-Means Clustering Results with K=2", xlab="", ylab="", pch=20, cex=2)
set.seed(4)
km.out=kmeans(x,3,nstart=20)
km.out
plot(x, col=(km.out$cluster+1), main="K-Means Clustering Results with K=3", xlab="", ylab="", pch=20, cex=2)
set.seed(3)
km.out=kmeans(x,3,nstart=1)
km.out$tot.withinss
km.out=kmeans(x,3,nstart=20)
km.out$tot.withinss
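
# A common heuristic for choosing K (a sketch, not part of the original lab):
# run kmeans for several values of K and plot tot.withinss; the "elbow" of
# the curve suggests a reasonable number of clusters.
wss <- sapply(1:6, function(k) kmeans(x, k, nstart = 20)$tot.withinss)
plot(1:6, wss, type = "b", xlab = "K", ylab = "Total within-cluster SS")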

# Hierarchical clustering

hc.complete=hclust(dist(x), method="complete")
hc.average=hclust(dist(x), method="average")
hc.single=hclust(dist(x), method="single")
par(mfrow=c(1,3))
plot(hc.complete,main="Complete Linkage", xlab="", sub="", cex=.9)
plot(hc.average, main="Average Linkage", xlab="", sub="", cex=.9)
plot(hc.single, main="Single Linkage", xlab="", sub="", cex=.9)
cutree(hc.complete, 2)
cutree(hc.average, 2)
cutree(hc.single, 2)
cutree(hc.single, 4)
xsc=scale(x)   # scale the features before clustering
plot(hclust(dist(xsc), method="complete"), main="Hierarchical Clustering with Scaled Features")
x=matrix(rnorm(30*3), ncol=3)   # three features, so a correlation-based distance is meaningful
dd=as.dist(1-cor(t(x)))         # correlation-based distance between observations
plot(hclust(dd, method="complete"), main="Complete Linkage with Correlation-Based Distance", xlab="", sub="")


# Lab 3 for Chapter 10: Analysis of the NCI60 data

# The NCI60 data

library(ISLR)
nci.labs=NCI60$labs
nci.data=NCI60$data
dim(nci.data)
nci.labs[1:4]
table(nci.labs)

# PCA applied to the NCI60 data

pr.out=prcomp(nci.data, scale=TRUE)
Cols=function(vec){
    # assign one rainbow() color to each distinct label
    cols=rainbow(length(unique(vec)))
    return(cols[as.numeric(as.factor(vec))])
}
par(mfrow=c(1,2))
plot(pr.out$x[,1:2], col=Cols(nci.labs), pch=19,xlab="Z1",ylab="Z2")
plot(pr.out$x[,c(1,3)], col=Cols(nci.labs), pch=19,xlab="Z1",ylab="Z3")
summary(pr.out)
plot(pr.out)
pve=100*pr.out$sdev^2/sum(pr.out$sdev^2)
par(mfrow=c(1,2))
plot(pve,  type="o", ylab="PVE", xlab="Principal Component", col="blue")
plot(cumsum(pve), type="o", ylab="Cumulative PVE", xlab="Principal Component", col="brown3")

# Clustering the observations of the NCI60 data set

sd.data=scale(nci.data)
par(mfrow=c(1,3))
data.dist=dist(sd.data)
plot(hclust(data.dist), labels=nci.labs, main="Complete Linkage", xlab="", sub="",ylab="")
plot(hclust(data.dist, method="average"), labels=nci.labs, main="Average Linkage", xlab="", sub="",ylab="")
plot(hclust(data.dist, method="single"), labels=nci.labs,  main="Single Linkage", xlab="", sub="",ylab="")
hc.out=hclust(dist(sd.data))
hc.clusters=cutree(hc.out,4)
table(hc.clusters,nci.labs)
par(mfrow=c(1,1))
plot(hc.out, labels=nci.labs)
abline(h=139, col="red")   # cutting at this height gives the four clusters above
hc.out
set.seed(2)
km.out=kmeans(sd.data, 4, nstart=20)
km.clusters=km.out$cluster
table(km.clusters,hc.clusters)
hc.out=hclust(dist(pr.out$x[,1:5]))
plot(hc.out, labels=nci.labs, main="Hier. Clust. on First Five Score Vectors")
table(cutree(hc.out,4), nci.labs)

38.2.1 t-SNE - t-distributed Stochastic Neighbor Embedding

t-SNE is a nonlinear dimensionality-reduction method: unlike the linear projection of PCA, it embeds high-dimensional observations in two or three dimensions while trying to keep similar observations close together.

library(Rtsne)

# data loading: the source tutorial uses the Kaggle MNIST train.csv (a label column plus 784 pixel columns)
train <- read.csv("~/github/dar/012_t-SNE/DATA/train.csv")

head(train)[1:20]   # first 20 columns of the first rows
dim(train)

## get labels as factors
labels <- train$label
train$label<-as.factor(train$label)

## for plotting
colors = rainbow(length(unique(train$label)))
names(colors) = unique(train$label)

## Executing the algorithm on the curated data
## (t-SNE is stochastic, so set a seed for reproducible output)
set.seed(42)   # assumption: any fixed seed will do
tsne <- Rtsne(train[,-1], dims = 2, perplexity = 30, verbose = TRUE, max_iter = 500)

## Execution time (this re-runs the whole algorithm once more, just to time it)
exeTimeTsne <- system.time(Rtsne(train[,-1], dims = 2, perplexity = 30, verbose = TRUE, max_iter = 500))

## Plotting
plot(tsne$Y, t='n', main="tsne")
text(tsne$Y, labels=train$label, col=colors[train$label])

Sources

https://www.analyticsvidhya.com/blog/2017/01/t-sne-implementation-r-python/