Chapter 2 Statistics R functions reference

2.1 Get data
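
A minimal sketch of common ways to load data; the file names below are assumptions for illustration:

df <- read.csv("mydata.csv", header = TRUE)                 # read a CSV file into a data frame
df <- read.table("mydata.txt", header = TRUE, sep = "\t")   # tab-separated text file
data(mtcars)                                                # load a built-in example dataset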

2.2 Data inspection

head(df)
typeof(df)  # storage type (a data frame is stored as a list); use class(df) for the class
dim(df)     # dimensions: number of rows and columns
nrow(df)    # number of rows
ncol(df)    # number of columns
str(df)     # data structure
summary(df) # data summary
names(df)   # names of columns
colnames(df) # also column names

table(df$column)   # frequency table of a categorical variable
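
As an illustrative example, with the built-in mtcars data frame (chosen here only for demonstration):

df <- mtcars
dim(df)         # 32 rows, 11 columns
str(df)         # structure: 11 numeric variables
names(df)       # "mpg" "cyl" "disp" ...
table(df$cyl)   # frequency of each cylinder count (4, 6, 8)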

2.3 Plots

plot(y ~ x)                       # scatterplot of y against x
barplot(table(v))                 # bar chart of category counts
boxplot(v)                        # box-and-whisker plot
hist(x)                           # histogram
pie(groupsize, labels, col, ...)  # pie chart with slice sizes, labels and colours
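
For example, with the built-in mtcars data (an illustrative choice):

plot(mpg ~ wt, data = mtcars)       # scatterplot of mpg against weight
hist(mtcars$mpg)                    # histogram of mpg
boxplot(mpg ~ cyl, data = mtcars)   # mpg distribution for each cylinder group
barplot(table(mtcars$cyl))          # bar chart of cylinder counts
pie(table(mtcars$cyl))              # pie chart of the same counts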

2.4 Analysis of the distribution

# mode
getmode <- function(v) {
   uniqv <- unique(v)
   uniqv[which.max(tabulate(match(v, uniqv)))]
}

getmode(v)        # mode
mean(v)           # mean
mean(v, trim=0.1) # trimmed mean
median(v)         # median
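
For example, on a small vector with an outlier (values assumed for illustration):

v <- c(2, 4, 4, 5, 7, 9, 30)
getmode(v)           # 4: the most frequent value
mean(v)              # ~8.71, pulled up by the outlier 30
mean(v, trim = 0.2)  # 5.8: mean after dropping 20% of values from each end
median(v)            # 5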

min(v)
max(v)
range(v)          # c(min(v), max(v))
max(v) - min(v)   # range as a single number

sort(v)
rank(v)
sample(v)         # random permutation of v
sample(v, size)   # random subsample of the given size (without replacement)

var(v)            # variance
sd(v)             # standard deviation
cor(x, y)         # correlation between two vectors (cor(m) for a matrix)
cov(x, y)         # covariance
scale(v)          # z-scores (centred and scaled values)

quantile(v)
IQR(v)            # interquartile range: IQR = Q3 - Q1
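
A small two-variable illustration with mtcars (an assumption for demonstration):

x <- mtcars$wt; y <- mtcars$mpg
var(x)           # variance of car weight
sd(x)            # standard deviation of car weight
cor(x, y)        # ~ -0.87: heavier cars have lower mpg
cov(x, y)        # covariance of weight and mpg (negative)
quantile(y)      # minimum, quartiles and maximum of mpg
IQR(y)           # Q3 - Q1 of mpg
head(scale(y))   # first few z-scores of mpg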

qqnorm(v)         # normal probability plot
qqline(v)         # adds a reference line through the first and third quartiles
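
For example, checking whether simulated data look approximately normal:

v <- rnorm(100)   # 100 draws from a standard normal
qqnorm(v)         # points should fall close to a straight line
qqline(v)         # reference line through the first and third quartiles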

2.5 Distributions

Each distribution has four functions, formed by adding one of these prefixes to its abbreviation:

d - probability density (or mass) function
p - cumulative distribution function
q - quantile function (inverse cumulative distribution)
r - random draws from the distribution

# Normal distribution
dnorm(x, mean, sd)     # Probability Density Function (PDF)
pnorm(q, mean, sd)     # Cumulative Distribution Function (CDF)
qnorm(p, mean, sd)     # quantile function - inverse of pnorm
rnorm(n, mean, sd)     # random numbers from normal distribution
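
A quick numeric check of the d/p/q/r convention on the standard normal (default mean = 0, sd = 1):

pnorm(1.96)    # ~0.975: P(X <= 1.96)
qnorm(0.975)   # ~1.96: inverse of the line above
dnorm(0)       # ~0.399: density at the mean
rnorm(5)       # five random draws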

# Binomial
dbinom(x, size, prob)     # probability mass function (PMF)
pbinom(q, size, prob)     # cumulative distribution function (CDF)
qbinom(p, size, prob)     # quantile function - inverse of pbinom
rbinom(n, size, prob)     # random numbers from the binomial distribution
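
For example, ten fair coin flips (size = 10, prob = 0.5, values chosen for illustration):

dbinom(5, size = 10, prob = 0.5)    # ~0.246: probability of exactly 5 heads
pbinom(5, size = 10, prob = 0.5)    # ~0.623: probability of at most 5 heads
qbinom(0.5, size = 10, prob = 0.5)  # 5: median number of heads
rbinom(3, size = 10, prob = 0.5)    # three simulated head counts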

# Poisson
dpois(x, lambda)     # probability mass function
ppois(q, lambda)     # cumulative distribution function
qpois(p, lambda)     # quantile function - inverse of ppois
rpois(n, lambda)     # random numbers from the Poisson distribution
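
For example, with an average rate of 3 events per interval (lambda = 3, an illustrative value):

dpois(2, lambda = 3)   # ~0.224: probability of exactly 2 events
ppois(2, lambda = 3)   # ~0.423: probability of at most 2 events
rpois(5, lambda = 3)   # five simulated event counts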

# Exponential
dexp(x, rate)        # probability density function
pexp(q, rate)        # cumulative distribution function
qexp(p, rate)        # quantile function - inverse of pexp
rexp(n, rate)        # random numbers from the exponential distribution

# Chi-squared distribution
dchisq(x, df)     # probability density function
pchisq(q, df)     # cumulative distribution function (area under the curve up to q)
qchisq(p, df)     # quantile function - inverse of pchisq
rchisq(n, df)     # random deviates
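
For example, the familiar 5%-level critical value with one degree of freedom (illustrative):

qchisq(0.95, df = 1)   # ~3.84: critical value at the 5% level
pchisq(3.84, df = 1)   # ~0.95: area to the left of 3.84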

2.6 t-Test

t.test(x, mu = 0, alternative = c("two.sided", "less", "greater"), 
       paired = FALSE, var.equal = FALSE, conf.level = 0.95)
t.test(v, mu)                       # one-sample t-test; mu is the null-hypothesis mean
t.test(v1, v2)                      # two-sample (Welch) t-test
t.test(v1, v2, var.equal = TRUE)    # two-sample t-test with pooled variance
t.test(v1, v2, paired = TRUE)       # paired t-test
wilcox.test(v1, v2, paired = TRUE)  # non-parametric (Wilcoxon) alternative
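
An illustrative run on simulated data (means, sds and sample sizes assumed for demonstration):

a <- rnorm(30, mean = 5.0, sd = 1)
b <- rnorm(30, mean = 5.5, sd = 1)
t.test(a, mu = 5)                   # one-sample: is the mean of a equal to 5?
t.test(a, b)                        # Welch two-sample t-test
t.test(a, b, var.equal = TRUE)      # two-sample t-test with pooled variance
wilcox.test(a, b)                   # non-parametric alternative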

2.7 ANOVA

# One way ANOVA
oneway.test(x ~ f)   # one-way ANOVA; f is a grouping factor (Welch correction by default)
aov(x ~ f)           # classical one-way ANOVA model

anova(m1, m2)        # compare two nested models
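
A one-way ANOVA sketch on the built-in chickwts data (chosen for illustration):

oneway.test(weight ~ feed, data = chickwts)   # Welch-type one-way test
aov_fit <- aov(weight ~ feed, data = chickwts)  # classical one-way ANOVA
summary(aov_fit)                              # ANOVA table with the F statistic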

2.8 Machine Learning Functions Reference

2.8.1 Linear Regression

lm_model <- lm(y ~ x1 + x2, data = as.data.frame(cbind(y, x1, x2)))
summary(lm_model)

lm(y ~ x1 + x2 + x3)     # multiple linear regression

lm(log(y) ~ x)           # log transformed
lm(sqrt(y) ~ x)          # sqrt transformed
lm(y ~ log(x))           # predictor transformed
lm(log(y) ~ log(x))      # both response and predictor transformed (log-log)

lm(y ~ ., data = df)     # use all other columns of df as predictors

lm(y ~ x + 0)            # forced zero intercept

lm(y ~ x*k)              # main effects of x and k plus their interaction

lm(y ~ x + k + x:k)      # the same model written out; x:k is the interaction term

lm(y ~ (x + k + ... + l)^2) # all main effects plus all two-way interactions

lm(y ~ I(x1 + x2))       # sum of two variables as a single predictor
lm(y ~ I(x1^2))          # square of x1 (I() protects arithmetic inside a formula)

lm(y ~ x + I(x^2) + I(x^3)) # polynomial regression
lm(y ~ poly(x,3))           # equivalent cubic fit using orthogonal polynomials
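
A short worked example on mtcars (an illustrative choice of data and predictors):

m <- lm(mpg ~ wt + hp, data = mtcars)         # multiple regression of mpg on weight and horsepower
summary(m)                                    # coefficients, R-squared, p-values
m2 <- lm(mpg ~ wt + I(wt^2), data = mtcars)   # quadratic term in weight
m3 <- lm(mpg ~ poly(wt, 2), data = mtcars)    # same fit with orthogonal polynomials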

# Forward/backward stepwise regression
# refine the model by adding or dropping terms to minimize AIC
fit <- lm(y ~ x1 + x2)
bwd.fit <- step(fit, direction = 'backward')
fwd.fit <- step(fit, direction = 'forward', scope = ~ x1 + x2)
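
A runnable variant on mtcars (data and predictors assumed for illustration):

fit  <- lm(mpg ~ wt + hp + disp, data = mtcars)
bwd  <- step(fit, direction = 'backward')                             # drop terms to lower AIC
null <- lm(mpg ~ 1, data = mtcars)                                    # intercept-only start
fwd  <- step(null, direction = 'forward', scope = ~ wt + hp + disp)   # add terms to lower AIC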

# Test linear model
plot(m)              # diagnostic plots of the residuals
car::outlierTest(m)  # Bonferroni outlier test (car package)
lmtest::dwtest(m)    # Durbin-Watson test of the model residuals (lmtest package)

# Prediction
newdata <- data.frame(x1 = x1_test, x2 = x2_test)   # column names must match the model's predictors
predicted_values <- predict(lm_model, newdata = newdata)
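
Continuing the illustrative mtcars fit m from above, prediction for a hypothetical new car:

new_car <- data.frame(wt = 3.0, hp = 150)                # names match the model's predictors
predict(m, newdata = new_car)                            # point prediction of mpg
predict(m, newdata = new_car, interval = "prediction")   # prediction with an interval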

# Apriori (association rules)
library(arules)
dataset <- read.csv("C:\\Datasets\\mushroom.csv", header = TRUE)
mushroom_rules <- apriori(as.matrix(dataset), parameter = list(supp = 0.8, conf = 0.9))
summary(mushroom_rules)     # overview of the mined rule set
inspect(mushroom_rules)     # print the individual rules

# Logistic Regression
glm()     # generalized linear models; logistic regression uses family = binomial
glm_mod <- glm(y ~ x1 + x2, family = binomial(link = "logit"), data = as.data.frame(cbind(y, x1, x2)))
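
A sketch on mtcars, predicting the transmission type am (0/1) from weight (an illustrative choice):

glm_mod <- glm(am ~ wt, family = binomial(link = "logit"), data = mtcars)
summary(glm_mod)                            # coefficients on the log-odds scale
head(predict(glm_mod, type = "response"))   # fitted probabilities of am = 1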

# K-Means Clustering
kmeans_model <- kmeans(x=X, centers=m)
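
For example, clustering the four numeric columns of iris into three groups (an illustrative choice):

km <- kmeans(x = iris[, 1:4], centers = 3)
km$cluster                         # cluster assignment for each row
km$centers                         # cluster centroids
table(km$cluster, iris$Species)    # compare clusters with the true species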

# k-Nearest Neighbor Classification
library(class)
knn_model <- knn(train = X_train, test = X_test, cl = as.factor(labels), k = K)  # returns predicted labels for the test set
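
An illustrative run on iris with a random train/test split (split assumed for demonstration):

idx  <- sample(nrow(iris), 100)                     # 100 random training rows
pred <- knn(train = iris[idx, 1:4], test = iris[-idx, 1:4],
            cl = iris$Species[idx], k = 5)          # predicted species for the test rows
table(pred, iris$Species[-idx])                     # confusion matrix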

# Naive Bayes
library(e1071)
nB_model <- naiveBayes(y ~ x1 + x2, data = as.data.frame(cbind(y, x1, x2)))
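
For example, on the built-in iris data (an illustrative choice), with the e1071 package loaded above:

nb <- naiveBayes(Species ~ ., data = iris)   # class-conditional densities per feature
predict(nb, head(iris))                      # predicted species for the first rows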

# Decision Trees (CART)
library(rpart)
# Tree-based models

rpart(formula, data, method, control)

formula  outcome ~ predictor1 + predictor2 + predictor3 + ...
data     data frame
method   'class' for a classification tree
         'anova' for a regression tree
control  optional parameters for controlling tree growth, e.g.
         minsplit = 50 - minimum number of observations in a node
         cp = 0.001 - cost-complexity factor

cart_model <- rpart(y ~ x1 + x2, data = as.data.frame(cbind(y, x1, x2)), method = "class")
        
printcp(fit)     # display cp table
plotcp(fit)      # plot cross-validation results
rsq.rpart(fit)   # plot approximate R-squared and relative error for different splits (2 plots). labels are only appropriate for the "anova" method.
print(fit)       # print results
summary(fit)     # detailed results including surrogate splits
plot(fit)        # plot decision tree
text(fit)        # label the decision tree plot
post(fit, file=) # create postscript plot of decision tree

plot(cart_model)     # draw the fitted tree
text(cart_model)     # add split labels to the plot
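
A runnable illustration on the built-in iris data (an assumed choice), with the rpart package loaded above:

cart <- rpart(Species ~ ., data = iris, method = "class")   # classification tree
printcp(cart)                                # complexity-parameter table
plot(cart); text(cart)                       # draw and label the tree
predict(cart, head(iris), type = "class")    # predicted classes for the first rows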

# AdaBoost
# boosting function; uses decision trees as base classifiers
library(rpart)
library(ada)
# Let X be the matrix of features, and labels be a vector of 0-1 class labels.
boost_model <- ada(x=X, y=labels)

# Support Vector Machines (SVM)
library(e1071)
# Let X be the matrix of features, and labels be a vector of 0-1 class labels.
# Let the regularization parameter be C. Generate the SVM model and view details:
svm_model <- svm(x=X, y=as.factor(labels), kernel ="radial", cost=C)
summary(svm_model)
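
An illustrative binary example built from iris (0-1 labels constructed here only for demonstration):

X <- as.matrix(iris[, 1:4])
labels <- as.integer(iris$Species == "setosa")   # 1 for setosa, 0 for the other species
svm_model <- svm(x = X, y = as.factor(labels), kernel = "radial", cost = 1)
summary(svm_model)
predict(svm_model, X[1:5, ])                     # predicted labels for the first rows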