Chapter 2 Statistics R functions reference
2.1 Get data
2.2 Data inspection
head(df)
typeof(df) # data type
dim(df) # dimensions (rows, columns)
nrow(df) # number of rows
ncol(df) # number of columns
str(df) # data structure
summary(df) # data summary
names(df) # names of columns
colnames(df) # also column names
table(df) # frequency of categorical data
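For example, on the built-in mtcars data frame (the dataset choice is only for illustration; any data frame works the same way):
df <- mtcars # built-in example data frame
head(df) # first six rows
dim(df) # 32 rows, 11 columns
str(df) # column types and sample values
table(df$cyl) # frequency table of one column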
2.3 Plots
plot(y ~ x) # scatter plot of y against x
barplot(table(v)) # bar plot of a frequency table (takes a vector or matrix, not a data frame)
boxplot(v)
hist(x)
pie(groupsize, labels, col, ...)
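A minimal plotting sketch, again assuming mtcars (column names are illustrative only):
plot(mpg ~ wt, data=mtcars) # scatter plot of mpg against weight
hist(mtcars$mpg) # histogram of a numeric vector
boxplot(mpg ~ cyl, data=mtcars) # box plots of mpg by number of cylinders
barplot(table(mtcars$cyl)) # bar plot of category counts
pie(table(mtcars$cyl)) # pie chart of the same counts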
2.4 Analysis of the distribution
# mode
getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
getmode(v) # mode
mean(v) # mean
mean(v, trim=0.1) # trimmed mean
median(v) # median
min(v)
max(v)
range(v) # c(min(v), max(v))
max(v)-min(v) # range
sort(v)
rank(v)
sample(v) # random permutation of v
sample(v, size) # random subsample of the given size
var(v) # variance
sd(v) # standard deviation
cor(x, y) # correlation of two vectors (or cor(m) for a matrix)
cov(x, y) # covariance
scale(v) # z-scores
quantile(v)
IQR(v) # interquartile range: IQR = Q3 - Q1
qqnorm(v) # normal probability plot
qqline(v) # adds a line to a normal probability plot passing through the first and third quartiles
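A short worked example of the statistics above on a made-up vector (the values are illustrative only):
v <- c(2, 4, 4, 4, 5, 5, 7, 9)
getmode(v) # 4 - most frequent value
mean(v) # 5
median(v) # 4.5
sd(v) # standard deviation
quantile(v) # quartiles: 0%, 25%, 50%, 75%, 100%
IQR(v) # Q3 - Q1
scale(v) # z-scores: (v - mean(v)) / sd(v)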
2.5 Distributions
d - probability density
p - cumulative distribution
q - quantile function (inverse cumulative distribution)
r - random variables from distribution (a worked example of all four prefixes follows at the end of this section)
# Normal distribution
dnorm(x, mean, sd) # Probability Density Function (PDF)
pnorm(q, mean, sd) # Cumulative Distribution Function (CDF)
qnorm(p, mean, sd) # quantile function - inverse of pnorm
rnorm(n, mean, sd) # random numbers from normal distribution
# Binomial
dbinom(x) # Probability Density Function (PDF)
pbinom(q) # Cumulative Distribution Function (CDF)
qbinom(p) # quantile function - inverse of pnorm
rbinom(n) # random numbers from normal distribution
# Poisson
dpois(x, lambda)
ppois(q, lambda)
qpois(p, lambda)
rpois(n, lambda)
# Exponential
dexp(x, rate)
pexp(q, rate)
qexp(p, rate)
rexp(n, rate)
# Chi-squared distribution
dchisq(x, df) # density
pchisq(q, df) # cumulative distribution (area under the curve up to q)
qchisq(p, df) # quantiles
rchisq(n, df) # random deviates
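A sketch of how the d/p/q/r prefixes fit together, using the standard normal distribution (mean 0 and sd 1 are the defaults):
dnorm(0) # density at 0: ~0.399
pnorm(1.96) # P(X <= 1.96): ~0.975
qnorm(0.975) # inverse of pnorm: ~1.96
rnorm(5) # five random draws from N(0, 1)
pnorm(1) - pnorm(-1) # ~0.68, probability of being within one sd of the mean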
2.6 t-Test
t.test(x, mu = 0, alternative = c("two.sided", "less", "greater"),
paired = FALSE, var.equal = FALSE, conf.level = 0.95)
t.test(v, mu) # one-sample t-test, mu - null hypothesized value
t.test(v1, v2) # two-sample t-test
t.test(v1, v2, var.equal=T) # two-sample t-test with pooled variance (Student's)
t.test(v1, v2, paired=T) # paired t-test
wilcox.test(v1, v2, paired=T) # Wilcoxon signed-rank test (non-parametric alternative)
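A worked t-test example with simulated data (the groups and parameters are made up for illustration):
set.seed(1)
v1 <- rnorm(30, mean=5, sd=1)
v2 <- rnorm(30, mean=6, sd=1)
t.test(v1, mu=5) # one-sample test against mu = 5
t.test(v1, v2) # Welch two-sample test (unequal variances)
t.test(v1, v2, var.equal=TRUE) # Student two-sample test (pooled variance)
wilcox.test(v1, v2) # non-parametric alternative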
2.7 ANOVA
# One way ANOVA
oneway.test(x ~ f)
aov(x ~ f)
anova(m1, m2) # compare two models
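A minimal one-way ANOVA sketch on the built-in iris data (response and factor chosen only for illustration):
m <- aov(Sepal.Length ~ Species, data=iris) # fit one-way ANOVA
summary(m) # F statistic and p-value
oneway.test(Sepal.Length ~ Species, data=iris) # Welch version, no equal-variance assumption
TukeyHSD(m) # pairwise post-hoc comparisons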
2.8 Machine Learning Functions Reference
2.8.1 Linear Regression
lm_model <- lm(y ~ x1 + x2, data=as.data.frame(cbind(y,x1,x2)))
summary(lm_model)
lm(y ~ x1 + x2 + x3) # multiple linear regression
lm(log(y) ~ x) # log transformed
lm(sqrt(y) ~ x) # sqrt transformed
lm(y ~ log(x)) # predictor transformed
lm(log(y) ~ log(x)) # both response and predictor transformed (log-log)
lm(y ~ .) # use all fields for regression model
lm(y ~ x + 0) # forced zero intercept
lm(y ~ x*k) # main effects plus interaction (expands to x + k + x:k)
lm(y ~ x + k + x:k) # same model written out explicitly; x:k is the interaction term
lm(y ~ (x + k + ... + l)^2) # all first order interactions
lm(y ~ I(x1 + x2)) # sum of variables
lm(y ~ I(x1^2)) # square of x1 (arithmetic inside I(), not an interaction)
lm(y ~ x + I(x^2) + I(x^3)) # polynomial regression
lm(y ~ poly(x,3)) # same as previous
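A compact example of a few of these formula variants on the built-in cars data (columns speed and dist; the dataset choice is an assumption):
m1 <- lm(dist ~ speed, data=cars) # simple linear regression
m2 <- lm(dist ~ speed + I(speed^2), data=cars) # quadratic term via I()
m3 <- lm(dist ~ poly(speed, 2), data=cars) # same idea with poly()
summary(m1) # coefficients, R-squared, p-values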
# Forward/backward stepwise regression
# improve model
fit <- lm(y ~ x1 + x2)
bwd.fit <- step(fit, direction = 'backward')
fwd.fit <- step(fit, direction = 'forward', scope = ~ x1 + x2)
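A stepwise-selection sketch on mtcars (the starting model and scope are assumptions):
full <- lm(mpg ~ ., data=mtcars) # start from all predictors
bwd <- step(full, direction="backward") # drop terms by AIC
null <- lm(mpg ~ 1, data=mtcars) # intercept-only model
fwd <- step(null, direction="forward", scope = ~ wt + hp + cyl + disp) # add terms from this scope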
# Test linear model
plot(m) # diagnostic plots of the residuals
car::outlierTest(m) # test for outliers (car package)
lmtest::dwtest(m) # Durbin-Watson test of the model residuals
# Prediction
predicted_values <- predict(lm_model, newdata=as.data.frame(cbind(x1_test, x2_test)))
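For example, a self-contained prediction sketch on the built-in cars data (the new speeds are hypothetical):
m <- lm(dist ~ speed, data=cars)
new_speeds <- data.frame(speed=c(10, 20, 30)) # hypothetical new observations
predict(m, newdata=new_speeds) # point predictions
predict(m, newdata=new_speeds, interval="prediction") # with 95% prediction intervals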
# Apriori
library(arules)
dataset <- read.csv("C:\\Datasets\\mushroom.csv", header = TRUE)
mushroom_rules <- apriori(as.matrix(dataset), parameter = list(supp = 0.8, conf = 0.9))
summary(mushroom_rules)
inspect(mushroom_rules)
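A sketch with the Groceries transaction data shipped with arules (the support and confidence thresholds are assumptions):
library(arules)
data(Groceries) # example transaction data from the arules package
rules <- apriori(Groceries, parameter = list(supp = 0.01, conf = 0.5))
summary(rules)
inspect(sort(rules, by = "lift")[1:5]) # five rules with the highest lift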
# Logistic Regression
glm()
glm_mod <- glm(y ~ x1 + x2, family=binomial(link="logit"), data=as.data.frame(cbind(y,x1,x2)))
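A minimal sketch on mtcars, treating am (transmission) as the binary outcome (the predictors are an assumption):
glm_mod <- glm(am ~ wt + hp, family=binomial(link="logit"), data=mtcars)
summary(glm_mod) # coefficients on the log-odds scale
predict(glm_mod, type="response") # fitted probabilities in [0, 1]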
# K-Means Clustering
kmeans_model <- kmeans(x=X, centers=m)
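For instance, on the numeric columns of iris (k = 3 is an assumption):
set.seed(1)
km <- kmeans(x=iris[, 1:4], centers=3) # three clusters on the four numeric columns
km$centers # cluster centroids
table(km$cluster, iris$Species) # compare clusters with the known species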
# k-Nearest Neighbor Classification
library(class)
knn_model <- knn(train=X_train, test=X_test, cl=as.factor(labels), k=K)
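A sketch splitting iris into train and test sets (the split and k are assumptions):
library(class)
set.seed(1)
idx <- sample(nrow(iris), 100) # random train/test split
pred <- knn(train=iris[idx, 1:4], test=iris[-idx, 1:4], cl=iris$Species[idx], k=5)
table(pred, iris$Species[-idx]) # confusion matrix on the held-out rows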
# Naïve Bayes
library(e1071)
nB_model <- naiveBayes(y ~ x1 + x2, data=as.data.frame(cbind(y,x1,x2)))
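A sketch on iris (the formula and data are an assumption):
library(e1071)
nb <- naiveBayes(Species ~ ., data=iris) # class-conditional model per feature
predict(nb, iris[1:5, ]) # predicted classes for the first five rows
predict(nb, iris[1:5, ], type="raw") # posterior class probabilities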
# Decision Trees (CART)
library(rpart)
# Tree-based models
rpart(formula, data, method, control)
formula: outcome ~ predictor1 + predictor2 + predictor3 + etc.
data: data frame
method: 'class' for a classification tree, 'anova' for a regression tree
control: optional parameters for controlling tree growth, e.g.
  minsplit = 50 - minimal number of observations in a node
  cp = 0.001 - cost complexity factor
cart_model <- rpart(y ~ x1 + x2, data=as.data.frame(cbind(y,x1,x2)), method="class")
printcp(fit) # display cp table
plotcp(fit) # plot cross-validation results
rsq.rpart(fit) # plot approximate R-squared and relative error for different splits (2 plots). labels are only appropriate for the "anova" method.
print(fit) # print results
summary(fit) # detailed results including surrogate splits
plot(fit) # plot decision tree
text(fit) # label the decision tree plot
post(fit, file=) # create postscript plot of decision tree
plot.rpart(cart_model)
text.rpart(cart_model)
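A short classification-tree sketch on the kyphosis data shipped with rpart (the control values are assumptions):
library(rpart)
fit <- rpart(Kyphosis ~ Age + Number + Start, data=kyphosis, method="class",
             control=rpart.control(minsplit=10, cp=0.01))
printcp(fit) # complexity parameter table
plot(fit); text(fit) # draw and label the tree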
# AdaBoost
# boosting functions - uses decision trees as base classifiers
library(rpart)
library(ada)
# Let X be the matrix of features, and labels be a vector of 0-1 class labels.
boost_model <- ada(x=X, y=labels)
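A sketch with simulated two-class data (the data-generating process is made up for illustration):
library(rpart)
library(ada)
set.seed(1)
X <- matrix(rnorm(200 * 2), ncol=2) # 200 observations, 2 features
labels <- as.integer(X[, 1] + X[, 2] + rnorm(200, sd=0.3) > 0) # 0-1 class labels
boost_model <- ada(x=X, y=labels) # boosted classification trees
summary(boost_model) # training error by iteration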
# Support Vector Machines (SVM)
library(e1071)
# Let X be the matrix of features, and labels be a vector of 0-1 class labels. Let the regularization parameter be C. Use the following commands to generate the SVM model and view details:
svm_model <- svm(x=X, y=as.factor(labels), kernel="radial", cost=C)
summary(svm_model)
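For example, on iris with a radial kernel (cost = 1 is an assumption):
library(e1071)
svm_model <- svm(x=iris[, 1:4], y=iris$Species, kernel="radial", cost=1)
summary(svm_model) # kernel, cost, number of support vectors
predict(svm_model, iris[1:5, 1:4]) # predicted classes for the first five rows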