# Chapter 9 Primary data analysis - Case studies
# Abbakumov, 2016, lectures
# GET DATA
# Read the Swiss banknote table. The call declares a header row, a single
# space as field separator and a comma as decimal mark.
df <- read.table("./DATA/Swiss_Bank_Notes.csv",
                 header = TRUE, sep = " ", dec = ",")
# Show the first rows to check that the file was parsed as expected.
head(df)
# Data explanation: parameters of Swiss Banknotes
# Size of data: 200 (100 are real, 100 are false)
# Length - length
# H_l - height left
# H_r - height right
# dist_l - border left
# dist_up - border up
# Diag - diagonal
# Task: find the false banknotes
# 1. Add a label column `origin`: 1 for the first 100 rows, 0 for the rest.
origin <- 0
# The scalar 0 is recycled by data.frame() to every row of df.
df <- data.frame(df, origin)
df$origin[1:100] <- 1
# Set origin as factor - binary data (0, 1)
df$origin <- as.factor(df$origin)
# Sanity check: should print TRUE.
is.factor(df$origin)
# 2. HISTOGRAM
# One histogram per numeric column. hist() stops with "'x' must be numeric"
# on anything else, so non-numeric columns (e.g. a factor label column) are
# skipped instead of aborting the loop.
numeric_cols <- names(df)[vapply(df, is.numeric, logical(1))]
# Two panels per row; ceiling() keeps the row count whole when the number of
# columns is odd, and max() guards against a zero-row layout.
par(mfrow = c(max(1, ceiling(length(numeric_cols) / 2)), 2))
for (col_name in numeric_cols) {
  # Label the x axis with the column name rather than the deparsed expression.
  hist(df[[col_name]], main = col_name, xlab = col_name)
}
# Histogram for Diagonals
par(mfrow = c(1, 1))
# probability = TRUE plots densities instead of counts.
hist(df$Diag, breaks = 18, probability = TRUE)
# Barplot
# Grouped bars (beside = TRUE) of the built-in VADeaths matrix with a legend.
# `legend.text` is spelled out in full instead of relying on partial
# matching of `legend`.
barplot(VADeaths, beside = TRUE, legend.text = TRUE, ylim = c(0, 100),
        ylab = "Deaths per 1000", main = "Death rates in Virginia")
# Pieplot
# Slice sizes and their labels, in matching order.
groupsize <- c(18, 30, 32, 10, 10)
labels <- c("A", "B", "C", "D", "F")
# One colour per slice, in the same order as `groupsize`.
pie(groupsize, labels, col = c("red", "white", "grey", "black", "blue"))
# All pairs of data scatter plot
plot(df)
# Length ~ Diag
plot(df$Length, df$Diag)
# Overlay each group at its own (Length, Diag) coordinates so the markers
# land on the points of the plot above.
# NOTE(review): assumes origin == 1 marks the true notes - confirm against
# the data file.
is_true <- df$origin == 1
is_false <- df$origin == 0
# true notes
points(df$Length[is_true], df$Diag[is_true], pch = 3, col = "green")
# false notes
points(df$Length[is_false], df$Diag[is_false], pch = 1, col = "red")
# With a factor as x and a numeric y, plot() draws one boxplot per level.
plot(x = df$origin, y = df$Diag)
# Add the main title to the plot that was just drawn.
title(main = "Swiss Bank Notes")
# GET DATA - TOWNS
# Read the tab-separated towns table (header row, "." as decimal mark).
town <- read.table("DATA/town_1959_2.csv",
                   header = TRUE, sep = "\t", dec = ".")
# Print the whole table, then a per-column summary.
town
summary(town)
# Median is more stable to outliers
summary(town[, 3])
# Remove the first 2 towns from the data. A negative index drops them
# whatever the number of rows, instead of hard-coding the last row (1004).
summary(town[-(1:2), 3])
hist(town[, 3])
# log scale allows us to see outliers better
hist(log(town[, 3]), breaks = 50)
# Truncated mean is better than mean: trim = 0.05 drops 5% of the
# observations from each end before averaging.
mean(town[, 3], trim = 0.05)
# GET DATA - BASKETBALL
# Read the semicolon-separated file. It has no header row, so read.table()
# names the columns V1, V2, ...
bb <- read.table("DATA/basketball.csv",
                 header = FALSE, sep = ";", dec = ".")
# Print the whole table.
bb
# NBA Player characteristics:
# percent of positives (column 1) vs. position (column 2):
# SF - small forward
# PF - power forward
# C - center
# G - guard
# Column 1 holds the numeric measure, column 2 the position label.
pct <- bb[, 1]
# Convert explicitly: since R 4.0 read.table() returns character columns by
# default, and the formula plot below needs a factor to draw boxplots.
position <- as.factor(bb[, 2])
summary(pct)
par(mfrow = c(1, 1))
# One boxplot of the measure per position.
plot(pct ~ position, xlab = "Position", ylab = "Percent of positives")
par(mfrow = c(2, 2))
# Same data with 5, 10, 15 and 20 suggested breaks, to show how the bin
# count changes the picture.
for (k in seq_len(4)) {
  hist(pct, breaks = 5 * k, main = paste("Breaks", 5 * k),
       xlab = "Percent of positives", ylab = "")
}
# A shared x range (data range padded by 5) makes the per-position
# histograms directly comparable; it is computed once, outside the loop.
x_range <- c(min(pct) - 5, max(pct) + 5)
for (pos in unique(bb[, 2])) {
  hist(pct[bb[, 2] == pos], breaks = 6, xlim = x_range, col = "white",
       main = pos, xlab = "Percent of positives", ylab = "")
}
# Conclusion: for several groups of data boxplots may be more informative
# than histograms