Chapter 32 Models for binary Data
For binary dependent variables build models:
1. LR - Logistic regression
2. LDA - Linear discriminant analysis
3. QDA - quadrat discriminant analysis
How to detect the border of decision, use ROC-curves
library('ISLR')
library('GGally')
library('MASS')
<- 12345
my.seed <- 0.85
train.percent options("ggmatrix.progress.bar" = FALSE)
# Data set from ISLR: is credits returned Y/N dependent on: student, average balance and income.
head(Default)
str(Default)
### Primary analysis
# Scatter plots for primary analysis
<- ggpairs(Default)
ggp print(ggp, progress = FALSE)
# rate of variables in default column
table(Default$default) / sum(table(Default$default))
### Train data subset
set.seed(my.seed)
<- sample(seq_along(Default$default),
inTrain nrow(Default)*train.percent)
<- Default[inTrain, ]
df # actual values on the train data
<- df$default
fact tail(fact, n=20)
### 1. Logistic regression modeling
<- glm(default ~ balance, data = df, family = 'binomial')
model.logit summary(model.logit)
# prognosis of probability belong to the class 'Yes'
<- predict(model.logit, df, type = 'response')
p.logit <- factor(ifelse(p.logit > 0.5, 2, 1),
prognosis levels = c(1, 2),
labels = c('No', 'Yes'))
# confusion matrix
<- table(fact, prognosis)
conf.m
conf.m
# sensitivity
2, 2] / sum(conf.m[2, ])
conf.m[
# spesifisity
1, 1] / sum(conf.m[1, ])
conf.m[
# accuracy
sum(diag(conf.m)) / sum(conf.m)
###. 2. LDA model
<- lda(default ~ balance, data = Default[inTrain, ])
model.lda
model.lda
# prognosis of probability belong to the class 'Yes'
<- predict(model.lda, df, type = 'response')
p.lda <- factor(ifelse(p.lda$posterior[, 'Yes'] > 0.5,
prognosis 2, 1),
levels = c(1, 2),
labels = c('No', 'Yes'))
# confusion matrix
<- table(fact, prognosis)
conf.m
conf.m
2, 2] / sum(conf.m[2, ]) # sensitivity
conf.m[1, 1] / sum(conf.m[1, ]) # specitivity
conf.m[sum(diag(conf.m)) / sum(conf.m) # accuracy
### 3. QDA modeling
<- qda(default ~ balance, data = Default[inTrain, ])
model.qda
model.qda# prognosis of probability belong to the class 'Yes'
<- predict(model.qda, df, type = 'response')
p.qda <- factor(ifelse(p.qda$posterior[, 'Yes'] > 0.5,
Прогноз 2, 1),
levels = c(1, 2),
labels = c('No', 'Yes'))
# confusion matrix
<- table(Факт, Прогноз)
conf.m
conf.m
2, 2] / sum(conf.m[2, ]) # sensitivity
conf.m[1, 1] / sum(conf.m[1, ]) # specitivity
conf.m[sum(diag(conf.m)) / sum(conf.m) # accuracy
### 4. ROC-curve and detection of decision border
# calculate 1-SPC and TPR for all varients for clipping boundary
<- NULL # for (1 - SPC)
x <- NULL # for TPR
y
# template for the confusion matrix
<- as.data.frame(matrix(rep(0, 4), 2, 2))
tbl rownames(tbl) <- c('fact.No', 'fact.Yes')
colnames(tbl) <- c('predict.No', 'predict.Yes')
# vector of probabilities
<- seq(0, 1, length = 501)
p.vector
# go through the vector of probabilities
for (p in p.vector){
# prognosis
<- factor(ifelse(p.lda$posterior[, 'Yes'] > p,
prognosis 2, 1),
levels = c(1, 2),
labels = c('No', 'Yes'))
# data frame containing fact and prognosis
<- data.frame(fact = fact, prognosis = prognosis)
df.compare
# fill the confusion matrix
1, 1] <- nrow(df.compare[df.compare$fact == 'No' & df.compare$prognosis == 'No', ])
tbl[2, 2] <- nrow(df.compare[df.compare$fact == 'Yes' & df.compare$prognosis == 'Yes', ])
tbl[1, 2] <- nrow(df.compare[df.compare$fact == 'No' & df.compare$prognosis == 'Yes', ])
tbl[2, 1] <- nrow(df.compare[df.compare$fact == 'Yes' & df.compare$prognosis == 'No', ])
tbl[
# calculate characteristics
<- tbl[2, 2] / sum(tbl[2, 2] + tbl[2, 1])
TPR <- c(y, TPR)
y <- tbl[1, 1] / sum(tbl[1, 1] + tbl[1, 2])
SPC <- c(x, 1 - SPC)
x
}# build ROC-curve
par(mar = c(5, 5, 1, 1))
# curve
plot(x, y, type = 'l', col = 'blue', lwd = 3,
xlab = '(1 - SPC)', ylab = 'TPR',
xlim = c(0, 1), ylim = c(0, 1))
# line for the classifier
abline(a = 0, b = 1, lty = 3, lwd = 2)
# point for the probability 0.5
points(x[p.vector == 0.5], y[p.vector == 0.5], pch = 16)
text(x[p.vector == 0.5], y[p.vector == 0.5], 'p = 0.5', pos = 4)
# point for the probability 0.2
points(x[p.vector == 0.2], y[p.vector == 0.2], pch = 16)
text(x[p.vector == 0.2], y[p.vector == 0.2], 'p = 0.2', pos = 4)
<- factor(ifelse(p.lda$posterior[, 'Yes'] > 0.2,
prognosis 2, 1),
levels = c(1, 2),
labels = c('No', 'Yes'))
<- table(fact, prognosis)
conf.m
conf.m
# sensitivity
2, 2] / sum(conf.m[2, ])
conf.m[# specificity
1, 1] / sum(conf.m[1, ])
conf.m[# accuracy
sum(diag(conf.m)) / sum(conf.m)
Sources
Course ‘Math modeling’ practical work, State University of Management, Moscow