Chapter 32 Models for binary Data
For binary dependent variables build models:
1. LR - Logistic regression
2. LDA - Linear discriminant analysis
3. QDA - quadrat discriminant analysis
How to detect the border of decision, use ROC-curves
library('ISLR')
library('GGally')
library('MASS')
my.seed <- 12345
train.percent <- 0.85
options("ggmatrix.progress.bar" = FALSE)
# Data set from ISLR: is credits returned Y/N dependent on: student, average balance and income.
head(Default)
str(Default)
### Primary analysis
# Scatter plots for primary analysis
ggp <- ggpairs(Default)
print(ggp, progress = FALSE)
# rate of variables in default column
table(Default$default) / sum(table(Default$default))
### Train data subset
set.seed(my.seed)
inTrain <- sample(seq_along(Default$default),
nrow(Default)*train.percent)
df <- Default[inTrain, ]
# actual values on the train data
fact <- df$default
tail(fact, n=20)
### 1. Logistic regression modeling
model.logit <- glm(default ~ balance, data = df, family = 'binomial')
summary(model.logit)
# prognosis of probability belong to the class 'Yes'
p.logit <- predict(model.logit, df, type = 'response')
prognosis <- factor(ifelse(p.logit > 0.5, 2, 1),
levels = c(1, 2),
labels = c('No', 'Yes'))
# confusion matrix
conf.m <- table(fact, prognosis)
conf.m
# sensitivity
conf.m[2, 2] / sum(conf.m[2, ])
# spesifisity
conf.m[1, 1] / sum(conf.m[1, ])
# accuracy
sum(diag(conf.m)) / sum(conf.m)
###. 2. LDA model
model.lda <- lda(default ~ balance, data = Default[inTrain, ])
model.lda
# prognosis of probability belong to the class 'Yes'
p.lda <- predict(model.lda, df, type = 'response')
prognosis <- factor(ifelse(p.lda$posterior[, 'Yes'] > 0.5,
2, 1),
levels = c(1, 2),
labels = c('No', 'Yes'))
# confusion matrix
conf.m <- table(fact, prognosis)
conf.m
conf.m[2, 2] / sum(conf.m[2, ]) # sensitivity
conf.m[1, 1] / sum(conf.m[1, ]) # specitivity
sum(diag(conf.m)) / sum(conf.m) # accuracy
### 3. QDA modeling
model.qda <- qda(default ~ balance, data = Default[inTrain, ])
model.qda
# prognosis of probability belong to the class 'Yes'
p.qda <- predict(model.qda, df, type = 'response')
Прогноз <- factor(ifelse(p.qda$posterior[, 'Yes'] > 0.5,
2, 1),
levels = c(1, 2),
labels = c('No', 'Yes'))
# confusion matrix
conf.m <- table(Факт, Прогноз)
conf.m
conf.m[2, 2] / sum(conf.m[2, ]) # sensitivity
conf.m[1, 1] / sum(conf.m[1, ]) # specitivity
sum(diag(conf.m)) / sum(conf.m) # accuracy
### 4. ROC-curve and detection of decision border
# calculate 1-SPC and TPR for all varients for clipping boundary
x <- NULL # for (1 - SPC)
y <- NULL # for TPR
# template for the confusion matrix
tbl <- as.data.frame(matrix(rep(0, 4), 2, 2))
rownames(tbl) <- c('fact.No', 'fact.Yes')
colnames(tbl) <- c('predict.No', 'predict.Yes')
# vector of probabilities
p.vector <- seq(0, 1, length = 501)
# go through the vector of probabilities
for (p in p.vector){
# prognosis
prognosis <- factor(ifelse(p.lda$posterior[, 'Yes'] > p,
2, 1),
levels = c(1, 2),
labels = c('No', 'Yes'))
# data frame containing fact and prognosis
df.compare <- data.frame(fact = fact, prognosis = prognosis)
# fill the confusion matrix
tbl[1, 1] <- nrow(df.compare[df.compare$fact == 'No' & df.compare$prognosis == 'No', ])
tbl[2, 2] <- nrow(df.compare[df.compare$fact == 'Yes' & df.compare$prognosis == 'Yes', ])
tbl[1, 2] <- nrow(df.compare[df.compare$fact == 'No' & df.compare$prognosis == 'Yes', ])
tbl[2, 1] <- nrow(df.compare[df.compare$fact == 'Yes' & df.compare$prognosis == 'No', ])
# calculate characteristics
TPR <- tbl[2, 2] / sum(tbl[2, 2] + tbl[2, 1])
y <- c(y, TPR)
SPC <- tbl[1, 1] / sum(tbl[1, 1] + tbl[1, 2])
x <- c(x, 1 - SPC)
}
# build ROC-curve
par(mar = c(5, 5, 1, 1))
# curve
plot(x, y, type = 'l', col = 'blue', lwd = 3,
xlab = '(1 - SPC)', ylab = 'TPR',
xlim = c(0, 1), ylim = c(0, 1))
# line for the classifier
abline(a = 0, b = 1, lty = 3, lwd = 2)
# point for the probability 0.5
points(x[p.vector == 0.5], y[p.vector == 0.5], pch = 16)
text(x[p.vector == 0.5], y[p.vector == 0.5], 'p = 0.5', pos = 4)
# point for the probability 0.2
points(x[p.vector == 0.2], y[p.vector == 0.2], pch = 16)
text(x[p.vector == 0.2], y[p.vector == 0.2], 'p = 0.2', pos = 4)
prognosis <- factor(ifelse(p.lda$posterior[, 'Yes'] > 0.2,
2, 1),
levels = c(1, 2),
labels = c('No', 'Yes'))
conf.m <- table(fact, prognosis)
conf.m
# sensitivity
conf.m[2, 2] / sum(conf.m[2, ])
# specificity
conf.m[1, 1] / sum(conf.m[1, ])
# accuracy
sum(diag(conf.m)) / sum(conf.m)Sources
Course ‘Math modeling’ practical work, State University of Management, Moscow