Chapter 31 Logistic Regression
The logistic model (or logit model) is a statistical model whose input (independent variable) is a continuous variable and whose output (dependent variable) is a binary variable (a discrete choice, e.g. yes/no or 1/0).
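Under the hood, the model passes a linear combination of the inputs, z = b0 + b1*x, through the logistic (sigmoid) function p = 1/(1 + exp(-z)), which maps any real number to a probability in (0, 1). A minimal base-R sketch of that curve (purely illustrative, not fitted to any data):

curve(plogis(x), from = -6, to = 6, xlab = "z", ylab = "p = 1/(1+exp(-z))")
abline(h = 0.5, lty = 2)  # p = 0.5 corresponds to z = 0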
31.1 Confusion matrix
A confusion matrix is a table that is often used to describe the performance of a classification model (or “classifier”) on a set of test data for which the true values are known.
| n = 165     | Predicted: No | Predicted: Yes |
|-------------|---------------|----------------|
| Actual: No  | TN = 50       | FP = 10        |
| Actual: Yes | FN = 5        | TP = 100       |
TN - true negatives
TP - true positives
FN - false negatives
FP - false positives
Accuracy: Overall, how often is the classifier correct? (TP+TN)/total = (100+50)/165 = 0.91
Misclassification Rate: Overall, how often is it wrong? (FP+FN)/total = (10+5)/165 = 0.09; equivalent to 1 - Accuracy; also known as “Error Rate”
True Positive Rate: When it’s actually yes, how often does it predict yes? TP/actual yes = 100/105 = 0.95; also known as “Sensitivity” or “Recall”
False Positive Rate: When it’s actually no, how often does it predict yes? FP/actual no = 10/60 = 0.17
True Negative Rate: When it’s actually no, how often does it predict no? TN/actual no = 50/60 = 0.83; equivalent to 1 minus False Positive Rate; also known as “Specificity”
Precision: When it predicts yes, how often is it correct? TP/predicted yes = 100/110 = 0.91
Prevalence: How often does the yes condition actually occur in our sample? actual yes/total = 105/165 = 0.64
Null Error Rate: This is how often you would be wrong if you always predicted the majority class. (In our example, the null error rate would be 60/165=0.36 because if you always predicted yes, you would only be wrong for the 60 “no” cases.) This can be a useful baseline metric to compare your classifier against. However, the best classifier for a particular application will sometimes have a higher error rate than the null error rate, as demonstrated by the Accuracy Paradox.
Cohen’s Kappa: This is essentially a measure of how well the classifier performed compared to how well it would have performed simply by chance. In other words, a model will have a high Kappa score if there is a big difference between the accuracy and the null error rate.
F Score: This is the harmonic mean of precision and the true positive rate (recall): F1 = 2 * (precision * recall) / (precision + recall).
ROC Curve: This is a commonly used graph that summarizes the performance of a classifier over all possible thresholds. It is generated by plotting the True Positive Rate (y-axis) against the False Positive Rate (x-axis) as you vary the threshold for assigning observations to a given class.
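All of these rates follow directly from the four cells of the confusion matrix. A short R sketch using the counts from the example table above:

TN <- 50; FP <- 10; FN <- 5; TP <- 100
total <- TN + FP + FN + TP
c(accuracy    = (TP + TN) / total,  # 0.91
  error.rate  = (FP + FN) / total,  # 0.09
  recall      = TP / (TP + FN),     # true positive rate, 0.95
  fpr         = FP / (FP + TN),     # false positive rate, 0.17
  specificity = TN / (TN + FP),     # true negative rate, 0.83
  precision   = TP / (TP + FP),     # 0.91
  prevalence  = (TP + FN) / total)  # 0.64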
Example:
A group of 20 students spend between 0 and 6 hours studying for an exam. How does the number of hours spent studying affect the probability that the student will pass the exam?
# data
hours <- c(0.5, 0.75, 1, 1.25, 1.5, 1.75, 1.75, 2, 2.25, 2.5, 2.75, 3, 3.25, 3.5, 4, 4.25, 4.5, 4.75, 5, 5.5)
pass <- c(0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,1,1,1,1,1)

# model
model <- glm(pass ~ hours, family = binomial)

# predicted probabilities on a fine grid of hours
newdat <- data.frame(hours = seq(min(hours), max(hours), len = 100))
newdat$pass <- predict(model, newdata = newdat, type = "response")

# plot
plot(pass ~ hours)
lines(pass ~ hours, newdat, col = "red")
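The fitted model can then be queried at specific study times to answer the opening question directly; a small sketch (the hour values are arbitrary choices, and the resulting probabilities depend on the fit above):

# predicted probability of passing after 1, 3 and 5 hours of study
predict(model, newdata = data.frame(hours = c(1, 3, 5)), type = "response")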
# data
data <- data.frame(hours = c(0.5, 0.75, 1, 1.25, 1.5, 1.75, 1.75, 2, 2.25, 2.5,
                             2.75, 3, 3.25, 3.5, 4, 4.25, 4.5, 4.75, 5, 5.5),
                   pass = c(0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,1,1,1,1,1),
                   pass.predic = rep(NA, 20), # slot for predicted pass
                   pass.logit = rep(NA, 20))  # slot for the logit prediction

# model
model <- glm(data$pass ~ data$hours, family = binomial)

# predict values of the logit function
data$pass.logit <- predict(model, newdata = data, type = 'response')

# predict pass for threshold = 0.5
data$pass.predic <- ifelse(data$pass.logit > 0.5, 1, 0)

# Confusion matrix
library(caret)
caret::confusionMatrix(data = factor(data$pass.predic),
                       reference = factor(data$pass))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8 2
## 1 2 8
##
## Accuracy : 0.8
## 95% CI : (0.5634, 0.9427)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.005909
##
## Kappa : 0.6
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.8
## Specificity : 0.8
## Pos Pred Value : 0.8
## Neg Pred Value : 0.8
## Prevalence : 0.5
## Detection Rate : 0.4
## Detection Prevalence : 0.5
## Balanced Accuracy : 0.8
##
## 'Positive' Class : 0
##
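The headline accuracy can be re-derived by hand from the data frame:

# accuracy by hand - proportion of matching predictions
mean(data$pass.predic == data$pass)  # 0.8, matching the caret output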
p <- seq(0, 1, by=0.05)
data <- data.frame(probability=p, odds=p/(1-p))
head(data)
## probability odds
## 1 0.00 0.00000000
## 2 0.05 0.05263158
## 3 0.10 0.11111111
## 4 0.15 0.17647059
## 5 0.20 0.25000000
## 6 0.25 0.33333333
plot(data$odds~data$probability, type='o', pch=19, xlab='Probability', ylab='Odds')
plot(log(data$odds)~data$odds, type='o', pch=19, xlab='Odds', ylab='log(odds)')
plot(data$probability~log(data$odds), type='o', pch=19, xlab='log(odds)', ylab='Probability')
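The same conversions are available as built-ins: qlogis() maps a probability to log-odds and plogis() maps back. A quick check with p = 0.75:

pr <- 0.75
pr / (1 - pr)        # odds = 3
qlogis(pr)           # log-odds = log(3), about 1.1
plogis(qlogis(pr))   # back to the probability, 0.75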
31.2 Two predictors and the decision boundary
library(data.table)
df <- fread('https://raw.githubusercontent.com/suvarzz/data/master/data_classification.csv', header=T, sep=",")
head(df)

# plot the two predictors, colored by class
plot(df[pass == 1, !"pass"], col='red')
points(df[pass == 0, !"pass"], col='blue')

model.logit <- glm(pass ~ studied + slept, data = df, family = 'binomial')
summary(model.logit)

p.logit <- predict(model.logit, df, type = 'response')
df$predicted <- ifelse(p.logit > 0.5, 1, 0)
head(df)
# Intercepts of the decision boundary with the coordinate axes
c(a = -coef(model.logit)[1]/coef(model.logit)[2],
  b = -coef(model.logit)[1]/coef(model.logit)[3])

# Predicted probability for studied = 1, slept = 4
b0 <- coef(model.logit)[1]
b1 <- coef(model.logit)[2]
b2 <- coef(model.logit)[3]
z <- b0 + (b1 * 1) + (b2 * 4)
p <- 1 / (1 + exp(-z))
p

# At the decision boundary p = 0.5 => z = 0 => b0 + b1*studied + b2*slept = 0
# => slept = (3.77 - 0.474*studied)/0.338
# The boundary crosses the axes at about (9.2, 0) and (0, 10.87)
segments(9.2, 0, 0, 10.87, lwd = 2)
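Rather than hard-coding the intercepts, the boundary can also be drawn straight from the fitted coefficients; at p = 0.5 the boundary line is slept = -b0/b2 - (b1/b2)*studied. A sketch using the model.logit fit above:

# decision boundary from the fitted coefficients
b <- coef(model.logit)
abline(a = -b[1]/b[3], b = -b[2]/b[3], lwd = 2, lty = 2)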
31.3 ROC curve and threshold selection
# Example of logistic regression
# Source: 17 - Data analysis in R. Logistic regression, by Anatoliy Karpov
# Read data set train.csv:
# Statistics of students in a school
# gender - male/female
# read, write, math - points for subjects
# hon - honorary degree (Y/N)
# FIX combine train and test into one csv file. Split train and test inside this script
setwd("~/RData")
df <- read.csv("train.csv", sep=";")
# Visual inspection of the dataset
head(df)
str(df)
View(df)
# N-not the best mark, Y-the best mark
library(ggplot2)
ggplot(df, aes(read,math,col=gender))+geom_point()+facet_grid(.~hon)+
theme(axis.text=element_text(size=25), axis.title=element_text(size=25, face='bold'))
# Apply logistic regression
# How hon depends on different variables: read, math, gender
fit <- glm(hon ~ read + math + gender, df, family = "binomial")
summary(fit)
# Meaning of the coefficients:
# read estimate: 0.06677 - with gender and math held fixed, a one-point
# increase in read changes ln(odds) by 0.06677
# Odds ratios - exponentiated coefficients
exp(fit$coefficients)
# Predictions on the link scale: ln(odds)
head(predict(object=fit))

# Predictions on the response scale: probability of the best mark for every person
head(predict(object = fit, type = "response"))
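# Sanity check (a sketch): the response-scale prediction is the inverse logit
# (plogis) of the link-scale prediction, so the two calls above should agree
all.equal(plogis(predict(fit)), predict(fit, type = "response"))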
# Add the probability of the best mark for every person to df
df$prob <- predict(object = fit, type = "response")
df
# Part 2
# ROC-curve of predicted model
library(ROCR)
# Predicted values and real values
pred_fit <- prediction(df$prob, df$hon)
# Calculate tpr - true positive rate and fpr - false positive rate
<- performance(pred_fit, "tpr", "fpr")
perf_fit
# plot ROC-curve
plot(perf_fit, colorize=T, print.cutoffs.at = seq(0,1, by=0.1))
# Area under the curve: 0.87
auc <- performance(pred_fit, measure = "auc")
str(auc)
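# The AUC value itself is stored in the y.values slot of the ROCR
# performance object
auc@y.values[[1]]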
# How to choose the cutoff for deciding whether a student gets an honorary degree
# Specificity - how well we can predict negative results
perf3 <- performance(pred_fit, x.measure = "cutoff", measure = "spec")

# Sensitivity - how well we can predict positive results
perf4 <- performance(pred_fit, x.measure = "cutoff", measure = "sens")

# Overall accuracy of the classifier
perf5 <- performance(pred_fit, x.measure = "cutoff", measure = "acc")
plot(perf3, col = 'red', lwd = 2)
plot(add=T, perf4, col = "green", lwd = 2)
plot(add=T, perf5, lwd = 2)
legend(x = 0.6, y = 0.3, c("spec", "sens", "accur"), lty=1, col=c("red", "green", "black"),
bty='n', cex=1, lwd=2)
abline(v=0.255, lwd=2)
# The border is the intersection of all three curves
# Add column with predicted values Y/N
df$pred_resp <- factor(ifelse(df$prob > 0.255, 1, 0), labels=c("N", "Y"))
# 1 if prediction is correct, 0 if not correct
df$correct <- ifelse(df$pred_resp == df$hon, 1, 0)
df
# blue - correctly classified, red - incorrectly classified
# it is harder to predict positive results
ggplot(df, aes(prob, fill = factor(correct)))+
geom_dotplot()+
theme(axis.text=element_text(size=25),
axis.title=element_text(size=25, face="bold"))
# Proportion of correct predictions
mean(df$correct)
# Part 3 - Prediction using test data
<- read.csv("test.csv", sep=";")
test_df
test_df
# Predict honorary members
test_df$prob_predict <- predict(fit, newdata=test_df, type="response")
test_df$pred_resp <- factor(ifelse(test_df$prob_predict > 0.255, 1, 0), labels=c("N", "Y"))
test_df
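# If test.csv also contains the true hon labels (an assumption here - the FIX
# note above suggests train and test were split from one file), the predictions
# can be checked with a simple cross-tabulation:
table(predicted = test_df$pred_resp, actual = test_df$hon)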