Chapter 23 Estimate model accuracy

23.1 Continuous variables

# Estimate model precision for one continious dependent variable (Y)
# Source: Course 'Math modeling' practical work, State University of Management, Moscow
# link: https://sites.google.com/a/kiber-guu.ru/r-practice/home

### GENERATE DATA
# Generate data for given function
y.func <- function(x) {4 - 2e-02*x + 5.5e-03*x^2 - 4.9e-05*x^3}

### Parameters for data generation
my.seed <- 123         # set seed
n.all <- 100           # number of samples 
train.percent <- 0.85  # percent of train data
res.sd <- 1            # standard deviation of noise 

x.min <- 5             # minimal value
x.max <- 104           # maximal value

# generate random x values
set.seed(my.seed)
x <- runif(x.min, x.max, n = n.all)

# set normally distributed noise for each value
set.seed(my.seed)
res <- rnorm(mean = 0, sd = res.sd, n = n.all)

# calculate y values using funcion and add noise
y <- y.func(x) + res

# vector for subseting train data (contains numbers of selected elements in x)
set.seed(my.seed)
inTrain <- sample(seq_along(x), size = train.percent*n.all)
inTrain

# Subset vectors with train and test data subsets
# Train data
x.train <- x[inTrain]
y.train <- y[inTrain]

# Test data
x.test <- x[-inTrain]
y.test <- y[-inTrain]

# x and y to display given function 'y.func'
x.line <- seq(x.min, x.max, length = n.all)
y.line <- y.func(x.line)

x.line
y.line

### PLOT PRIMARY DATA
#------------------------------------------------------------------------------
# Plot parameters
x.lim <- c(x.min, x.max)
y.lim <- c(min(y), max(y))

# Plot generated train data
plot(x.train, y.train, 
     col = grey(0.2), bg = grey(0.2), pch = 21,
     xlab = 'X', ylab = 'Y', 
     xlim = x.lim, ylim = y.lim, 
     cex = 1.2, cex.lab = 1.2, cex.axis = 1.2)

# Header
mtext('Initial train data and real function', side = 3)

# test values
points(x.test, y.test, col = 'red', bg = 'red', pch = 21)

# true function
lines(x.line, y.line, lwd = 2, lty = 2)

# legend
legend('bottomright', legend = c('train data', 'test data', 'f(X)=4 - 2e-02*x + 5.5e-03*x^2 - 4.9e-05*x^3'),
       pch = c(16, 16, NA), 
       col = c(grey(0.2), 'red', 'black'),  
       lty = c(0, 0, 2), lwd = c(1, 1, 2), cex = 1.2)

#------------------------------------------------------------------------------

### MODEL
# For modeling spline with degree of freedoms from 2 (straight line) to 50 (1/2 of N)
# To demonstrate spline with df = 6
mod <- smooth.spline(x = x.train, y = y.train, df = 6)

# Model data for error estimation
y.model.train <- predict(mod, data.frame(x = x.train))$y[, 1]
y.model.test <- predict(mod, data.frame(x = x.test))$y[, 1]

# Calculate mean squared errors for test and model data
MSE <- c(sum((y.train - y.model.train)^2) / length(x.train),
         sum((y.test - y.model.test)^2) / length(x.test))
names(MSE) <- c('train', 'test')
round(MSE, 2)

# Now let's make splines for all degrees of freedoms (2-50)
# Max degrees of fredom for spline model
max.df <- 50

tbl <- data.frame(df = 2:max.df)   # Create result table for recording calculated MSE 
tbl$MSE.train <- 0                 # Column for train data
tbl$MSE.test <- 0                  # Column for test data

# For all degrees of freedoms
for (i in 2:max.df) {
    # build model
    mod <- smooth.spline(x = x.train, y = y.train, df = i)
    
    # model values from calculaton of errors
    y.model.train <- predict(mod, data.frame(x = x.train))$y[, 1]
    y.model.test <- predict(mod, data.frame(x = x.test))$y[, 1]
    
    # Calculate MSE for predicted and train data sets
    MSE <- c(sum((y.train - y.model.train)^2) / length(x.train),
             sum((y.test - y.model.test)^2) / length(x.test))
    
    # record errors into the table
    tbl[tbl$df == i, c('MSE.train', 'MSE.test')] <- MSE
}

head(tbl)

### PLOT - Dependency of MSEs from model flexibility (degree of freedoms)
#------------------------------------------------------------------------------
plot(x = tbl$df, y = tbl$MSE.test, 
     type = 'l', col = 'red', lwd = 2,
     xlab = 'Degree of freedoms', ylab = 'MSE',
     ylim = c(min(tbl$MSE.train, tbl$MSE.test), 
              max(tbl$MSE.train, tbl$MSE.test)),
     cex = 1.2, cex.lab = 1.2, cex.axis = 1.2)

# Header
mtext('MSE dependency from degree of freedoms', side = 3)

points(x = tbl$df, y = tbl$MSE.test,
       pch = 21, col = 'red', bg = 'red')
lines(x = tbl$df, y = tbl$MSE.train, col = grey(0.3), lwd = 2)
# unrecoverable error
abline(h = res.sd, lty = 2, col = grey(0.4), lwd = 2)

# legend
legend('topright', legend = c('train', 'test'),
       pch = c(NA, 16), 
       col = c(grey(0.2), 'red'),  
       lty = c(1, 1), lwd = c(2, 2), cex = 1.2)

#------------------------------------------------------------------------------

# Detect degree of freedom with minimal error for test data
min.MSE.test <- min(tbl$MSE.test)
df.min.MSE.test <- tbl[tbl$MSE.test == min.MSE.test, 'df']
df.min.MSE.test

# Compromize for flexibility and pricision of model
df.my.MSE.test <- df.min.MSE.test

my.MSE.test <- tbl[tbl$df == df.my.MSE.test, 'MSE.test']
my.MSE.test

# Display optimal df
abline(v = df.my.MSE.test, lty = 2, lwd = 2)
points(x = df.my.MSE.test, y = my.MSE.test, pch = 15, col = 'blue')
mtext(df.my.MSE.test, side = 1, line = -1, at = df.my.MSE.test, col = 'blue', cex = 1.2)

### PLOT OPTIMAL MODEL
mod.MSE.test <- smooth.spline(x = x.train, y = y.train, df = df.my.MSE.test)

# for smoothed curves of model
x.model.plot <- seq(x.min, x.max, length = 250)
y.model.plot <- predict(mod.MSE.test, data.frame(x = x.model.plot))$y[, 1]

x.lim <- c(x.min, x.max)
y.lim <- c(min(y), max(y))

# Plot train data with noise
#------------------------------------------------------------------------------
plot(x.train, y.train, 
     col = grey(0.2), bg = grey(0.2), pch = 21,
     xlab = 'X', ylab = 'Y', 
     xlim = x.lim, ylim = y.lim, 
     cex = 1.2, cex.lab = 1.2, cex.axis = 1.2)

# Header
mtext('Given data and optimal model', side = 3)

# Test data
points(x.test, y.test, col = 'red', bg = 'red', pch = 21)

# True function
lines(x.line, y.line, lwd = 2, lty = 2)

# Model
lines(x.model.plot, y.model.plot, lwd = 2, col = 'blue')

# legend
legend('topleft', legend = c('train', 'test', 'f(X)', 'model'),
       pch = c(16, 16, NA, NA), 
       col = c(grey(0.2), 'red', 'black', 'blue'),  
       lty = c(0, 0, 2, 1), lwd = c(1, 1, 2, 2), cex = 1.2)
#------------------------------------------------------------------------------

23.2 Discret variables

# Estimate model precision for one descrete dependent variable (Y)
# Source: Course 'Math modeling' practical work, State University of Management, Moscow
# link: https://sites.google.com/a/kiber-guu.ru/r-practice/home

library('mlbench')
library('class')
library('car')
library('class')
library('e1071')
library('MASS')

# discrete function
rules <- function(x1, x2){ ifelse((x1 > 20 & x2 < 50) | (x1 < 18 & x2 > 52), 1, 0) }

# parameters
my.seed <- 123
n <- 100
train.percent <- 0.85

# Generate data set
set.seed(my.seed)
x1 <- rnorm(20, 3.7, n = n)
set.seed(my.seed + 1)
x2 <- rnorm(50, 3.3, n = n)

# Vector to subset train and test data
set.seed(my.seed)
inTrain <- sample(seq_along(x1), train.percent*n)

x1.train <- x1[inTrain]
x2.train <- x2[inTrain]

x1.test <- x1[-inTrain]
x2.test <- x2[-inTrain]

# Create descrete parameter according to given function
y.train <- rules(x1.train, x2.train)
y.test <- rules(x1.test, x2.test)
# Table with train data
df.train.1 <- data.frame(x1 = x1.train, x2 = x2.train, y = y.train)
df.train.1
# Table with test data
df.test.1 <- data.frame(x1 = x1.test, x2 = x2.test)
df.test.1

### PLOT TEST DATA
x1.grid <- rep(seq(floor(min(x1)), ceiling(max(x1)), by = 1),
               ceiling(max(x2)) - floor(min(x2)) + 1)
x2.grid <- rep(seq(floor(min(x2)), ceiling(max(x2)), by = 1),
               each = ceiling(max(x1)) - floor(min(x1)) + 1)
# классы для наблюдений сетки
y.grid <- rules(x1.grid, x2.grid)
# фрейм для сетки
df.grid.1 <- data.frame(x1 = x1.grid, x2 = x2.grid, y = y.grid)
# цвета для графиков
cls <- c('blue', 'orange')
cls.t <- c(rgb(0, 0, 1, alpha = 0.5), rgb(1,0.5,0, alpha = 0.5))
# график истинных классов
plot(df.grid.1$x1, df.grid.1$x2, 
     pch = '·', col = cls[df.grid.1[, 'y'] + 1],
     xlab = 'X1', ylab = 'Y1',
     main = 'Обучающая выборка, факт')
# точки фактических наблюдений
points(df.train.1$x1, df.train.1$x2,
       pch = 21, bg = cls.t[df.train.1[, 'y'] + 1], 
       col = cls.t[df.train.1[, 'y'] + 1])

### 1. Build a model using Naive Bayess classifier

nb <- naiveBayes(y ~ ., data = df.train.1)
# получаем модельные значения на обучающей выборке как классы
y.nb.train <- ifelse(predict(nb, df.train.1[, -3], 
                             type = "raw")[, 2] > 0.5, 1, 0)
# график истинных классов
plot(df.grid.1$x1, df.grid.1$x2, 
     pch = '·',  col = cls[df.grid.1[, 'y'] + 1], 
     xlab = 'X1', ylab = 'Y1',
     main = 'Обучающая выборка, модель naiveBayes')
# точки наблюдений, предсказанных по модели
points(df.train.1$x1, df.train.1$x2, 
       pch = 21, bg = cls.t[y.nb.train + 1], 
       col = cls.t[y.nb.train + 1])

# матрица неточностей на обучающей выборке
tbl <- table(y.train, y.nb.train)
tbl

Acc <- sum(diag(tbl)) / sum(tbl)
Acc

cat('Как можно видеть на графике, ту часть жёлтого класса, которая расположена в левой верхней области пространства координат, модель классифицирует неверно. Таким образом, байесовская решающая граница не моделирует разрыв жёлтого класса синим. Это происходит потому, что в непрерывном случае наивный байесовский метод исходит из допущения о линейной разделимости двух классов и нормальности распределения объясняющих переменных в них. Однако в этом примере это допущение не выполняется.
Сделаем прогноз классов Y на тестовую выборку и оценим точность модели. Как можно убедиться, точность на тестовой оказывается ниже, чем на обучающей выборке. Учитывая, как ведёт себя классификатор на обучающей выборке, такой модели доверять не стоит.')

# прогноз на тестовую выборку
y.nb.test <- ifelse(predict(nb, df.test.1, type = "raw")[, 2] > 0.5, 1, 0)
# матрица неточностей на тестовой выборке
tbl <- table(y.test, y.nb.test)
tbl

# точность, или верность (Accuracy)
Acc <- sum(diag(tbl)) / sum(tbl)
Acc

### 2. Build kNN model for k=3
y.knn.train <- knn(train = scale(df.train.1[, -3]), 
                   test = scale(df.train.1[, -3]),
                   cl = df.train.1$y, k = 3)
# график истинных классов
plot(df.grid.1$x1, df.grid.1$x2, 
     pch = '·', col = cls[df.grid.1[, 'y'] + 1],
     xlab = 'X1', ylab = 'Y1',
     main = 'Обучающая выборка, модель kNN')
# точки наблюдений, предсказанных по модели
points(df.train.1$x1, df.train.1$x2, 
       pch = 21, bg = cls.t[as.numeric(y.knn.train)], 
       col = cls.t[as.numeric(y.knn.train)])

# матрица неточностей на обучающей выборке
tbl <- table(y.train, y.knn.train)
tbl

# точность (Accuracy)
Acc <- sum(diag(tbl)) / sum(tbl)
Acc

# прогноз на тестовую выборку
y.knn.test <- knn(train = scale(df.train.1[, -3]), 
                  test = scale(df.test.1[, -3]),
                  cl = df.train.1$y, k = 3)
# матрица неточностей на тестовой выборке
tbl <- table(y.test, y.knn.test)
tbl

# точность (Accuracy)
Acc <- sum(diag(tbl)) / sum(tbl)
Acc

# As we can see, kNN method is more accurate than Naive Bayes for this case.