Chapter 30 Spline model
30.1 Splines
30.2 Area under the curve using spline method
Area under the plasma drug concentration-time curve
In the field of pharmacokinetics, the area under the curve (AUC) is the definite integral in a plot of drug concentration in blood plasma vs. time. In practice, the drug concentration is measured at certain discrete points in time and the trapezoidal rule is used to estimate AUC.
We can find spline curve and calculate area under the curve to improve the result and compair with trapezoidal method. The AUC represents the total drug exposure over time (Wiki)
.
Data: Dependency of concentration (mg/L) from time (h).
Aim: Plot curve and calculate the area under the curve
library(MESS)
<- c(0.5,1,2,4,8,12,18,24,36,48,72)
time <- c(5.36,9.35,17.18,25.78,29.78,26.63,19.4,13.26,5.88,2.56,0.49)
conc
<- smooth.spline(time, conc)
mod
<- predict(mod, x=seq(from=min(time), to=max(time), by=(max(time)-min(time))/((length(time)*10))))
predict.xy
plot(conc ~ time, pch=19, xlab='Time, h', ylab='Concentration, mg/L')
lines(time, conc, col='green')
lines(predict.xy, col='red')
# Calculate area under the curve (auc)
# trapezoidal method
::auc(time, conc, from=min(time), to=max(time), type='linear') MESS
## [1] 721.9925
# spline with given data
::auc(time, conc, from=min(time), to=max(time), type='spline') MESS
## [1] 712.7034
# trapezoidal method with expanded set from predicted spline
::auc(predict.xy[[1]], from=min(time), to=max(time), predict.xy[[2]], type='linear') MESS
## [1] 712.4289
# spline from expanded with spline
::auc(predict.xy[[1]], from=min(time), to=max(time), predict.xy[[2]], type='spline') MESS
## [1] 712.7106
30.3 Set data using given function and predict curve using spline method
Aim: Generate random data with noise \(y = f(x) + \epsilon\) from function \(f(x)=4-0.02x+0.0055x^2-4.9*10^{-5}x^3; \epsilon ~ N(0,1)\)
Make a model using spline method.
<- 1
my.seed <- 100 # amount of elements
n.all
# set x values
set.seed(my.seed)
<-5
x.min <- 105
x.max <- runif(n.all, x.min, x.max) # set rundom x from 5 to 105 with n elements
x
# noise
set.seed(my.seed)
<-1 # noise standard deviation
noise.sd <- rnorm(mean=0, sd=noise.sd, n=n.all)
noise
# subset train data
set.seed(my.seed)
<- 0.85 # percent of train data
train.percent <- sample(seq_along(x), size=(train.percent*n.all))
inTrain
# true function y=f(x)
<- function(x) {4 - 2e-02*x + 5.5e-03*x^2 - 4.9e-05*x^3}
y.func
# for plot of true values
<- seq(x.min, x.max, length=n.all)
x.line <- y.func(x.line)
y.line
# true function with noise
<- y.func(x) + noise
y
# train values
<- x[inTrain]
x.train <- y[inTrain]
y.train
# test values
<- x[-inTrain]
x.test <- y[-inTrain]
y.test
# plot
<- c(x.min, x.max)
x.lim <- c(min(y), max(y))
y.lim
plot(x.train, y.train,
col = grey(0.2), bg = grey(0.2), pch = 21,
xlab = 'X', ylab = 'Y',
xlim = x.lim, ylim = y.lim,
cex = 1.2, cex.lab = 1.2, cex.axis = 1.2)
# test data
points(x.test, y.test, col = 'red', bg = 'red', pch = 21)
# true function
lines(x.line, y.line, lwd = 2, lty = 2)
# legend
legend('topleft', legend = c('train', 'test', 'f(X)'),
pch = c(16, 16, NA),
col = c(grey(0.2), 'red', 'black'),
lty = c(0, 0, 2), lwd = c(1, 1, 2), cex = 1.2)
## Splain model As a model of given data we will use splains with degree of freedom from 2 (line) to 60 (number of knots is equal 2/3 from number of values).
# build spline model with degrees of freedom df=6
<- smooth.spline(x = x.train, y = y.train, df = 6)
mod
# model values for calculation of errors
<- predict(mod, data.frame(x = x.train))$y[, 1]
y.model.train <- predict(mod, data.frame(x = x.test))$y[, 1]
y.model.test
# sum of squared errors
<- c(sum((y.train - y.model.train)^2) / length(x.train),
MSE sum((y.test - y.model.test)^2) / length(x.test))
names(MSE) <- c('train', 'test')
round(MSE, 2)
## train test
## 0.71 1.00
# build models with degree of freedoms from df 2 to 60
# максимальное число степеней свободы для модели сплайна
<- 60
max.df <- data.frame(df = 2:max.df) # таблица для записи ошибок
tbl $MSE.train <- 0 # столбец: ошибки на обучающей выборке
tbl$MSE.test <- 0 # столбец: ошибки на тестовой выборке
tbl
# for each degree of freedom
for (i in 2:max.df) {
<- smooth.spline(x = x.train, y = y.train, df = i)
mod
# model for train and test data
<- predict(mod, data.frame(x = x.train))$y[, 1]
y.model.train <- predict(mod, data.frame(x = x.test))$y[, 1]
y.model.test
# errors for train and learn data
<- c(sum((y.train - y.model.train)^2) / length(x.train),
MSE sum((y.test - y.model.test)^2) / length(x.test))
$df == i, c('MSE.train', 'MSE.test')] <- MSE
tbl[tbl
}
head(tbl)
## df MSE.train MSE.test
## 1 2 3.6484333 3.3336892
## 2 3 1.5185881 1.1532857
## 3 4 0.8999800 0.8874002
## 4 5 0.7477105 0.9483290
## 5 6 0.7127908 1.0038393
## 6 7 0.7000429 1.0300354
# Diagram: dependence of MSE from moedel flexibility
plot(x = tbl$df, y = tbl$MSE.test,
type = 'l', col = 'red', lwd = 2,
xlab = 'Degree of freedom', ylab = 'MSE',
ylim = c(min(tbl$MSE.train, tbl$MSE.test),
max(tbl$MSE.train, tbl$MSE.test)),
cex = 1.2, cex.lab = 1.2, cex.axis = 1.2)
# head of plot
mtext('Dependence of MSE from degree of freedom', side = 3)
points(x = tbl$df, y = tbl$MSE.test, pch = 21, col = 'red', bg = 'red')
lines(x = tbl$df, y = tbl$MSE.train, col = grey(0.3), lwd = 2)
# неустранимая ошибка
abline(h = noise.sd, lty = 2, col = grey(0.4), lwd = 2)
# легенда
legend('topleft', legend = c('обучающая', 'тестовая'),
pch = c(NA, 16),
col = c(grey(0.2), 'red'),
lty = c(1, 1), lwd = c(2, 2), cex = 1.2)
# степени свободы у наименьшей ошибки на тестовой выборке
<- min(tbl$MSE.test)
min.MSE.test <- tbl[tbl$MSE.test == min.MSE.test, 'df']
df.min.MSE.test
# компромисс между точностью и простотой модели по графику
<- 6
df.my.MSE.test <- tbl[tbl$df == df.my.MSE.test, 'MSE.test']
my.MSE.test
# ставим точку на графике
abline(v = df.my.MSE.test, lty = 2, lwd = 2)
points(x = df.my.MSE.test, y = my.MSE.test, pch = 15, col = 'blue')
mtext(df.my.MSE.test, side = 1, line = -1, at = df.my.MSE.test, col = 'blue', cex = 1.2)
# График 3: Лучшая модель (компромисс между гибкостью и точностью) ############
<- smooth.spline(x = x.train, y = y.train, df = df.my.MSE.test)
mod.MSE.test
# для гладких графиков модели
<- seq(x.min, x.max, length = 250)
x.model.plot <- predict(mod.MSE.test, data.frame(x = x.model.plot))$y[, 1]
y.model.plot
# убираем широкие поля рисунка
par(mar = c(4, 4, 1, 1))
# наименьшие/наибольшие значения по осям
<- c(x.min, x.max)
x.lim <- c(min(y), max(y))
y.lim
# наблюдения с шумом (обучающая выборка)
plot(x.train, y.train,
col = grey(0.2), bg = grey(0.2), pch = 21,
xlab = 'X', ylab = 'Y',
xlim = x.lim, ylim = y.lim,
cex = 1.2, cex.lab = 1.2, cex.axis = 1.2)
# head of the plot
mtext('Initial data and the best fit model', side = 3)
# test values
points(x.test, y.test,
col = 'red', bg = 'red', pch = 21)
# true function
lines(x.line, y.line, lwd = 2, lty = 2)
# model
lines(x.model.plot, y.model.plot, lwd = 2, col = 'blue')
# legend
legend('topleft', legend = c('train', 'test', 'f(X)', 'model'),
pch = c(16, 16, NA, NA),
col = c(grey(0.2), 'red', 'black', 'blue'),
lty = c(0, 0, 2, 1), lwd = c(1, 1, 2, 2), cex = 1.2)
In interpolating problems, spline interpolation is often preferred to polynomial interpolation because it yields similar results, even when using low degree polynomials, while avoiding Runge’s phenomenon for higher degrees.
=======
In this example we will generate data from a given function and then build a model using splines and estimate quality of the model.
30.4 Generate dataset from a given function
# parameters to generate a dataset
<- 100 # number of observations
n.all <- 0.85 # portion of the data for training
train.percent <- 1 # standard deviation of noise
res.sd <- 5 # min limit of the data
x.min <- 105 # max limit of the data
x.max
# generate x
set.seed(1) # to get reproducible results by randomizer
<- runif(x.min, x.max, n = n.all)
x
# noise from normal destibution
set.seed(1)
<- rnorm(mean = 0, sd = res.sd, n = n.all)
res
# generate y using a given function
<- function(x) {4 - 2e-02*x + 5.5e-03*x^2 - 4.9e-05*x^3}
y.func
# add noise
<- y.func(x) + res y
30.5 Split data for train and test
# split dataset for training and test
set.seed(1)
# generate vector of chosen x for train data
<- sample(seq_along(x), size = train.percent*n.all)
inTrain
# train data set
<- x[inTrain]
x.train <- y[inTrain]
y.train
# test data set
<- x[-inTrain]
x.test <- y[-inTrain] y.test
30.6 Diagram of the given function and generated datasets
# lines of generated data for plot
<- seq(x.min, x.max, length = n.all)
x.line <- y.func(x.line)
y.line
# PLOT
# generate plot by train data
par(mar = c(4, 4, 1, 1)) # reduce margins (optional)
plot(x.train, y.train,
main = 'Generated data and original function',
col = grey(0.2), bg = grey(0.2), pch = 21,
xlab = 'X', ylab = 'Y',
xlim = c(x.min, x.max),
ylim = c(min(y), max(y)),
cex = 1.2, cex.lab = 1.2, cex.axis = 1.2)
# add points of test data
points(x.test, y.test, col = 'red', bg = 'red', pch = 21)
# add the given function
lines(x.line, y.line, lwd = 2, lty = 2)
# add legend
legend('topleft', legend = c('train', 'test', 'f(X)'),
pch = c(16, 16, NA),
col = c(grey(0.2), 'red', 'black'),
lty = c(0, 0, 2), lwd = c(1, 1, 2), cex = 1.2)
30.7 Build a model using splines
We will compair sevaral models with degree of freedoms (df) from 2 to 40, where 2 correspond to a linear model.
<- 40 # max degree of freedom (df)
max.df #
<- data.frame(df = 2:max.df) # data frame for writing errors
tbl $MSE.train <- 0 # column 1: errors of train data
tbl$MSE.test <- 0 # сcolumn 2: errors of test data
tbl
# generate models using for cycle
for (i in 2:max.df) {
<- smooth.spline(x = x.train, y = y.train, df = i)
mod
# predicted values for train and test data using built model
<- predict(mod, data.frame(x = x.train))$y[, 1]
y.model.train <- predict(mod, data.frame(x = x.test))$y[, 1]
y.model.test
# MSE errors for train and test data
<- c(sum((y.train - y.model.train)^2) / length(x.train),
MSE sum((y.test - y.model.test)^2) / length(x.test))
# write errors to the previously created data frame
$df == i, c('MSE.train', 'MSE.test')] <- MSE
tbl[tbl
}
# view first rows of the table
head(tbl, 4)
## df MSE.train MSE.test
## 1 2 3.6484333 3.3336892
## 2 3 1.5185881 1.1532857
## 3 4 0.8999800 0.8874002
## 4 5 0.7477105 0.9483290
30.8 Diagram of MSE for train and test data
# plot MSE from our table
plot(x = tbl$df, y = tbl$MSE.test,
main = "Changes of MSE from degrees of freedom",
type = 'l', col = 'red', lwd = 2,
xlab = 'spline degree of freedom', ylab = 'MSE',
ylim = c(min(tbl$MSE.train, tbl$MSE.test),
max(tbl$MSE.train, tbl$MSE.test)),
cex = 1.2, cex.lab = 1.2, cex.axis = 1.2)
# add
points(x = tbl$df, y = tbl$MSE.test,
pch = 21, col = 'red', bg = 'red')
lines(x = tbl$df, y = tbl$MSE.train, col = grey(0.3), lwd = 2)
# minimal MSE
abline(h = res.sd, lty = 2, col = grey(0.4), lwd = 2)
# add legend
legend('topright', legend = c('train', 'test'),
pch = c(NA, 16),
col = c(grey(0.2), 'red'),
lty = c(1, 1), lwd = c(2, 2), cex = 1.2)
# df of minimal MSE for test data
<- min(tbl$MSE.test)
min.MSE.test <- tbl[tbl$MSE.test == min.MSE.test, 'df']
df.min.MSE.test
# optimal df for precise model and maximal simplicity
<- 6
df.my.MSE.test <- tbl[tbl$df == df.my.MSE.test, 'MSE.test']
my.MSE.test
# show the optimal solution
abline(v = df.my.MSE.test,
lty = 2, lwd = 2)
points(x = df.my.MSE.test, y = my.MSE.test,
pch = 15, col = 'blue')
mtext(df.my.MSE.test,
side = 1, line = -1, at = df.my.MSE.test, col = 'blue', cex = 1.2)
30.9 Build optimal model and plot for the model
<- smooth.spline(x = x.train, y = y.train, df = df.my.MSE.test)
mod.MSE.test
# predict data for 250 x's to get smoothed curve
<- seq(x.min, x.max, length = 250)
x.model.plot <- predict(mod.MSE.test, data.frame(x = x.model.plot))$y[, 1]
y.model.plot
# plot train data
par(mar = c(4, 4, 1, 1))
plot(x.train, y.train,
main = "Initial data and the best fit model",
col = grey(0.2), bg = grey(0.2), pch = 21,
xlab = 'X', ylab = 'Y',
xlim = c(x.min, x.max),
ylim = c(min(y), max(y)),
cex = 1.2, cex.lab = 1.2, cex.axis = 1.2)
# add test data
points(x.test, y.test, col = 'red', bg = 'red', pch = 21)
# function
lines(x.line, y.line,lwd = 2, lty = 2)
# add model
lines(x.model.plot, y.model.plot, lwd = 2, col = 'blue')
# legend
legend('topleft', legend = c('train', 'test', 'f(X)', 'model'),
pch = c(16, 16, NA, NA),
col = c(grey(0.2), 'red', 'black', 'blue'),
lty = c(0, 0, 2, 1), lwd = c(1, 1, 2, 2), cex = 1.2)