Chapter 29 Multiple linear regression

Source: Анализ данных в R. Множественная линейная регрессия (Data Analysis in R: Multiple Linear Regression)

# Data: the `swiss` dataset ----
# Open its help page, then take a working copy as a plain data frame and
# print the structure of that copy.
help("swiss")
swiss <- data.frame(swiss)
str(swiss)

# Distribution of the response variable, Fertility
hist(swiss$Fertility, col = "red")

# Two numeric predictors of Fertility, additive model
fit <- lm(Fertility ~ Examination + Catholic, data = swiss)
summary(fit)
# Examination is the main predictor here: its coefficient is negative,
# i.e. higher Examination goes together with lower Fertility.

# The same predictors plus their interaction.
# In a formula `a * b` is shorthand for `a + b + a:b`.
fit2 <- lm(Fertility ~ Examination * Catholic, data = swiss)
summary(fit2)

# Confidence intervals (default level) for the interaction model
confint(fit2)

# Categorical predictors ----
# The histogram of Catholic clearly falls into two parts, so the variable
# can be recoded as a two-level factor.
hist(swiss$Catholic, col = "red")

# Split by share of Catholics: above 60 -> "Lots", otherwise -> "Few"
swiss$religious <- factor(ifelse(swiss$Catholic > 60, "Lots", "Few"))

# Numeric predictor plus the new factor, additive model
fit3 <- lm(Fertility ~ Examination + religious, data = swiss)
summary(fit3)

# The same two predictors together with their interaction
fit4 <- lm(Fertility ~ religious * Examination, data = swiss)
summary(fit4)

# Plots ----
# ggplot(), aes(), geom_point() and geom_smooth() come from ggplot2, so the
# package has to be attached before this section runs. Attaching it again
# is harmless if it was already loaded.
library(ggplot2)

# Fertility against Examination, all observations together.
# The scatter plot is built once and reused for the smoothed variants.
scatter_all <- ggplot(swiss, aes(x = Examination, y = Fertility)) +
  geom_point()

scatter_all

# Local (loess) smoother. Method and formula are written out explicitly:
# they match what geom_smooth() selects by default for a dataset of this
# size, and stating them avoids the message about the automatic choice.
scatter_all +
  geom_smooth(method = "loess", formula = y ~ x)

# Straight-line (linear model) fit
scatter_all +
  geom_smooth(method = "lm", formula = y ~ x)

# The same three plots, coloured and smoothed separately for each level of
# the `religious` factor (the column must already exist in `swiss`).
scatter_by_group <- ggplot(
  swiss,
  aes(x = Examination, y = Fertility, colour = religious)
) +
  geom_point()

scatter_by_group

scatter_by_group +
  geom_smooth(method = "loess", formula = y ~ x)

scatter_by_group +
  geom_smooth(method = "lm", formula = y ~ x)


# Three predictors with every two-way and the three-way interaction
# (`a * b * c` expands to all main effects and all their interactions)
fit5 <- lm(
  Fertility ~ religious * Infant.Mortality * Examination,
  data = swiss
)
summary(fit5)


# Model comparison ----

# Start again from the pristine dataset: `Fertility ~ .` below uses every
# other column as a predictor, so helper columns added earlier (such as
# `religious`) must not be present. Referring to datasets::swiss directly
# does not depend on rm() unmasking the packaged copy and does not warn
# when no modified `swiss` exists in the workspace.
swiss <- data.frame(datasets::swiss)

# Full model: Fertility explained by all remaining columns
fit_full <- lm(Fertility ~ ., data = swiss)
summary(fit_full)

# Reduced model 1: the full model without Agriculture
fit_reduced1 <- lm(
  Fertility ~ Infant.Mortality + Examination + Catholic + Education,
  data = swiss
)
summary(fit_reduced1)

# F-test of the nested models: a small p-value means the dropped predictor
# (Agriculture) explains a significant share of the variance.
anova(fit_full, fit_reduced1)

# Reduced model 2: the full model without Examination
fit_reduced2 <- lm(
  Fertility ~ Infant.Mortality + Education + Catholic + Agriculture,
  data = swiss
)
summary(fit_reduced2)

# Same test for dropping Examination
anova(fit_full, fit_reduced2)

# Model selection ----
# Backward stepwise selection starting from the full model: step() drops
# terms one at a time while doing so improves the AIC.
optimal_fit <- step(fit_full, direction = "backward")
summary(optimal_fit)