Template code and tricks for performing simple and multiple linear regression in R.
library(tidyverse)
library(ggfortify) # plot lm objects using ggplot instead of base R
library(car) # needed for added-variable plots and dfbetas and dffits
library(corrplot)
library(bestglm) # for stepwise methods
library(glmnet) # for ridge, lasso, and elastic net
#to install:
install.packages("package")
#general:
data <- read.table('path.extension', sep = ' ', header = TRUE)
#csv specific:
data <- read.csv('path.extension', sep = ' ')
head(data, n=?) # first n rows of data
summary(data) # mean, etc. of each field
#see also scatterplot
#Correlation:
round(cor(data), digits=?) # numerical matrix of values
cor(data$y, data$x) # numerical single value
corrplot(cor(data), type = "upper") # color/shape-coded matrix
#Simple Linear Regression:
data.base.plot <- ggplot(data = data, mapping = aes(x = x, y = y)) +
geom_point() +
theme_bw() +
theme(aspect.ratio = 1) +
geom_smooth(method = "lm", se = FALSE) # add line-of-best-fit
#Multiple Linear Regression:
pairs(data, pch=19)
#or
plot(data, pch=19)
#pch=19 changes circles to dots in matrix
#Note: the residual plots below require data$residuals (see "Saving residuals" further down)
#boxplot of residuals:
ggplot(data=data, mapping=aes(y=residuals)) +
geom_boxplot() +
theme(aspect.ratio=1)
#histogram of residuals with normal curve overlay:
ggplot(data = data, mapping = aes(x = residuals)) +
geom_histogram(mapping = aes(y = after_stat(density)), binwidth = ?) +
stat_function(fun = dnorm, color = "red", size = 2,
args = list(mean = mean(data$residuals), sd = sd(data$residuals)))
#residuals vs. order in data set (checks independence):
ggplot(data) +
geom_line(mapping = aes(x = 1:nrow(data), y = residuals)) +
theme_bw() +
xlab("Order in Data Set") +
theme(aspect.ratio = 1)
#Note: data must be in some sequential order for this to be appropriate
#Simple linear regression
data.lm <- lm(y ~ x, data=data)
#Multiple linear regression
data.lm <- lm(y ~ ., data=data) #!make sure data.frame is properly subsetted!
#Saving residuals and fitted.values
data$residuals <- data.lm$residuals
data$fitted.values <- data.lm$fitted.values
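# (optional) standardized/studentized residuals are often preferred for diagnostics;
# a sketch using the base-R helpers rstandard() and rstudent():
data$std.residuals <- rstandard(data.lm)
data$stu.residuals <- rstudent(data.lm)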
#Box-Cox: choose a power transformation (lambda) for y:
bc <- boxCox(data.lm)
(Lambda <- bc$x[which.max(bc$y)])
lambda = 1 # lambda = 1 means no transformation; y^lambda is degenerate at lambda = 0, so use log(y) instead (see sketch below)
#transform data:
data$t_y <- data$y^lambda
data$t_x <- data$x^lambda
# also consider sqrt(), log(), etc.
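# a minimal sketch of handling the lambda = 0 case (y^0 is degenerate, so substitute log()):
if (abs(lambda) < 1e-8) {
  data$t_y <- log(data$y)
} else {
  data$t_y <- data$y^lambda
}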
data.t.lm <- lm(t_y ~ t_x, data)
data$t_residuals <- data.t.lm$residuals
data$t_fitted.values <- data.t.lm$fitted.values
summary(data.t.lm)
#residuals vs. fitted values plot:
autoplot(data.lm, which = 1, ncol = 1, nrow = 1) + theme(aspect.ratio=1)
#Simple Linear Regression
ggplot(data=data, mapping=aes(x=predictor, y=residuals)) +
geom_point() +
theme(aspect.ratio=1)
#Multiple Linear Regression # see also miscellaneous/looping in ggplot2
predictors = names(data)[-1] # make sure this only contains predictors
for (predictor in predictors) {
plot <- ggplot(data=data, mapping=aes(x=!!sym(predictor), y=residuals)) +
geom_point() +
theme(aspect.ratio=1)
print(plot)
}
#normal Q-Q plot of residuals:
autoplot(data.lm, which = 2, ncol = 1, nrow = 1) + theme(aspect.ratio=1)
#added-variable plots:
avPlots(data.lm)
#DFBETAS:
#Simple Linear Regression:
data.dfbetas <- as.data.frame(dfbetas(data.lm))
data.dfbetas$obs <- 1:nrow(data)
#plot:
ggplot(data = data.dfbetas) +
geom_point(mapping = aes(x = obs, y = abs(x))) +
ylab("Absolute Value of DFBETAS for X") +
xlab("Observation Number") +
# geom_hline(mapping = aes(yintercept = 1),
# color = "red", linetype = "dashed") + # for n <= 30
geom_hline(mapping = aes(yintercept = 2 / sqrt(length(obs))),
color = "red", linetype = "dashed") + # for n > 30
theme_bw() +
theme(aspect.ratio = 1, plot.title = element_blank())
#print:
#for n <= 30:
x.extreme.dfbetas <- data.dfbetas[abs(data.dfbetas$x) > 1,]
x.extreme.dfbetas[order(x.extreme.dfbetas$x),]
#for n > 30:
x.extreme.dfbetas <- data.dfbetas[abs(data.dfbetas$x) > 2 / sqrt(length(data.dfbetas$obs)),]
x.extreme.dfbetas[order(x.extreme.dfbetas$x),]
#Multiple Linear Regression:
predictors = names(data)[-1] # make sure this only contains predictors
for (predictor in predictors) {
plot <- ggplot(data = data.dfbetas) +
geom_point(mapping = aes(x = obs, y = abs(!!sym(predictor)))) +
ylab(paste("Absolute Value of DFBETAS for ", predictor, sep='')) +
xlab("Observation Number") +
# geom_hline(mapping = aes(yintercept = 1),
# color = "red", linetype = "dashed") + # for n <= 30
geom_hline(mapping = aes(yintercept = 2 / sqrt(length(obs))),
color = "red", linetype = "dashed") + # for n > 30
theme_bw() +
theme(aspect.ratio = 1, plot.title = element_blank())
print(plot)
# for n <= 30:
# print(data.dfbetas[abs(data.dfbetas[predictor]) > 1, c(predictor, 'obs')])
# for n > 30:
print(data.dfbetas[abs(data.dfbetas[predictor]) > 2 / sqrt(length(data.dfbetas$obs)), c(predictor, 'obs')])
}
#DFFITS:
data.dffits <- data.frame("dffits" = dffits(data.lm))
data.dffits$obs <- 1:nrow(data)
#plot:
ggplot(data = data.dffits) +
geom_point(mapping = aes(x = obs, y = abs(dffits))) +
ylab("Absolute Value of DFFITS for Y") +
xlab("Observation Number") +
# geom_hline(mapping = aes(yintercept = 1),
# color = "red", linetype = "dashed") + # for n <= 30
geom_hline(mapping = aes(yintercept = 2 * sqrt(length(data.lm$coefficients) / length(obs))),
color = "red", linetype = "dashed") + # for n > 30
theme_bw() +
theme(aspect.ratio = 1, plot.title = element_blank())
#print:
# for n <= 30
# data.dffits[abs(data.dffits$dffits) > 1,]
# for n > 30
data.dffits[abs(data.dffits$dffits) > 2 * sqrt(length(data.lm$coefficients) / length(data.dffits$obs)),]
#Shapiro-Wilk test for normality of residuals:
shapiro.test(data$residuals)
#Brown-Forsythe test (Levene with median centering) for equal variance:
grp <- as.factor(c(rep("lower", floor(dim(data)[1] / 2)),
rep("upper", ceiling(dim(data)[1] / 2))))
leveneTest(data[order(data$x), "residuals"] ~ grp, center = median)
#Cook's distance (flag observations with D >= 4/n):
data$cooksd <- cooks.distance(data.lm)
data[data$cooksd >= 4 / length(data$cooksd), ]
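# (sketch) plot Cook's distance against observation number with the 4/n cutoff:
ggplot(data = data, mapping = aes(x = seq_along(cooksd), y = cooksd)) +
geom_point() +
geom_hline(yintercept = 4 / nrow(data), color = "red", linetype = "dashed") +
xlab("Observation Number") +
theme(aspect.ratio = 1)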
#variance inflation factors:
(data.vif <- vif(data.lm))
max(data.vif) # rule of thumb: a max VIF above 10 suggests serious multicollinearity
mean(data.vif) # a mean VIF much greater than 1 is also a warning sign
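# note: bestglm() below expects an "Xy" data frame with the response as the LAST column;
# a sketch of reordering, assuming the response column is named y:
data <- data[, c(setdiff(names(data), "y"), "y")]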
#best subsets (exhaustive) selection using BIC:
best.subsets.bic <- bestglm(data,
IC = "BIC",
method = "exhaustive",
TopModels = 10)
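# the top candidate models (up to TopModels) can also be inspected:
best.subsets.bic$BestModels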
# create a data frame with the number of variables and the BIC
best.subsets.bic.df <- data.frame("num.vars" = 0:(nrow(best.subsets.bic$Subsets) - 1),
                                  "BIC" = best.subsets.bic$Subsets$BIC)
# plot the BIC values against the number of variables
sz <- 12 # text size used in the theme() calls below
ggplot(data = best.subsets.bic.df, mapping = aes(x = num.vars, y = BIC)) +
geom_point(size = 3) +
geom_line() +
geom_point(x = best.subsets.bic.df$num.vars[which.min(best.subsets.bic.df$BIC)],
           y = min(best.subsets.bic.df$BIC),
color = "red",
size = 3) +
theme_bw() +
theme(axis.title.x = element_text(size = sz),
axis.title.y = element_text(size = sz),
axis.text.x = element_text(size = sz),
axis.text.y = element_text(size = sz),
aspect.ratio = 1)
# identify the best model according to the BIC
summary(best.subsets.bic$BestModel)
#forward selection using AIC:
forward.selection.aic <- bestglm(data,
                                 IC = "AIC",
                                 method = "forward",
                                 TopModels = 10)
# create a data frame with the number of variables and the AIC
forward.selection.aic.df <- data.frame("num.vars" = 0:(nrow(forward.selection.aic$Subsets) - 1),
                                       "AIC" = forward.selection.aic$Subsets$AIC)
# plot the AIC values against the number of variables
ggplot(data = forward.selection.aic.df, mapping = aes(x = num.vars, y = AIC)) +
geom_point(size = 3) +
geom_line() +
geom_point(x = forward.selection.aic.df$num.vars[which.min(forward.selection.aic.df$AIC)],
           y = min(forward.selection.aic.df$AIC),
color = "red",
size = 3) +
theme_bw() +
theme(axis.title.x = element_text(size = sz),
axis.title.y = element_text(size = sz),
axis.text.x = element_text(size = sz),
axis.text.y = element_text(size = sz),
aspect.ratio = 1)
# identify the best model according to the AIC
summary(forward.selection.aic$BestModel)
#backward selection using cross-validation (folds are random; call set.seed() first for reproducibility):
backward.selection.psme <- bestglm(data,
IC = "CV",
method = "backward",
TopModels = 10)
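# a minimal glmnet sketch for penalized regression (assumes the response column is named y);
# cv.glmnet() chooses the penalty by cross-validation:
X <- model.matrix(y ~ ., data = data)[, -1] # predictor matrix, intercept column dropped
ridge.cv <- cv.glmnet(X, data$y, alpha = 0) # alpha = 0: ridge
lasso.cv <- cv.glmnet(X, data$y, alpha = 1) # alpha = 1: lasso (0 < alpha < 1: elastic net)
coef(lasso.cv, s = "lambda.min") # coefficients at the CV-chosen penalty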
#to get anova output:
data.anova <- aov(data.lm) # avoid naming the object "anova" (shadows the anova() function)
mse <- summary(data.anova)[[1]][2, 2] / summary(data.anova)[[1]][2, 1]
print(paste("Mean Square Error: ", toString(mse), sep = ''))
rmse <- sqrt(mse)
print(paste("Root Mean Square Error: ", toString(rmse), sep = ''))
mae <- sum(abs(data$y - data$fitted.values)) / (length(data$y) - 2)
print(paste("Mean Absolute Error: ", toString(mae), sep = ''))
r_squared <- summary(data.anova)[[1]][1, 2] / (summary(data.anova)[[1]][1, 2] + summary(data.anova)[[1]][2, 2])
print(paste("R-Squared: ", toString(r_squared), sep = ''))
adj_r_squared <- summary(data.lm)$adj.r.squared
print(paste("Adjusted R-Squared: ", toString(adj_r_squared), sep = ''))
f_statistic <- summary(data.lm)$fstatistic
print(paste("F-Statistic: ", toString(f_statistic), sep = ''))
#to get new x values:
new.x <- seq(min(data$x), max(data$x), by=0.05)
#95% confidence interval for the slope:
confint(data.lm, parm = "x")
#t statistic and two-sided p-value for the slope:
(t.stat <- (data.lm$coefficients[2] - 0) / summary(data.lm)$coefficients[2, 2])
2 * pt(abs(t.stat), df = nrow(data) - 2, lower.tail = FALSE)
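# cross-check: the same p-value appears in the regression output:
summary(data.lm)$coefficients[2, 4]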
#95% confidence interval for the average y at each new x:
conf.int.mean <- predict(data.lm,
newdata = data.frame(x = new.x),
interval = "confidence",
level = .95)
ci.data <- as.data.frame(cbind(new.x, conf.int.mean))
(data.plot <- data.base.plot + # scatterplot saved above
geom_line(data = ci.data, mapping = aes(x = new.x, y = lwr),
color = "#d95f02", size = 1) +
geom_line(data = ci.data, mapping = aes(x = new.x, y = upr),
color = "#d95f02", size = 1) +
geom_smooth(method='lm', se=FALSE))
#95% prediction interval for a new y at each new x:
pred.int.mean <- predict(data.lm,
newdata = data.frame(x = new.x),
interval = "prediction",
level = .95)
pi.data <- as.data.frame(cbind(new.x, pred.int.mean))
data.plot +
geom_line(data = pi.data, mapping = aes(x = new.x, y = lwr), color = "#1b9e77", size = 1) +
geom_line(data = pi.data, mapping = aes(x = new.x, y = upr), color = "#1b9e77", size = 1)
# average values of y based on vector of x values
new.y.preds <- (data.lm$coefficients[1] +
data.lm$coefficients[2] * new.x)^(1/lambda) # undo transformation here
preds <- data.frame(new.x, new.y.preds)
# average values of y for 95% confidence interval, untransformed
conf.int.mean <- predict(data.lm,
newdata = data.frame(x = new.x),
interval = "confidence",
level = 0.95)^(1/lambda)
# new value of y for 95% prediction interval, untransformed
pred.int.mean <- predict(data.lm,
newdata = data.frame(x = new.x),
interval = "prediction",
level = 0.95)^(1/lambda)
ggplot() +
geom_point(data = data, mapping = aes(x = x, y = y)) +
geom_line(data = preds,
aes(x = new.x, y = new.y.preds),
size = 1.5, color ="blue") +
geom_line(mapping = aes(x = new.x, y = conf.int.mean[, 2]),
color = "#d95f02", size = 1.5) +
geom_line(mapping = aes(x = new.x, y = conf.int.mean[, 3]),
color = "#d95f02", size = 1.5) +
geom_line(mapping = aes(x = new.x, y = pred.int.mean[, 2]),
color = "#1b9e77", size = 1.5) +
geom_line(mapping = aes(x = new.x, y = pred.int.mean[, 3]),
color = "#1b9e77", size = 1.5) +
theme(aspect.ratio = 1)
#subsetting rows:
data[data$var == value,]
#conditionals include ==, <=, <, >=, >
#subsetting columns:
data[columns]
#getting names of data, summary stats, output:
names(data)
#outputting and saving simultaneously:
(variable <- value)
#printing:
print(paste("String: ", toString(value), sep = ''))
#looping in ggplot2:
for (name in variable_names) {
plot <- ggplot(data=data, mapping=aes(x=!!sym(name), y=residuals)) +
geom_point() + # for example
theme(aspect.ratio=1)
print(plot)
}
#Markdown: command + option + i
```{r}
```
# assignment: option + -
<-
#y limits:
y_interval <- ?
y_lower_limit <- floor(min(data$y)/10)*10
y_upper_limit <- ceiling(max(data$y)/10)*10
#x limits:
x_interval <- ?
x_lower_limit <- floor(min(data[predictor])/10)*10
x_upper_limit <- ceiling(max(data[predictor])/10)*10
ggplot() +
xlab("X Label") +
ylab("Y Label") +
scale_x_continuous(limits = c(x_lower_limit, x_upper_limit),
breaks = seq(x_lower_limit, x_upper_limit, by = x_interval),
minor_breaks = seq(x_lower_limit, x_upper_limit, by = x_interval)) +
scale_y_continuous(limits = c(y_lower_limit, y_upper_limit),
breaks = seq(y_lower_limit, y_upper_limit, by = y_interval),
minor_breaks = seq(y_lower_limit, y_upper_limit, by = y_interval)) +
theme(aspect.ratio = 1)
#Note: code is generally good for 0 < x_i < 100, but limit scaling depends on range of x.
#General/Theoretical Model:
$\text{y}_i = \beta_0 + \beta_1\times\text{x}_i + \epsilon_i \space\text{where} \space\epsilon_i \stackrel{iid}{\sim} \text{N}(0,\,\sigma^{2})$
#Fitted Model:
$\widehat{\text{y}}_i = \hat{\beta}_0 + \hat{\beta}_1\times\text{x}_i$
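#Fitted Model (multiple linear regression analog):
$\widehat{\text{y}}_i = \hat{\beta}_0 + \hat{\beta}_1\times\text{x}_{i1} + \dots + \hat{\beta}_p\times\text{x}_{ip}$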
#Transforming Data:
$\log{}$
$\sqrt{}$