Linear Regression Cheat Sheet

Template code and tricks for performing simple and multiple linear regression in R.


Basic:

Packages:

    
library(tidyverse)
library(ggfortify)  # plot lm objects using ggplot instead of base R
library(car)  # needed for added-variable plots and dfbetas and dffits
library(corrplot)
library(bestglm)  # for stepwise methods
library(glmnet)  # for ridge, lasso, and elastic net

#to install:
install.packages("package")
        

Reading in data:


#general:
data <- read.table('path.extension', sep = ' ', header = TRUE)

#csv specific:
data <- read.csv('path.extension')
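
#tidyverse alternative (readr loads with the tidyverse):
data <- read_csv('path.extension')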
        

Exploratory:


head(data, n=?)  # first n rows of data
summary(data) # mean, etc. of each field
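str(data)  # structure: column types and a preview of values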
#see also scatterplot
        

Correlation:


#Correlation:
round(cor(data), digits=?)  # numerical matrix of values

cor(data$y, data$x)  # numerical single value

corrplot(cor(data), type = "upper")  # color/shape-coded matrix
        

Plotting:

Scatterplot:


#Simple Linear Regression:
data.plot <- ggplot(data = data, mapping = aes(x = x, y = y)) +
    geom_point() +
    theme_bw() +
    theme(aspect.ratio = 1) +
    geom_smooth(method = "lm", se = FALSE) # add line-of-best-fit

#Multiple Linear Regression:
pairs(data, pch=19)

#or

plot(data, pch=19)

#pch=19 changes open circles to solid dots in the scatterplot matrix
        

Boxplot:


ggplot(data=data, mapping=aes(y=residuals)) +
    geom_boxplot() +
    theme(aspect.ratio=1)
        

Histogram:


ggplot(data = data, mapping = aes(x = residuals)) +
    geom_histogram(mapping = aes(y = ..density..), binwidth = ?) +
    stat_function(fun = dnorm, color = "red", size = 2,
                args = list(mean = mean(data$residuals), sd = sd(data$residuals)))
        

Sequence Plot:


ggplot(data) +
    geom_line(mapping = aes(x = 1:nrow(data), y = residuals)) +
    theme_bw() +
    xlab("Order in Data Set") +
    theme(aspect.ratio = 1)

#Note: data must be in some sequential order for this to be appropriate
        

Fitting:

Linear Model:


#Simple linear regression
data.lm <- lm(y ~ x, data=data)

#Multiple linear regression
data.lm <- lm(y ~ ., data=data)  # make sure the data frame contains only the response and predictors

#Saving residuals and fitted.values
data$residuals <- data.lm$residuals
data$fitted.values <- data.lm$fitted.values
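
#inspect the fitted model:
summary(data.lm)  # coefficient estimates, standard errors, R-squared, F-statistic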
        

Transforming:

Box-Cox Test:


bc <- boxCox(data.lm)
(Lambda <- bc$x[which.max(bc$y)])
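# lambda near 1 suggests no transformation is needed; lambda near 0 suggests a log transformation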
        

Transformed Linear Model:


lambda = 1  # lambda = 1 means that no transformation occurs; lambda = 0 is invalid as a power, so use log() instead

#transform data:
data$t_y <- data$y^lambda
data$t_x <- data$x^lambda
# also consider sqrt(), log(), etc.

data.t.lm <- lm(t_y ~ t_x, data)
data$t_residuals <- data.t.lm$residuals
data$t_fitted.values <- data.t.lm$fitted.values
summary(data.t.lm)
        

Graphical Diagnostics:

Residuals vs. Fitted Values Plot:


autoplot(data.lm, which = 1, ncol = 1, nrow = 1) + theme(aspect.ratio=1)
        

Residuals vs. Predictor Plot:


#Simple Linear Regression
ggplot(data=data, mapping=aes(x=predictor, y=residuals)) +
    geom_point() +
    theme(aspect.ratio=1)

#Multiple Linear Regression  # see also miscellaneous/looping in ggplot2
predictors <- names(data)[-1]  # make sure this only contains predictors
for (predictor in predictors) {
    plot <- ggplot(data=data, mapping=aes(x=!!sym(predictor), y=residuals)) +
        geom_point() +
        theme(aspect.ratio=1)
    
    print(plot)
}
        

Normal Probability Plot:


autoplot(data.lm, which = 2, ncol = 1, nrow = 1) + theme(aspect.ratio=1)
        

Added Variable/Partial Regression Plots:


avPlots(data.lm)
        

DFBETAS:


#Simple Linear Regression:
data.dfbetas <- as.data.frame(dfbetas(data.lm))
data.dfbetas$obs <- 1:nrow(data)

#plot:
ggplot(data = data.dfbetas) +
    geom_point(mapping = aes(x = obs, y = abs(x))) +
    ylab("Absolute Value of DFBETAS for X") +
    xlab("Observation Number") +
    # geom_hline(mapping = aes(yintercept = 1),
    #            color = "red", linetype = "dashed") +  # for n <= 30
    geom_hline(mapping = aes(yintercept = 2 / sqrt(length(obs))),
             color = "red", linetype = "dashed") +  # for n > 30
    theme_bw() +
    theme(aspect.ratio = 1, plot.title = element_blank())

#print:
#for n <= 30:
x.extreme.dfbetas <- data.dfbetas[abs(data.dfbetas$x) > 1,]
x.extreme.dfbetas[order(x.extreme.dfbetas$x),]

#for n > 30:
x.extreme.dfbetas <- data.dfbetas[abs(data.dfbetas$x) > 2 / sqrt(length(data.dfbetas$obs)),]
x.extreme.dfbetas[order(x.extreme.dfbetas$x),]


#Multiple Linear Regression:
predictors <- names(data)[-1]  # make sure this only contains predictors

for (predictor in predictors) {
    plot <- ggplot(data = data.dfbetas) +
        geom_point(mapping = aes(x = obs, y = abs(!!sym(predictor)))) +
        ylab(paste("Absolute Value of DFBETAS for ", predictor, sep = '')) +
        xlab("Observation Number") +
        # geom_hline(mapping = aes(yintercept = 1),
        #           color = "red", linetype = "dashed") +  # for n <= 30
        geom_hline(mapping = aes(yintercept = 2 / sqrt(length(obs))),
                    color = "red", linetype = "dashed") +  # for n > 30
        theme_bw() +
        theme(aspect.ratio = 1, plot.title = element_blank())
  
    print(plot)

    # for n <= 30:
    # print(data.dfbetas[abs(data.dfbetas[predictor]) > 1, c(predictor, 'obs')])

    # for n > 30:
    print(data.dfbetas[abs(data.dfbetas[predictor]) > 2 / sqrt(length(data.dfbetas$obs)), c(predictor, 'obs')])
}
        

DFFITS:


data.dffits <- data.frame("dffits" = dffits(data.lm))
data.dffits$obs <- 1:nrow(data)

#plot:
ggplot(data = data.dffits) +
    geom_point(mapping = aes(x = obs, y = abs(dffits))) +
    ylab("Absolute Value of DFFITS for Y") +
    xlab("Observation Number") +
    # geom_hline(mapping = aes(yintercept = 1),
    #            color = "red", linetype = "dashed") +  # for n <= 30
    geom_hline(mapping = aes(yintercept = 2 * sqrt(length(data.lm$coefficients) / length(obs))),
                color = "red", linetype = "dashed") +  # for n > 30
    theme_bw() +
    theme(aspect.ratio = 1, plot.title = element_blank())

#print:
# for n <= 30
# data.dffits[abs(data.dffits$dffits) > 1,]

# for n > 30
data.dffits[abs(data.dffits$dffits) > 2 * sqrt(length(data.lm$coefficients) / length(data.dffits$obs)),]
        

Numerical Diagnostics:

Shapiro-Wilk Test:


shapiro.test(data$residuals)
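# a small p-value is evidence against normally distributed residuals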
        

Brown-Forsythe or Levene Test:


grp <- as.factor(c(rep("lower", floor(dim(data)[1] / 2)),
                   rep("upper", ceiling(dim(data)[1] / 2))))
leveneTest(data[order(data$x), "residuals"] ~ grp, center = median)
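# a small p-value is evidence against the equal-variance assumption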
        

Cook’s Distance:


data$cooksd <- cooks.distance(data.lm)
data[data$cooksd >= 4 / length(data$cooksd), ]
        

Variance Inflation Factors:


(data.vif <- vif(data.lm))
max(data.vif)
mean(data.vif)
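# rule of thumb: a max VIF above 10 or a mean VIF well above 1 suggests multicollinearity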
        

Variable Selection Procedures:

Best Subsets (with BIC):
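
Note: bestglm() expects the response variable as the last column of the data frame. A minimal reordering sketch, assuming the response is named y:


data <- data[, c(setdiff(names(data), "y"), "y")]  # move the response to the last column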


best.subsets.bic <- bestglm(data,
    IC = "BIC",
    method = "exhaustive",
    TopModels = 10)
    
# create a data frame with the number of variables and the BIC
best.subsets.bic.df <- data.frame("num.vars" = 1:dim(data)[2],
                                  "BIC" = best.subsets.bic$Subsets$BIC)

# plot the BIC values against the number of variables
sz <- 12  # font size for axis titles and text
ggplot(data = best.subsets.bic.df, mapping = aes(x = num.vars, y = BIC)) +
  geom_point(size = 3) +
  geom_line() +
  geom_point(x = which.min(best.subsets.bic.df$BIC),
             y = min(best.subsets.bic.df$BIC),
             color = "red",
             size = 3) +
  theme_bw() +
  theme(axis.title.x = element_text(size = sz),
        axis.title.y = element_text(size = sz),
        axis.text.x = element_text(size = sz),
        axis.text.y = element_text(size = sz),
        aspect.ratio = 1)

# identify the best model according to the BIC
summary(best.subsets.bic$BestModel)
        

Forward Selection (with AIC):


forward.selection.aic <- bestglm(data,
            IC = "AIC",
            method = "forward",
            TopModels = 10)
            
# create a data frame with the number of variables and the AIC
forward.selection.aic.df <- data.frame("num.vars" = 1:dim(data)[2],
                                  "AIC" = forward.selection.aic$Subsets$AIC)

# plot the AIC values against the number of variables
sz <- 12  # font size for axis titles and text
ggplot(data = forward.selection.aic.df, mapping = aes(x = num.vars, y = AIC)) +
  geom_point(size = 3) +
  geom_line() +
  geom_point(x = which.min(forward.selection.aic.df$AIC),
             y = min(forward.selection.aic.df$AIC),
             color = "red",
             size = 3) +
  theme_bw() +
  theme(axis.title.x = element_text(size = sz),
        axis.title.y = element_text(size = sz),
        axis.text.x = element_text(size = sz),
        axis.text.y = element_text(size = sz),
        aspect.ratio = 1)

# identify the best model according to the AIC
summary(forward.selection.aic$BestModel)
        

Backward Selection (with PMSE):


backward.selection.pmse <- bestglm(data,
        IC = "CV",
        method = "backward",
        TopModels = 10)
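
Lasso and Ridge (with glmnet):

glmnet (loaded above for ridge, lasso, and elastic net) offers a shrinkage-based alternative to stepwise selection. A minimal lasso sketch, assuming the response column is named y:


X <- as.matrix(data[, setdiff(names(data), "y")])  # predictor matrix
y <- data$y
lasso.cv <- cv.glmnet(X, y, alpha = 1)  # alpha = 1: lasso; alpha = 0: ridge; in between: elastic net
coef(lasso.cv, s = "lambda.min")  # coefficients at the CV-chosen penalty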
        

Evaluation Metrics:


#to get anova output:
anova <- aov(data.lm)
        

Mean Square Error:


mse <- summary(anova)[[1]][2,2] / summary(anova)[[1]][2,1]
print(paste("Mean Square Error: ", toString(mse), split = ''))
        

Root Mean Square Error:


rmse <- sqrt(mse)
print(paste("Root Mean Square Error: ", toString(rmse), split = ''))
        

Mean Absolute Error:


mae <- sum(abs(data$y - data$fitted.values)) / (length(data$y) - 2)  # divides by the residual df (n - 2), not n
print(paste("Mean Absolute Error: ", toString(mae), sep = ''))
        

R-Squared:


r_squared <- summary(anova)[[1]][1, 2] / (summary(anova)[[1]][1, 2] + summary(anova)[[1]][2, 2])
print(paste("R-Squared: ", toString(r_squared), split = ''))
        

Adjusted R-Squared:


adj_r_squared <- summary(data.lm)$adj.r.squared
print(paste("Adjusted R-Squared: ", toString(adj_r_squared), split = ''))
    

F-Statistic:


f_statistic <- summary(data.lm)$fstatistic
print(paste("F-Statistic: ", toString(f_statistic), split = ''))
        

Confidence and Prediction Intervals:


#to get new x values:
new.x <- seq(min(data$x), max(data$x), by=0.05)
    

95% Confidence Interval for Slope:


confint(data.lm, parm = "x")
        

Hypothesis Test for Slope:


(t.stat <- (data.lm$coefficients[2] - 0) / summary(data.lm)$coefficients[2, 2])
2 * pt(abs(t.stat), df = nrow(data) - 2, lower.tail = FALSE)  # two-sided p-value
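#note: summary(data.lm)$coefficients[2, 3:4] reports the same t statistic and two-sided p-value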
    

Confidence Band for Average of Y:


conf.int.mean <- predict(data.lm,
                         newdata = data.frame(x = new.x),
                         interval = "confidence",
                         level = .95)
ci.data <- as.data.frame(cbind(new.x, conf.int.mean))


(data.plot <- data.plot +
    geom_line(data = ci.data, mapping = aes(x = new.x, y = lwr),
        color = "#d95f02", size = 1) +
    geom_line(data = ci.data, mapping = aes(x = new.x, y = upr),
        color = "#d95f02", size = 1) +
    geom_smooth(method='lm', se=FALSE))
        

Prediction Interval for Y:


pred.int.mean <- predict(data.lm,
                         newdata = data.frame(x = new.x),
                         interval = "prediction",
                         level = .95)
pi.data <- as.data.frame(cbind(new.x, pred.int.mean))

data.plot +
    geom_line(data = pi.data, mapping = aes(x = new.x, y = lwr), color = "#1b9e77", size = 1) +
    geom_line(data = pi.data, mapping = aes(x = new.x, y = upr), color = "#1b9e77", size = 1)
        

Untransforming Plots:


# average values of y based on a vector of x values
# (assumes data.lm was fit to a transformed response, e.g. y^lambda)
new.y.preds <- (data.lm$coefficients[1] +
  data.lm$coefficients[2] * new.x)^(1/lambda)  # undo transformation here
preds <- data.frame(new.x, new.y.preds)

# average values of y for 95% confidence interval, untransformed
conf.int.mean <- predict(data.lm,
                              newdata = data.frame(x = new.x),
                              interval = "confidence",
                              level = 0.95)^(1/lambda)

# new value of y for 95% prediction interval, untransformed
pred.int.mean <- predict(data.lm,
                              newdata = data.frame(x = new.x),
                              interval = "prediction",
                              level = 0.95)^(1/lambda)

ggplot() +
    geom_point(data = data, mapping = aes(x = x, y = y)) +
    geom_line(data = preds,
            aes(x = new.x, y = new.y.preds),
            size = 1.5, color ="blue") +
    geom_line(mapping = aes(x = new.x, y = conf.int.mean[, 2]),
            color = "#d95f02", size = 1.5) +
    geom_line(mapping = aes(x = new.x, y = conf.int.mean[, 3]),
            color = "#d95f02", size = 1.5) +
    geom_line(mapping = aes(x = new.x, y = pred.int.mean[, 2]),
            color = "#1b9e77", size = 1.5) +
    geom_line(mapping = aes(x = new.x, y = pred.int.mean[, 3]),
            color = "#1b9e77", size = 1.5) +
    theme(aspect.ratio = 1)
        

Miscellaneous:

Tips/Tricks:


#subsetting rows:
data[data$var == value,]
    #conditionals include ==, <=, <, >=, >

#subsetting columns:
data[columns]

#getting names of data, summary stats, output:
names(data)

#outputting and saving simultaneously:
(variable <- value)

#printing:
print(paste("String: ", toString(value), sep = ''))

#looping in ggplot2:
for (name in variable_names) {
    plot <- ggplot(data=data, mapping=aes(x=!!sym(name), y=residuals)) +
        geom_point() +  # for example
        theme(aspect.ratio=1)

    print(plot)
}
        

Keyboard Shortcuts for Mac:


# insert R Markdown code chunk: command + option + i

```{r}
```

# assignment: option + -

 <-
        

Scaling and Labelling in ggplot2:


#y limits:
y_interval <- ?
y_lower_limit <- floor(min(data$y)/10)*10
y_upper_limit <- ceiling(max(data$y)/10)*10

#x limits:
x_interval <- ?
x_lower_limit <- floor(min(data[predictor])/10)*10
x_upper_limit <- ceiling(max(data[predictor])/10)*10

ggplot() +
    xlab("X Label") +
    ylab("Y Label") +
    scale_x_continuous(limits = c(x_lower_limit, x_upper_limit),
                        breaks = seq(x_lower_limit, x_upper_limit, by = x_interval),
                        minor_breaks = seq(x_lower_limit, x_upper_limit, by = x_interval)) +
    scale_y_continuous(limits = c(y_lower_limit, y_upper_limit),
                        breaks = seq(y_lower_limit, y_upper_limit, by = y_interval),
                        minor_breaks = seq(y_lower_limit, y_upper_limit, by = y_interval)) +
    theme(aspect.ratio = 1)
    
#Note: this code is generally good for 0 < x_i < 100; the /10 and *10 scaling should be adjusted to the range of x.
        

LaTeX:


#General/Theoretical Model:
$\text{y}_i = \beta_0 + \beta_1\times\text{x}_i + \epsilon_i \text{ where } \epsilon_i \stackrel{iid}{\sim} \text{N}(0,\,\sigma^{2})$

#Fitted Model:
$\widehat{\text{y}}_i = \hat{\beta}_0 + \hat{\beta}_1\times\text{x}_i$

#Transforming Data:
$\log{}$
$\sqrt{}$