Today we are going to combine our skills with regression and programming. But first let's learn more about the R regreession object

rm(list=ls())
library(ggplot2)
library(dplyr)

#nice scatterplots
if(!require(GGally)){
  install.packages("GGally")
  library(GGally)
}

#nice LaTeX output and screen output
if(!require(texreg)){
  install.packages("texreg")
  library(texreg)
}

Tried and true -- looking out our data.

data(swiss)
summary(swiss)
##    Fertility      Agriculture     Examination      Education    
##  Min.   :35.00   Min.   : 1.20   Min.   : 3.00   Min.   : 1.00  
##  1st Qu.:64.70   1st Qu.:35.90   1st Qu.:12.00   1st Qu.: 6.00  
##  Median :70.40   Median :54.10   Median :16.00   Median : 8.00  
##  Mean   :70.14   Mean   :50.66   Mean   :16.49   Mean   :10.98  
##  3rd Qu.:78.45   3rd Qu.:67.65   3rd Qu.:22.00   3rd Qu.:12.00  
##  Max.   :92.50   Max.   :89.70   Max.   :37.00   Max.   :53.00  
##     Catholic       Infant.Mortality
##  Min.   :  2.150   Min.   :10.80   
##  1st Qu.:  5.195   1st Qu.:18.15   
##  Median : 15.140   Median :20.00   
##  Mean   : 41.144   Mean   :19.94   
##  3rd Qu.: 93.125   3rd Qu.:21.70   
##  Max.   :100.000   Max.   :26.60
str(swiss)
## 'data.frame':	47 obs. of  6 variables:
##  $ Fertility       : num  80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ...
##  $ Agriculture     : num  17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ...
##  $ Examination     : int  15 6 5 12 17 9 16 14 12 16 ...
##  $ Education       : int  12 9 5 7 15 7 7 8 7 13 ...
##  $ Catholic        : num  9.96 84.84 93.4 33.77 5.16 ...
##  $ Infant.Mortality: num  22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ...
?swiss
swiss$catholicDummy <- as.factor(swiss$Catholic > 50)
scatterplotSwiss <- ggpairs(data = swiss,
                            lower = list(continuous = "points"),
                            colour = "catholicDummy")

#scatterplotSwiss

See if you can modify the ggpairs code to do something different.

What is the class of base mod? This is a formula object. Lots more on formula

objects, but you will have to read up!

baseMod <- Fertility ~ Agriculture
class(baseMod)
## [1] "formula"
baseMod[[3]] #rhs
## Agriculture
baseMod[[2]] #tilde
## Fertility
baseMod[[1]] #lhs
## `~`

R regression models want a formula, usually as the first term.

baseOutLm <- lm(baseMod, data = swiss)
baseOutGlm <- glm(baseMod, family = gaussian, data = swiss)

class(baseOutLm)
## [1] "lm"
class(baseOutGlm)
## [1] "glm" "lm"
names(baseOutLm)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
names(baseOutGlm)
##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "call"              "formula"           "terms"            
## [25] "data"              "offset"            "control"          
## [28] "method"            "contrasts"         "xlevels"
summary(baseOutLm)
## 
## Call:
## lm(formula = baseMod, data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.5374  -7.8685  -0.6362   9.0464  24.4858 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 60.30438    4.25126  14.185   <2e-16 ***
## Agriculture  0.19420    0.07671   2.532   0.0149 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.82 on 45 degrees of freedom
## Multiple R-squared:  0.1247,	Adjusted R-squared:  0.1052 
## F-statistic: 6.409 on 1 and 45 DF,  p-value: 0.01492
summary(baseOutGlm)
## 
## Call:
## glm(formula = baseMod, family = gaussian, data = swiss)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -25.5374   -7.8685   -0.6362    9.0464   24.4858  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 60.30438    4.25126  14.185   <2e-16 ***
## Agriculture  0.19420    0.07671   2.532   0.0149 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 139.6248)
## 
##     Null deviance: 7178.0  on 46  degrees of freedom
## Residual deviance: 6283.1  on 45  degrees of freedom
## AIC: 369.47
## 
## Number of Fisher Scoring iterations: 2
#texreg(baseOutLm)
#htmlreg(baseOutLm)
screenreg(baseOutLm)
## 
## ======================
##              Model 1  
## ----------------------
## (Intercept)  60.30 ***
##              (4.25)   
## Agriculture   0.19 *  
##              (0.08)   
## ----------------------
## R^2           0.12    
## Adj. R^2      0.11    
## Num. obs.    47       
## ======================
## *** p < 0.001, ** p < 0.01, * p < 0.05

So what is the difference between a glm() object and and a lm() object?

Let's say we want to compare a bunch of models. It becomes much easier to store them in a list

allMods <- list(baseMod=baseOutLm)
#use dplyr way
allMods[["fullModel"]] <- Fertility ~ .

Lots of ways to add things ta list. Here is one way of naming the models

allMods[["edMod"]] <- Fertility ~ Agriculture + Education + Infant.Mortality
#includes both base terms
allMods[["edModInt"]] <- Fertility ~ Agriculture + Education * Infant.Mortality
allMods[["edModLog"]] <- Fertility ~ Agriculture + I(log(Education))
allMods[["noIntercept"]] <- -1 + Fertility ~ Agriculture + I(log(Education))
#why I()

Now we can use the apply commands we have already learned to run, store and output all of these models very quickly

allModsOut <- lapply(allMods, glm,
                     data = select(swiss, -catholicDummy), # the dplyr way
                     na.action = na.omit
                     )

screenreg(allModsOut)
## 
## ========================================================================================================
##                             baseMod      fullModel    edMod        edModInt     edModLog     noIntercept
## --------------------------------------------------------------------------------------------------------
## (Intercept)                   60.30 ***    66.92 ***    51.10 ***    62.43 ***    87.59 ***    86.59 ***
##                               (4.25)      (10.71)      (10.99)      (17.31)       (9.85)       (9.85)   
## Agriculture                    0.19 *      -0.17 *      -0.03        -0.03         0.00         0.00    
##                               (0.08)       (0.07)       (0.07)       (0.07)       (0.10)       (0.10)   
## Examination                                -0.26                                                        
##                                            (0.25)                                                       
## Education                                  -0.87 ***    -0.86 ***    -1.81                              
##                                            (0.18)       (0.17)       (1.14)                             
## Catholic                                    0.10 **                                                     
##                                            (0.04)                                                       
## Infant.Mortality                            1.08 **      1.49 **      0.93                              
##                                            (0.38)       (0.44)       (0.79)                             
## Education:Infant.Mortality                                            0.05                              
##                                                                      (0.06)                             
## I(log(Education))                                                                 -8.28 **     -8.28 ** 
##                                                                                   (2.74)       (2.74)   
## --------------------------------------------------------------------------------------------------------
## AIC                          369.47       326.07       340.49       341.68       362.62       362.62    
## BIC                          375.02       339.02       349.74       352.79       370.02       370.02    
## Log Likelihood              -181.73      -156.04      -165.24      -164.84      -177.31      -177.31    
## Deviance                    6283.12      2105.04      3114.63      3062.05      5204.79      5204.79    
## Num. obs.                     47           47           47           47           47           47       
## ========================================================================================================
## *** p < 0.001, ** p < 0.01, * p < 0.05

Add several models to you model list and re-run

Ok, we've decided that our best model (not in real life, but for this class) is the model that just uses all of the variables in the data set.

#plot(allModsOut[["fullModel"]])

What is predict going to do? Try to figure it out based on the class of alllModsOut

inSampleLm <- predict.lm(allModsOut[["fullModel"]], interval ="confidence")

inSampleGlm <- predict.glm(allModsOut[["fullModel"]], newdata = NULL, type ="response", se.fit = TRUE)

What do you notice about the difference bteween the two different versions of predict() Which will R run by default? Why do you think that is the case?

We learned a litte about ggplot2 graphics. Let's look a little at base graphics

plot(swiss$Education, inSampleLm[, "fit"],
     type ="n", #don't draw anything just open the plot window
     xlab = "Education",
     ylab = "Fertility",
     main = "Predicted and observed fertility rates for in sample provinces",
     ylim = c(min(inSampleLm) - 1, max(inSampleLm) - 1),
     xlim = c(min(swiss$Education) - 1, max(swiss$Education)  + 1)
     )

#now draw things #pch controls shape
points(swiss$Education, inSampleLm[, "fit"], col="red", pch = 23)
points(swiss$Education, swiss[, "Fertility"], col="blue", cex = .6)

segments(swiss$Education, inSampleLm[,"lwr"],
         swiss$Education, inSampleLm[, "upr"],
         col = "red",
         )

lines(lowess(inSampleLm[, "fit"] ~swiss$Education),  # Add lowess
      col="red",                                     # Color
      lwd= 2                                         # Thickness
      )

lines(lowess(with(swiss, Fertility ~ Education)),  # Add lowess
      col="blue",                                  # Color
      lwd= 2                                       # Thickness
      )

legTxt <- c("red = Predicted", "blue = Observed") # Text for legend

legend(list(x = 30, y = 80),           # location also see locator()
       legend = legTxt,                # text 
       col    = c("red", "blue"),      # colors
       lty    = c(1, 1),               # line type
       merge  = TRUE                   # Merge points and lines
       )
text(swiss$Education[swiss$Education > 20], inSampleLm[swiss$Education > 20, "fit"],
     row.names(swiss[swiss$Education > 20, ]),
     cex = 0.6,
     pos = 2,
     col = "black"
     )

Ok, we've looked at in-sample data. Now let's look at out of sample predictions. Really, we should simulate here, but I leave that to your statistics class

edVector <- seq(min(swiss$Education), max(swiss$Education))

hypotheticalData <- data.frame(Education = edVector,
                               Agriculture = mean(swiss$Agriculture),
                               Examination = mean(swiss$Examination),
                               Catholic = mean(swiss$Catholic),
                               Infant.Mortality = mean(swiss$Infant.Mortality)
                               )

hypotheticalData2 <- data.frame(Education = edVector,
                                Agriculture = max(swiss$Agriculture),
                                Examination = mean(swiss$Examination),
                                Catholic = mean(swiss$Catholic),
                                Infant.Mortality = mean(swiss$Infant.Mortality)
                                )

outSampleLm <- predict.lm(allModsOut[["fullModel"]],
                          newdata  = hypotheticalData,
                          interval = "confidence"
                          )

outSampleLm2 <- predict.lm(allModsOut[["fullModel"]],
                           newdata  = hypotheticalData2,
                           interval = "confidence"
                           )

plot(edVector, outSampleLm[, "fit"],
     type="n",
     xlab = "Education",
     ylab = "Fertility",
     main = "Predicted Fertility rates for different levels of education, 
     \nholding agriculture at different levels",
     ylim = c(min(outSampleLm) - 1, max(outSampleLm) - 1),
     xlim = c(min(swiss$Education) - 1, max(swiss$Education)  + 1)
     )

lines(edVector, outSampleLm[, "fit"], col = "red")
lines(edVector, outSampleLm[, "lwr"], lty = 3,  col = "red")
lines(edVector, outSampleLm[, "upr"], lty = 3, col = "red")

lines(edVector, outSampleLm2[, "fit"], col = "blue")
lines(edVector, outSampleLm2[, "lwr"], lty = 3,  col = "blue")
lines(edVector, outSampleLm2[, "upr"], lty = 3, col = "blue")

ggplot2 and base graphics operate quite differently. You cannnot store base graphics in an object. ggplot2 is set up specifically to do this.

wontWork <- plot(edVector, outSampleLm[, "fit"],
                 type="n",
                 xlab = "Education",
                 ylab = "Fertility",
                 main = "Predicted Fertility rates for different levels of education, 
                 \nholding agriculture at different levels",
                 ylim = c(min(outSampleLm) - 1, max(outSampleLm) - 1),
                 xlim = c(min(swiss$Education) - 1, max(swiss$Education)  + 1)
                 )
print(wontWork)
## NULL
class(wontWork)
## [1] "NULL"

If you want to write out a a pdf file from base graphics, you need to open a pdf driver Think of this as printing to a pdf rather than a window. Remember, to either set the working directory or include the entire file path

## pdf(file = "myNewFile.pdf",
#    width = 6, 
#    height = 9
#  )

  ##whatever call to base graphics here
  ##e.g. plot(x,  y)

# dev.off() #stop writing to that PDF!

Back to ggplot2

willWork <- qplot(edVector, outSampleLm[, "fit"])
print(willWork) #
head(willWork)
## $data
## data frame with 0 columns and 0 rows
## 
## $layers
## $layers[[1]]
## geom_point:  
## stat_identity:  
## position_identity: (width = NULL, height = NULL)
## 
## 
## $scales
## Reference class object of class "Scales"
## Field "scales":
## list()
## 
## $mapping
## List of 2
##  $ x: symbol edVector
##  $ y: language outSampleLm[, "fit"]
## 
## $theme
## list()
## 
## $coordinates
## $limits
## $limits$x
## NULL
## 
## $limits$y
## NULL
## 
## 
## attr(,"class")
## [1] "cartesian" "coord"
class(willWork)
## [1] "gg"     "ggplot"
# ggsave(filename = "myGGplot.pdf", 
#        plot = willWork, 
#        width = 12, 
#        height = 15, 
#        units = "cm"
#        )