Today we are going to combine our skills with regression and programming. But first let's learn more about the R regreession object
rm(list=ls()) library(ggplot2) library(dplyr) #nice scatterplots if(!require(GGally)){ install.packages("GGally") library(GGally) } #nice LaTeX output and screen output if(!require(texreg)){ install.packages("texreg") library(texreg) }
Tried and true -- looking out our data.
data(swiss) summary(swiss)
## Fertility Agriculture Examination Education ## Min. :35.00 Min. : 1.20 Min. : 3.00 Min. : 1.00 ## 1st Qu.:64.70 1st Qu.:35.90 1st Qu.:12.00 1st Qu.: 6.00 ## Median :70.40 Median :54.10 Median :16.00 Median : 8.00 ## Mean :70.14 Mean :50.66 Mean :16.49 Mean :10.98 ## 3rd Qu.:78.45 3rd Qu.:67.65 3rd Qu.:22.00 3rd Qu.:12.00 ## Max. :92.50 Max. :89.70 Max. :37.00 Max. :53.00 ## Catholic Infant.Mortality ## Min. : 2.150 Min. :10.80 ## 1st Qu.: 5.195 1st Qu.:18.15 ## Median : 15.140 Median :20.00 ## Mean : 41.144 Mean :19.94 ## 3rd Qu.: 93.125 3rd Qu.:21.70 ## Max. :100.000 Max. :26.60
str(swiss)
## 'data.frame': 47 obs. of 6 variables: ## $ Fertility : num 80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ... ## $ Agriculture : num 17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ... ## $ Examination : int 15 6 5 12 17 9 16 14 12 16 ... ## $ Education : int 12 9 5 7 15 7 7 8 7 13 ... ## $ Catholic : num 9.96 84.84 93.4 33.77 5.16 ... ## $ Infant.Mortality: num 22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ...
?swiss swiss$catholicDummy <- as.factor(swiss$Catholic > 50) scatterplotSwiss <- ggpairs(data = swiss, lower = list(continuous = "points"), colour = "catholicDummy") #scatterplotSwiss
See if you can modify the ggpairs code to do something different.
What is the class of base mod? This is a formula
object. Lots more on formula
baseMod <- Fertility ~ Agriculture class(baseMod)
## [1] "formula"
baseMod[[3]] #rhs
## Agriculture
baseMod[[2]] #tilde
## Fertility
baseMod[[1]] #lhs
## `~`
R regression models want a formula
, usually as the first term.
baseOutLm <- lm(baseMod, data = swiss) baseOutGlm <- glm(baseMod, family = gaussian, data = swiss) class(baseOutLm)
## [1] "lm"
class(baseOutGlm)
## [1] "glm" "lm"
names(baseOutLm)
## [1] "coefficients" "residuals" "effects" "rank" ## [5] "fitted.values" "assign" "qr" "df.residual" ## [9] "xlevels" "call" "terms" "model"
names(baseOutGlm)
## [1] "coefficients" "residuals" "fitted.values" ## [4] "effects" "R" "rank" ## [7] "qr" "family" "linear.predictors" ## [10] "deviance" "aic" "null.deviance" ## [13] "iter" "weights" "prior.weights" ## [16] "df.residual" "df.null" "y" ## [19] "converged" "boundary" "model" ## [22] "call" "formula" "terms" ## [25] "data" "offset" "control" ## [28] "method" "contrasts" "xlevels"
summary(baseOutLm)
## ## Call: ## lm(formula = baseMod, data = swiss) ## ## Residuals: ## Min 1Q Median 3Q Max ## -25.5374 -7.8685 -0.6362 9.0464 24.4858 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 60.30438 4.25126 14.185 <2e-16 *** ## Agriculture 0.19420 0.07671 2.532 0.0149 * ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 11.82 on 45 degrees of freedom ## Multiple R-squared: 0.1247, Adjusted R-squared: 0.1052 ## F-statistic: 6.409 on 1 and 45 DF, p-value: 0.01492
summary(baseOutGlm)
## ## Call: ## glm(formula = baseMod, family = gaussian, data = swiss) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -25.5374 -7.8685 -0.6362 9.0464 24.4858 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 60.30438 4.25126 14.185 <2e-16 *** ## Agriculture 0.19420 0.07671 2.532 0.0149 * ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for gaussian family taken to be 139.6248) ## ## Null deviance: 7178.0 on 46 degrees of freedom ## Residual deviance: 6283.1 on 45 degrees of freedom ## AIC: 369.47 ## ## Number of Fisher Scoring iterations: 2
#texreg(baseOutLm) #htmlreg(baseOutLm) screenreg(baseOutLm)
## ## ====================== ## Model 1 ## ---------------------- ## (Intercept) 60.30 *** ## (4.25) ## Agriculture 0.19 * ## (0.08) ## ---------------------- ## R^2 0.12 ## Adj. R^2 0.11 ## Num. obs. 47 ## ====================== ## *** p < 0.001, ** p < 0.01, * p < 0.05
So what is the difference between a glm()
object and and a lm()
object?
Let's say we want to compare a bunch of models. It becomes much easier to store them in a list
allMods <- list(baseMod=baseOutLm) #use dplyr way allMods[["fullModel"]] <- Fertility ~ .
Lots of ways to add things ta list. Here is one way of naming the models
allMods[["edMod"]] <- Fertility ~ Agriculture + Education + Infant.Mortality #includes both base terms allMods[["edModInt"]] <- Fertility ~ Agriculture + Education * Infant.Mortality allMods[["edModLog"]] <- Fertility ~ Agriculture + I(log(Education)) allMods[["noIntercept"]] <- -1 + Fertility ~ Agriculture + I(log(Education)) #why I()
Now we can use the apply commands we have already learned to run, store and output all of these models very quickly
allModsOut <- lapply(allMods, glm, data = select(swiss, -catholicDummy), # the dplyr way na.action = na.omit ) screenreg(allModsOut)
## ## ======================================================================================================== ## baseMod fullModel edMod edModInt edModLog noIntercept ## -------------------------------------------------------------------------------------------------------- ## (Intercept) 60.30 *** 66.92 *** 51.10 *** 62.43 *** 87.59 *** 86.59 *** ## (4.25) (10.71) (10.99) (17.31) (9.85) (9.85) ## Agriculture 0.19 * -0.17 * -0.03 -0.03 0.00 0.00 ## (0.08) (0.07) (0.07) (0.07) (0.10) (0.10) ## Examination -0.26 ## (0.25) ## Education -0.87 *** -0.86 *** -1.81 ## (0.18) (0.17) (1.14) ## Catholic 0.10 ** ## (0.04) ## Infant.Mortality 1.08 ** 1.49 ** 0.93 ## (0.38) (0.44) (0.79) ## Education:Infant.Mortality 0.05 ## (0.06) ## I(log(Education)) -8.28 ** -8.28 ** ## (2.74) (2.74) ## -------------------------------------------------------------------------------------------------------- ## AIC 369.47 326.07 340.49 341.68 362.62 362.62 ## BIC 375.02 339.02 349.74 352.79 370.02 370.02 ## Log Likelihood -181.73 -156.04 -165.24 -164.84 -177.31 -177.31 ## Deviance 6283.12 2105.04 3114.63 3062.05 5204.79 5204.79 ## Num. obs. 47 47 47 47 47 47 ## ======================================================================================================== ## *** p < 0.001, ** p < 0.01, * p < 0.05
Add several models to you model list and re-run
Ok, we've decided that our best model (not in real life, but for this class) is the model that just uses all of the variables in the data set.
#plot(allModsOut[["fullModel"]])
What is predict going to do? Try to figure it out based on the class of alllModsOut
inSampleLm <- predict.lm(allModsOut[["fullModel"]], interval ="confidence") inSampleGlm <- predict.glm(allModsOut[["fullModel"]], newdata = NULL, type ="response", se.fit = TRUE)
What do you notice about the difference bteween the two different versions of predict()
Which will R run by default? Why do you think that is the case?
We learned a litte about ggplot2
graphics. Let's look a little at base
graphics
plot(swiss$Education, inSampleLm[, "fit"], type ="n", #don't draw anything just open the plot window xlab = "Education", ylab = "Fertility", main = "Predicted and observed fertility rates for in sample provinces", ylim = c(min(inSampleLm) - 1, max(inSampleLm) - 1), xlim = c(min(swiss$Education) - 1, max(swiss$Education) + 1) ) #now draw things #pch controls shape points(swiss$Education, inSampleLm[, "fit"], col="red", pch = 23) points(swiss$Education, swiss[, "Fertility"], col="blue", cex = .6) segments(swiss$Education, inSampleLm[,"lwr"], swiss$Education, inSampleLm[, "upr"], col = "red", ) lines(lowess(inSampleLm[, "fit"] ~swiss$Education), # Add lowess col="red", # Color lwd= 2 # Thickness ) lines(lowess(with(swiss, Fertility ~ Education)), # Add lowess col="blue", # Color lwd= 2 # Thickness ) legTxt <- c("red = Predicted", "blue = Observed") # Text for legend legend(list(x = 30, y = 80), # location also see locator() legend = legTxt, # text col = c("red", "blue"), # colors lty = c(1, 1), # line type merge = TRUE # Merge points and lines ) text(swiss$Education[swiss$Education > 20], inSampleLm[swiss$Education > 20, "fit"], row.names(swiss[swiss$Education > 20, ]), cex = 0.6, pos = 2, col = "black" )
Ok, we've looked at in-sample data. Now let's look at out of sample predictions. Really, we should simulate here, but I leave that to your statistics class
edVector <- seq(min(swiss$Education), max(swiss$Education)) hypotheticalData <- data.frame(Education = edVector, Agriculture = mean(swiss$Agriculture), Examination = mean(swiss$Examination), Catholic = mean(swiss$Catholic), Infant.Mortality = mean(swiss$Infant.Mortality) ) hypotheticalData2 <- data.frame(Education = edVector, Agriculture = max(swiss$Agriculture), Examination = mean(swiss$Examination), Catholic = mean(swiss$Catholic), Infant.Mortality = mean(swiss$Infant.Mortality) ) outSampleLm <- predict.lm(allModsOut[["fullModel"]], newdata = hypotheticalData, interval = "confidence" ) outSampleLm2 <- predict.lm(allModsOut[["fullModel"]], newdata = hypotheticalData2, interval = "confidence" ) plot(edVector, outSampleLm[, "fit"], type="n", xlab = "Education", ylab = "Fertility", main = "Predicted Fertility rates for different levels of education, \nholding agriculture at different levels", ylim = c(min(outSampleLm) - 1, max(outSampleLm) - 1), xlim = c(min(swiss$Education) - 1, max(swiss$Education) + 1) ) lines(edVector, outSampleLm[, "fit"], col = "red") lines(edVector, outSampleLm[, "lwr"], lty = 3, col = "red") lines(edVector, outSampleLm[, "upr"], lty = 3, col = "red") lines(edVector, outSampleLm2[, "fit"], col = "blue") lines(edVector, outSampleLm2[, "lwr"], lty = 3, col = "blue") lines(edVector, outSampleLm2[, "upr"], lty = 3, col = "blue")
ggplot2
and base
graphics operate quite differently. You cannnot store base
graphics in an object. ggplot2
is set up specifically to do this.
wontWork <- plot(edVector, outSampleLm[, "fit"], type="n", xlab = "Education", ylab = "Fertility", main = "Predicted Fertility rates for different levels of education, \nholding agriculture at different levels", ylim = c(min(outSampleLm) - 1, max(outSampleLm) - 1), xlim = c(min(swiss$Education) - 1, max(swiss$Education) + 1) )
print(wontWork)
## NULL
class(wontWork)
## [1] "NULL"
If you want to write out a a pdf file from base
graphics, you need to open a pdf driver Think of this as printing to a pdf rather than a window. Remember, to either set the working directory or include the entire file path
## pdf(file = "myNewFile.pdf", # width = 6, # height = 9 # ) ##whatever call to base graphics here ##e.g. plot(x, y) # dev.off() #stop writing to that PDF!
Back to ggplot2
willWork <- qplot(edVector, outSampleLm[, "fit"]) print(willWork) #
head(willWork)
## $data ## data frame with 0 columns and 0 rows ## ## $layers ## $layers[[1]] ## geom_point: ## stat_identity: ## position_identity: (width = NULL, height = NULL) ## ## ## $scales ## Reference class object of class "Scales" ## Field "scales": ## list() ## ## $mapping ## List of 2 ## $ x: symbol edVector ## $ y: language outSampleLm[, "fit"] ## ## $theme ## list() ## ## $coordinates ## $limits ## $limits$x ## NULL ## ## $limits$y ## NULL ## ## ## attr(,"class") ## [1] "cartesian" "coord"
class(willWork)
## [1] "gg" "ggplot"
# ggsave(filename = "myGGplot.pdf", # plot = willWork, # width = 12, # height = 15, # units = "cm" # )