Appendix: Hidden R Code

The code chunks copied below can be used to replicate the figures from Introduction to Political and Social Data Analysis (using R). In many cases, you need to load one of the data sets used in the textbook, so make sure you have copied them all to your working directory.

# Load every textbook data set (.rda files) from the working directory.
for (data_file in c("anes20.rda", "cces20.rda", "countries2.rda",
                    "county20.rda", "county20large.rda",
                    "gdpvotes.rda", "votes1620.rda")) {
  load(data_file)
}

You should also make sure you have installed all of the packages used in the textbook, plus the MASS and plotrix packages.

Figure 1.2 Simulated examples of Strong and Weak Relationships

# Figure 1.2: two simulated scatterplots side by side — a weak (r = .35)
# and a strong (r = .85) relationship, each with its fitted line.
par(mfrow = c(1, 2))
set.seed(135)
samples <- 25
r <- .85

library(MASS)
# empirical = TRUE forces the sample correlation to equal r exactly
data <- mvrnorm(n = samples, mu = c(0, 0),
                Sigma = matrix(c(1, r, r, 1), nrow = 2),
                empirical = TRUE)
X <- data[, 1]  # standard normal (mu=0, sd=1)
Y <- data[, 2]  # standard normal (mu=0, sd=1)

samples <- 25
r <- .35

data2 <- mvrnorm(n = samples, mu = c(0, 0),
                 Sigma = matrix(c(1, r, r, 1), nrow = 2),
                 empirical = TRUE)
X1 <- data2[, 1]  # standard normal (mu=0, sd=1)
Y1 <- data2[, 2]  # standard normal (mu=0, sd=1)

plot(X1, Y1, xlab = "Independent Variable A",
     ylab = "Dependent Variable", main = "Weak Relationship",
     sub = "(Correlation=.35)")
abline(lm(Y1 ~ X1))

# Bug fix: the subtitle was missing its closing parenthesis
plot(X, Y, xlab = "Independent Variable B",
     ylab = "Dependent Variable", main = "Strong Relationship",
     sub = "(Correlation=.85)")
abline(lm(Y ~ X))

par(mfrow = c(1, 1))

Figure 1.3 A Simple Test of the Retrospective Voting Hypothesis

# Figure 1.3: incumbent vote share against GDP growth, with the fitted
# regression line and each point labeled by election year.
with(gdpvotes, {
  plot(gdpchng, vote, cex = .5,
       xlab = "Percentage Change in Real GDP Per Capita, Q1 to Q3",
       ylab = "Incumbent Percent of Two party Vote",
       ylim = c(45, 65))
  abline(lm(vote ~ gdpchng))
  text(gdpchng, vote, year, pos = 3, cex = .5)
})

Figure 1.4 Testing the Retrospective Voting Hypothesis in Two Different Contexts

# Figure 1.4: the retrospective-voting scatterplot, drawn separately for
# elections with an incumbent running (openseat == 0) and open seats (== 1).
par(mfrow = c(1, 2))

incumbent <- gdpvotes$openseat == 0
plot(gdpvotes$gdpchng[incumbent],
     gdpvotes$vote[incumbent], cex = .5,
     ylim = c(40, 65), main = "Incumbent Running",
     xlab = "Change in GDP Per Capita, Q1 to Q3",
     ylab = "Incumbent Percent of Two party Vote",
     sub = "(Correlation=.68)")
abline(lm(gdpvotes$vote[incumbent] ~ gdpvotes$gdpchng[incumbent]))
text(gdpvotes$gdpchng[incumbent],
     gdpvotes$vote[incumbent],
     gdpvotes$year[incumbent], pos = 3, cex = .5)

open_seat <- gdpvotes$openseat == 1
plot(gdpvotes$gdpchng[open_seat],
     gdpvotes$vote[open_seat], cex = .5,
     ylim = c(40, 65), main = "Open Seat",
     xlab = "Change in GDP Per Capita, Q1 to Q3",
     ylab = "Incumbent Percent of Two party Vote",
     sub = "(Correlation=.16)")
abline(lm(gdpvotes$vote[open_seat] ~ gdpvotes$gdpchng[open_seat]))
text(gdpvotes$gdpchng[open_seat],
     gdpvotes$vote[open_seat],
     gdpvotes$year[open_seat],
     pos = 3, cex = .5)
par(mfrow = c(1, 1))

Figure 5.3 Illustrations of Different Levels of Skewness

# Figure 5.3: density plots illustrating positive skew, negative skew, and
# a symmetric (no-skew) distribution, with mean/median reference lines.
par(mfrow = c(2, 2))
# Skewed distributions: squared normals give a chi-square(1) right tail
set.seed(5)
y <- rnorm(10000)^2
right_dens <- density(y)
plot(right_dens, xlim = c(-.5, 6), cex.main = .8,
     main = "Positive (Right) Skew: Mean>Median",
     lwd = 2,
     xlab = "X")
abline(v = mean(y))
abline(v = median(y), lty = 2)
legend("topright", legend = c("Mean", "Median"),
       lty = 1:2, cex = .7)

# Mirror image of the same draws: a long left tail anchored near 6
set.seed(5)
y <- 6 - rnorm(10000)^2
left_dens <- density(y)
plot(left_dens, cex.main = .8,
     main = "Negative (Left) Skew: Mean<Median",
     xlab = "X", xlim = c(0, 6.5), lwd = 2)
abline(v = mean(y))
abline(v = median(y), lty = 2)
legend("topleft", legend = c("Mean", "Median"),
       lty = 1:2, cex = .7)

# No skew: a symmetric normal curve, mean = median = 3
set.seed(12)
x <- seq(0, 6, length = 1000)
y <- dnorm(x, mean = 3, sd = .75)
plot(x, y, type = "l", lwd = 2,
     main = "No Skew (Mean=Median)",
     cex.main = .8, xlab = "X",
     ylab = "Density")
abline(v = 3)
legend("topright", legend = c("Mean, Median"),
       lty = c(1), cex = .7)
par(mfrow = c(1, 1))

Figure 6.1 Distributions with Identical Central Tendencies but Different Levels of Dispersion

# Figure 6.1: three normal curves with the same center (10) but increasing
# spread (sd = .5, 1.1, 2.5), drawn on a common x-range.
par(mfrow = c(2, 2))
x <- seq(4, 16, length = 500)
for (spread in c(.5, 1.1, 2.5)) {
  y <- dnorm(x, mean = 10, sd = spread)
  plot(x, y, type = "l", ylab = "Density",
       lwd = 1, main = "Mean=10, Median=10",
       cex.axis = .7, cex.main = 1.3)
}

par(mfrow = c(1, 1))

Figure 6.4 Comparing the Empirical Histogram for Age with the Normal Curve

# Figure 6.4: density-scaled histogram of respondent age with the matching
# normal curve overlaid and a dashed reference line at age 66.
# First, create cces20$age, using cces20$birthyr
cces20$age <- 2020 - cces20$birthyr
hist(cces20$age, xlab = "Age of respondent",
     main = "",
     prob = TRUE,
     xlim = c(0, 100), ylim = c(0, .03))
abline(v = 66, lty = 2, lwd = 2)
# Overlay a normal curve with the sample's mean and sd
age_mean <- mean(cces20$age, na.rm = TRUE)
age_sd <- sd(cces20$age, na.rm = TRUE)
curve(dnorm(x, mean = age_mean, sd = age_sd),
      lwd = 2, add = TRUE)
legend("topright", legend = c("Age=66", "Normal Curve"),
       lty = c(2, 1), lwd = c(2, 2), cex = .9)

Figure 7.2 Simulated Results from Large and Small Coin Toss Samples

# Figure 7.2: percent heads in simulated coin-toss samples of increasing
# size, converging on 50 as the number of tosses grows.
# Set seed to ensure identical results
set.seed(2020)
# Create a new object, "coin", with two outcomes, Heads and Tails
coin <- c("Heads", "Tails")
# Take a random sample of outcomes from ten tosses
# Bug fix: spell out `replace = TRUE` instead of relying on partial
# argument matching (`rep=T`)
coin10 <- sample(coin, 10, replace = TRUE)
# Show the breakdown of coin tosses
tosses <- c(10, 20, 60, 100, 150, 300, 500, 1000,
            2000, 400, 750, 1500, 1250, 1750)
Heads <- c(30, 33.3, 40, 58, 50, 51, 52, 50.9, 50.5,
           50, 52, 49.7, 48.2, 52.6)
coin_tosses <- data.frame(tosses, Heads)

plot(coin_tosses$tosses, coin_tosses$Heads,
     xlab = "Number of Coin Tosses",
     ylab = "Percent Heads", ylim = c(25, 65))
abline(h = 50)

Figure 7.3 Simulated Results from Large and Small Samples of Die Rolls

# Figure 7.3: outcome proportions from 100 and 2000 simulated die rolls,
# compared against the uniform expectation of 1/6 (~.167).
set.seed(2020)
# Create a new object, "die", outcomes 1,2,3,4,5,6
die <- c(1, 2, 3, 4, 5, 6)
die100 <- sample(die, 100, replace = TRUE)
die2000 <- sample(die, 2000, replace = TRUE)
par(mfrow = c(1, 2))
# Bug fix: corrected axis-label typos ("Propotion" -> "Proportion",
# "Ouctome" -> "Outcome")
barplot(prop.table(table(die100)),
        main = "100 Rolls", ylab = "Proportion",
        xlab = "Outcome",
        ylim = c(0, .25))
abline(h = .167, lwd = 2)
barplot(prop.table(table(die2000)),
        main = "2000 Rolls", ylab = "Proportion",
        xlab = "Outcome",
        ylim = c(0, .25))
abline(h = .167, lwd = 2)

par(mfrow = c(1, 1))

Figure 8.1 A Theoretical Sampling Distribution, mu=34.04, Sample Size=300

#normal dist
x <- seq(30, 38, by = .025)
# mean as 34.04 and standard deviation (sd/sqrt(300)).
y <- dnorm(x, mean = 34.04, 
           sd = sd(county20$d2pty20)/sqrt(300))

plot(x,y, type="l", ylab="Density", 
     lwd=2, xlab="Distribution of Sample Means")
axis(1, at=c(30:38))
abline(v=34.04)
legend("topright", 
       legend=c("Mu=34.04"),lty=1, cex=.8)

Figure 8.2 Normal Distribution and Simulated Sampling Distribution, 50 Samples of 50 Counties

# Figure 8.2: empirical sampling distribution from 50 samples of 50
# counties, with the matching normal curve overlaid.
set.seed(251)
# Draw fifty 50-county samples and record each sample's mean.
# (replicate() makes the same sequence of sample() calls as the original
# for-loop, so the RNG stream and results are identical.)
sample_means50 <- replicate(
  50, mean(sample(county20$d2pty20, 50), na.rm = TRUE)
)

## Create Plots
d50 <- density(sample_means50)
plot(d50, ylim = c(0, .19), xlab = "Sample Means",
     lwd = 2, xlim = c(26, 42), xaxt = "n", main = "")
curve(dnorm(x, mean = mean(sample_means50),
            sd = sd(sample_means50)),
      lwd = 2, lty = 2, add = TRUE)
axis(1, at = 26:42)
legend("topright", legend = c("Sampling Dist", "Normal Dist"),
       lty = 1:2, cex = .8)

Figure 8.3 Normal Distribution and Sampling Distribution, 500 Samples of 50 Counties

# Figure 8.3: empirical sampling distribution from 500 samples of 50
# counties, with the matching normal curve overlaid.
# Gather sample means from 500 samples of 50 counties
set.seed(251)
# Bug fix: preallocate all 500 slots (was rep(NA, 50), which silently
# grew the vector on every iteration past 50)
sample_means500 <- rep(NA, 500)
for (i in seq_len(500)) {
  samp <- sample(county20$d2pty20, 50)
  sample_means500[i] <- mean(samp, na.rm = TRUE)
}


d500 <- density(sample_means500)
plot(d500, xlab = "Sample Means", lwd = 2,
     main = "", ylim = c(0, .18),
     xlim = c(26, 42), xaxt = "n")
axis(1, at = 26:42)
curve(dnorm(x, mean = mean(sample_means500),
            sd = sd(sample_means500)),
      lwd = 2, lty = 2, add = TRUE)
legend("topright", legend = c("Sampling Dist", "Normal Dist"),
       lty = 1:2, cex = .8)

Figure 8.4 Fifty 68 Percent Confidence Intervals, n=100, mu=34.04

# Figure 8.4: fifty 68% confidence intervals (mean +/- one standard error)
# from samples of n = 100 counties, with intervals that miss mu marked.
# Generate CI data for fifty samples
samp_mean <- rep(NA, 50)
samp_sd <- rep(NA, 50)
n <- 100

# obtain a sample of size n = 100 from the population
set.seed(250)
for (i in seq_len(50)) {
  samp <- sample(county20$d2pty20, n)
  samp_mean[i] <- mean(samp) # save sample mean of each sample
  samp_sd[i] <- sd(samp) # save sample sd of each sample
}

# +/- one standard error gives a 68 percent confidence interval
lower_vector <- samp_mean - samp_sd / sqrt(n)
upper_vector <- samp_mean + samp_sd / sqrt(n)
y <- samp_mean

cidata <- data.frame(y, lower_vector, upper_vector)

# Generate plots
library(plotrix)
plotCI(1:50, y, ui = upper_vector, li = lower_vector,
       ylab = "Sample Estimates", xlab = "Sample")

# Identify "misses": intervals that do not cover mu = 34.04.
# (One vectorized points() call replaces twelve copy-pasted ones.)
misses <- c(2, 4, 6, 11, 14, 17, 20, 21, 36, 39, 43, 48)
points(x = misses, y = rep(34.04, length(misses)), pch = 16)
abline(h = 34.04)

Figure 8.5 Width of a 95 Percent Confidence Interval at Different Sample Sizes (Std Dev.=15.89)

# Figure 8.5: width of a 95% confidence interval (sd = 15.89) as the
# sample size grows from 100 to 3000.
obs <- seq(from = 100, to = 3000, by = 100)
width <- 2 * (1.96 * (15.89 / sqrt(obs)))
plot(obs, width,
     ylab = "Width of Confidence Interval",
     xlab = "Sample Size", type = "l")

Figure 8.6 Normal and Sampling Distribution (Proportion), 500 Samples of 50 Counties

# Figure 8.6: sampling distribution of a proportion (share of sampled
# counties won by Democrats) from 500 samples of 50 counties.
# Create samples
set.seed(251)
demwin <- as.numeric(county20$d2pty20 > 50)
# Bug fix: preallocate all 500 slots (was rep(NA, 50), which silently
# grew the vector on every iteration past 50)
sample_prop500 <- rep(NA, 500)
# Run through the data 500 times, 50-county samples of 'demwin'
# each time.  Store the mean of each sample in 'sample_prop500'
for (i in seq_len(500)) {
  samp <- sample(demwin, 50)
  sample_prop500[i] <- mean(samp, na.rm = TRUE)
}

# Plot data
dwin500 <- density(sample_prop500)
plot(dwin500, lwd = 2, main = "",
     xlab = "Sample Proportions")
curve(dnorm(x, mean = mean(sample_prop500),
            sd = sd(sample_prop500)), lwd = 2, lty = 2, add = TRUE)
legend("topright",
       legend = c("Sampling Dist", "Normal Dist"),
       lty = 1:2, cex = .8)

Figure 9.2 An Illustration of Key Concepts in Hypothesis Testing

# Figure 9.2: sampling distribution under the null (mu = 59.2, SE = 1.538)
# with reference lines for mu, the critical value, and the sample mean.
x <- seq(53, 65, length = 1000)
y <- dnorm(x, mean = 59.2, sd = 1.538)
plot(x, y, type = "l", lwd = 1, main = "",
     xlab = "Average Annual Sick Leave Hours",
     ylab = "Density", cex = .8, yaxs = "i")

segments(x0 = 59.2, y0 = -.03, x1 = 59.2, y1 = .2594)         # mu
segments(x0 = 56.19, y0 = -.07, x1 = 56.19, y1 = .05,         # critical value
         lty = 2, lwd = 3)
segments(x0 = 54.8, y0 = -.05, x1 = 54.8, y1 = .02, lwd = 3)  # sample mean
legend("topright",
       legend = c("Mu", "C.V.", "Sample Mean"),
       lty = c(1, 2, 1), lwd = c(1, 2, 3), cex = .8)

Figure 9.3 Critical Values for One and Two-tailed Tests

# Figure 9.3: standard normal curve with the one-tailed (z = -1.65) and
# two-tailed (z = +/-1.96) critical values marked.
x <- seq(-3.5, 3.5, length = 1000)
y <- dnorm(x, mean = 0, sd = 1)
plot(x, y, type = "l", lwd = 1, main = "",
     xlab = "Critical Values for p=.05, (one-tailed and two-tailed)",
     ylab = "Density", yaxs = "i")
legend("topright",
       legend = c("One-tailed, z= -1.65", "Two-tailed, z=+/-1.96"),
       lty = 1:2, cex = 0.8)

# Solid line for the one-tailed cutoff; one vectorized call draws
# both dashed two-tailed cutoffs
segments(x0 = -1.65, y0 = -.02, x1 = -1.65, y1 = .098)
segments(x0 = c(-1.96, 1.96), y0 = -.02,
         x1 = c(-1.96, 1.96), y1 = .06, lty = 2)

Figure 9.4 Comparison of Normal and t-Distributions

# Figure 9.4: the t-distribution (df = 3, dashed) against the standard
# normal (solid).
x <- seq(-5, 5, by = .05)
yz <- dnorm(x)
y3 <- dt(x, df = 3)
plot(x, y3, type = "l", lty = 2, lwd = 2, ylim = c(0, .40),
     ylab = "Density",
     xlab = "t(z)-score")
lines(x, yz, lty = 1, lwd = 2)
# Bug fix: the legend's line types were swapped — the t-distribution is
# drawn with lty = 2 and the normal with lty = 1, so the legend must
# list lty = c(2, 1) to match the curves.
legend("topright",
       legend = c("t-distribution", "Normal Distribution"),
       lty = c(2, 1), lwd = c(2, 2), cex = .75)

Figure 9.5 Degrees of Freedom and Resemblance of t-distribution to the Normal Distribution

# Figure 9.5: t-distributions with df = 3, 5, 7 against the standard
# normal — higher df curves hug the normal more closely.
x <- seq(-3, 3, by = .05)
yz <- dnorm(x)
y3 <- dt(x, df = 3)
y5 <- dt(x, df = 5)
y7 <- dt(x, df = 7)
plot(x, y7, type = "l", lty = 2, ylim = c(0, .40),
     ylab = "Density", xlab = "t-score")
# Overlay the remaining curves, each with its own line type
extra_curves <- list(list(dens = y5, lty = 3),
                     list(dens = y3, lty = 4),
                     list(dens = yz, lty = 1))
for (spec in extra_curves) {
  lines(x, spec$dens, lty = spec$lty)
}
legend("topright",
       legend = c("Normal", "df=7", "df=5", "df=3"),
       lty = c(1, 2, 3, 4), cex = .8)

Figure 11.1 F-Distribution and Critical Value for dfw=179, dfb=3, p=.05

# Figure 11.1: F distribution for dfb = 3, dfw = 179, marking the p = .05
# critical value of 2.66.
df1 <- 3
df2 <- 179
curve(df(x, df1 = df1, df2 = df2),
      from = 0, to = 7, xlab = "F-Ratio",
      ylab = "Density", main = "",
      ylim = c(0, .75), cex.main = .8, yaxs = "i")
# Vertical marker at the critical value, trimmed to the curve height
segments(x0 = 2.66, y0 = 0, x1 = 2.66, y1 = .067, lwd = 2)

legend("topright", legend = "C.V. = 2.66", lwd = 2, cex = .9)

Figure 12.1 Example of a Mosaic Plot

# Figure 12.1: mosaic plot of religious importance by region of residence,
# built from a crosstab of two anes20 variables.
library(descr)
# Create religious importance variable
anes20$relig_imp <- anes20$V201433
# Recode Religious Importance to three categories
levels(anes20$relig_imp) <- c("High", "High", "Moderate", "Low", "Low")
# Change order to Low-Moderate-High
anes20$relig_imp <- factor(anes20$relig_imp,
                           levels = c("Low", "Moderate", "High"),
                           ordered = TRUE)

# Use "plot=T" to get mosaic plot, add graphing details
tab_plots <- crosstab(anes20$relig_imp, anes20$V203003, prop.c = TRUE,
                      plot = TRUE,
                      chisq = TRUE,
                      xlab = "Region of Residence",
                      ylab = "Religious Importance",
                      main = "Mosaic Plot: Religious Importance by region")

Figure 12.2 Chi-Square Distribution, df=6

# Figure 12.2: chi-square distribution with 6 degrees of freedom, marking
# the p = .05 critical value of 12.592.
curve(dchisq(x, df = 6), from = 0,
      to = 25, xlab = "Chi-square",
      ylab = "Density", yaxs = "i",
      xaxs = "i", cex.main = .8)
# Vertical marker at the critical value, trimmed to the curve height
segments(x0 = 12.592, y0 = 0, x1 = 12.592, y1 = .017, lwd = 2)
legend("topright", legend = "C.V. = 12.592",
       lty = 1, lwd = 2, cex = .8)

Figure 14.1 Generic Positive and Negative Correlations and Scatterplot Patterns

# Figure 14.1: six simulated scatterplots illustrating negative and
# positive correlations of varying strength (|r| = .25, .50, .75).
library(MASS)
set.seed(35)
par(mfrow = c(2, 3))

# Draw `samples` pairs from a bivariate standard normal with correlation r.
# empirical = TRUE forces the sample correlation to equal r exactly.
# (Replaces six copy-pasted mvrnorm blocks.)
sim_corr_pair <- function(r, samples = 200) {
  sim <- mvrnorm(n = samples, mu = c(0, 0),
                 Sigma = matrix(c(1, r, r, 1), nrow = 2),
                 empirical = TRUE)
  list(X = sim[, 1], Y = sim[, 2])  # both columns standard normal
}

# Generate in the same order as the original code so the RNG stream
# (and therefore every panel) is unchanged.
neg.5  <- sim_corr_pair(-.5)
neg.25 <- sim_corr_pair(-.25)
neg.75 <- sim_corr_pair(-.75)
pos.5  <- sim_corr_pair(.5)
pos.25 <- sim_corr_pair(.25)
pos.75 <- sim_corr_pair(.75)

# One scatterplot panel with shared styling
corr_panel <- function(pair, title) {
  plot(pair$X, pair$Y, main = title,
       xlab = "X", ylab = "Y", cex = .8, cex.main = 1.3, pch = 20)
}

# Top row: negative correlations, weakest to strongest
corr_panel(neg.25, "r = -.25")
corr_panel(neg.5, "r = -.50")
corr_panel(neg.75, "r = -.75")

# Bottom row: positive correlations, weakest to strongest
corr_panel(pos.25, "r = .25")
corr_panel(pos.5, "r = .50")
corr_panel(pos.75, "r = .75")

par(mfrow = c(1, 1))

Figure 15.1 The Regression Line Superimposed on the Scatterplot

# Figure 15.1: 2020 vs 2016 Democratic two-party vote share with the
# fitted regression line; arrows trace the prediction for x = 39.3.
with(votes1620, {
  plot(vote16, vote20,
       xlab = "Dem two-party % 2016",
       ylab = "Dem two-party % 2020", ylim = c(35, 60))
  abline(lm(vote20 ~ vote16))
})
arrows(x0 = 39.3, x1 = 39.3,
       y0 = 34.1, y1 = 39.5, lwd = 1.5)   # up from the x-axis to the line
arrows(x0 = 39.3, x1 = 34.7,
       y0 = 39.5, y1 = 39.5, lwd = 1.5)   # across to the y-axis
legend(38, 50,
       legend = "y=-4.289 + 1.115x", bty = "n", cex = .8)

Figure 15.2 Prediction Errors (Residuals) in the Regression Model

# Figure 15.2: the regression scatterplot with vertical segments showing
# each highlighted point's residual (fitted value to observed value).
plot(votes1620$vote16, votes1620$vote20,
     xlab = "Dem two-party % 2016",
     ylab = "Dem two-party % 2020", ylim = c(35, 60))
abline(lm(votes1620$vote20 ~ votes1620$vote16))
# One vectorized segments() call replaces seven copy-pasted ones:
# x = 2016 vote share, fit = value on the regression line, obs = 2020 vote
res_x <- c(35.6, 49.4, 51.5, 50.2, 58.3, 39.3, 49.6)
res_fit <- c(35.43, 50.81, 53.15, 51.70, 60.74, 39.55, 51.03)
res_obs <- c(37.1, 48.3, 54.7, 53.8, 60.6, 37.6, 50.3)
segments(x0 = res_x, y0 = res_fit, x1 = res_x, y1 = res_obs)

Figure 15.3 Prediction Error without a Regression Model

# Figure 15.3: the same scatterplot without a regression model — prediction
# errors are measured from the mean of the dependent variable (48.914).
plot(votes1620$vote16, votes1620$vote20,
     xlab = "Dem two-party % 2016",
     ylab = "Dem two-party % 2020", ylim = c(35, 60))
abline(h = mean(votes1620$vote20))
# One vectorized segments() call replaces seven copy-pasted ones:
# each segment runs from the mean (48.914) to the observed 2020 vote
err_x <- c(35.6, 49.4, 51.5, 50.2, 58.3, 39.3, 49.6)
err_obs <- c(37.1, 48.3, 54.7, 53.8, 60.6, 37.6, 50.3)
segments(x0 = err_x, y0 = rep(48.914, length(err_x)),
         x1 = err_x, y1 = err_obs)
legend(41, 52, legend = "Mean=48.914", bty = "n", cex = .85)

Figure 17.1 Alternative Models for the Relationship between Doctors per 10k and Life Expectancy

#Life expectancy by "docs10k"
fit_raw<-lm(countries2$lifexp~countries2$docs10k,
            na.action=na.exclude)
#Life expectancy by "log10(docs10k)"
fit_log<-lm(countries2$lifexp~log10(countries2$docs10k),
            na.action=na.exclude)
countries2$yhat<-predict(fit_raw)
countries2$yhatlog<-predict(fit_log)

plot(countries2$docs10k,countries2$lifexp, 
     xlab="Doctors per 10k Population",
     ylab="Life Expectancy")
curve(63.4 + 9.5824*log10(x), .01, 85, 
      lwd=2,add=T)
abline(fit_raw, lty=2, lwd=2)
legend("right", 
       legend=c("y= a + b*x", "y= a + b*log10(x)"), 
       lty=c(2,1), lwd=c(2,2), cex=.8)

Figure 17.2 A Segmented View of a Curvilinear Relationship

# Figure 17.2: split countries at the docs10k median (14.781) and fit a
# separate regression of life expectancy on doctors per 10k in each half.
lowdoc <- subset(countries2, docs10k <= 14.781)
hidoc <- subset(countries2, docs10k > 14.781)
par(mfrow = c(1, 2))
with(lowdoc, {
  plot(docs10k, lifexp,
       ylim = c(50, 85),
       xlab = "Doctors per 10k",
       ylab = "Life Expectancy", sub = " (r=.67)",
       main = "LT Median Docs/10k",
       cex.main = .8)
  abline(lm(lifexp ~ docs10k))
})
with(hidoc, {
  plot(docs10k, lifexp,
       ylim = c(50, 85), xlab = "Doctors per 10k",
       ylab = "Life Expectancy", sub = "(r=.21)",
       main = "GT Median Docs/10k",
       cex.main = .8)
  abline(lm(lifexp ~ docs10k))
})
par(mfrow = c(1, 1))

Figure 17.3 The Impact of Scale of the Independent Variable on the Size of Regression Coefficients

# Figure 17.3: the same regression on percent urban vs proportion urban —
# rescaling X by 1/100 multiplies the slope by 100 (.192 -> 19.2).
countries2$urban_prop <- countries2$urban / 100
par(mfrow = c(1, 2))
with(countries2, {
  plot(urban, lifexp,
       xlab = "Percent Urban", ylab = "Life Expectancy",
       main = "y= 61.4 + .192x", cex.main = .9)
  abline(lm(lifexp ~ urban), lwd = 2)
  plot(urban_prop, lifexp,
       xlab = "Proportion Urban",
       ylab = "Life Expectancy",
       main = "y= 61.4 + 19.2x", cex.main = .9)
  abline(lm(lifexp ~ urban_prop), lwd = 2)
})

par(mfrow = c(1, 1))

Figure 18.1 Scatterplots Can be Used to Evaluate Linearity

# Figure 18.1: scatterplots of life expectancy against five predictors,
# used to eyeball whether each relationship is roughly linear.
par(mfrow = c(2, 3))
with(countries2, {
  plot(fert1520, lifexp,
       ylab = "Life Expectancy",
       xlab = "Fertility Rate")
  plot(mnschool, lifexp,
       ylab = "Life Expectancy",
       xlab = "Mean Years School")
  plot(log10(pop19_M), lifexp,
       ylab = "Life Expectancy",
       xlab = "Log of Population")
  plot(urban, lifexp,
       ylab = "Life Expectancy",
       xlab = "Percent Urban")
  plot(docs10k, lifexp,
       ylab = "Life Expectancy",
       xlab = "Doctors per 10k Population")
})
par(mfrow = c(1, 1))

Figure 18.2 A Non-constant Error Pattern

# Figure 18.2: simulate a regression whose error variance grows with the
# predictor, then plot residuals against (transformed) predicted values
# to show the non-constant error pattern.
set.seed(135)
n <- rep(1:100, 2)
a <- 0
b <- 1
sigma2 <- n^1.3  # error variance increases with n (heteroscedasticity)
eps <- rnorm(n, mean = 0, sd = sqrt(sigma2))
y <- a + b * n + eps
mod <- lm(y ~ n)

# Flip and shift the fitted values so the fan pattern opens rightward
yhat <- 100 - predict(mod)
# Plot residuals from model as a function of predicted values
plot(yhat, residuals(mod),
     xlab = "Predicted Outcome",
     ylab = "Residuals")