Appendix: Hidden R Code
The code chunks copied below can be used to replicate the figures from Introduction to Political and Social Data Analysis (using R). In many cases, you need to load one of the data sets used in the textbook, so make sure you have copied them all to your working directory.
# Load the textbook data sets (assumed to be in the working directory).
# Each load() call restores the saved data object(s) from the .rda file
# into the global environment (anes20, cces20, countries2, county20, ...).
load("anes20.rda")
load("cces20.rda")
load("countries2.rda")
load("county20.rda")
load("county20large.rda")
load("gdpvotes.rda")
load("votes1620.rda")
You should also make sure you have installed all of the packages used in the textbook, plus the MASS
and plotrix
packages.
Figure 1.2 Simulated examples of Strong and Weak Relationships
# Figure 1.2: side-by-side scatterplots of simulated weak (r = .35) and
# strong (r = .85) bivariate relationships.
library(MASS)  # for mvrnorm()

par(mfrow = c(1, 2))
set.seed(135)
n_obs <- 25

# Draw the strong-relationship data FIRST so the RNG stream matches the
# original figure exactly. empirical = TRUE rescales the draws so the
# sample correlation equals r exactly.
r <- .85
strong <- mvrnorm(n = n_obs, mu = c(0, 0),
                  Sigma = matrix(c(1, r, r, 1), nrow = 2),
                  empirical = TRUE)
X <- strong[, 1]  # standard normal (mu = 0, sd = 1)
Y <- strong[, 2]  # standard normal (mu = 0, sd = 1)

r <- .35
weak <- mvrnorm(n = n_obs, mu = c(0, 0),
                Sigma = matrix(c(1, r, r, 1), nrow = 2),
                empirical = TRUE)
X1 <- weak[, 1]  # standard normal (mu = 0, sd = 1)
Y1 <- weak[, 2]  # standard normal (mu = 0, sd = 1)

# Left panel: weak relationship, with fitted regression line
plot(X1, Y1, xlab = "Independent Variable A",
     ylab = "Dependent Variable", main = "Weak Relationship",
     sub = "(Correlation=.35)")
abline(lm(Y1 ~ X1))

# Right panel: strong relationship, with fitted regression line
plot(X, Y, xlab = "Independent Variable B",
     ylab = "Dependent Variable", main = "Strong Relationship",
     sub = "(Correlation=.85)")
abline(lm(Y ~ X))
par(mfrow = c(1, 1))
Figure 1.3 A Simple Test of the Retrospective Voting Hypothesis
# Figure 1.3: incumbent two-party vote share against the change in real
# GDP per capita, each point labeled with its election year.
with(gdpvotes, {
  plot(gdpchng, vote, cex = .5,
       xlab = "Percentage Change in Real GDP Per Capita, Q1 to Q3",
       ylab = "Incumbent Percent of Two party Vote",
       ylim = c(45, 65))
  # Fitted regression line for the retrospective-voting hypothesis
  abline(lm(vote ~ gdpchng))
  # Year labels just above each point
  text(gdpchng, vote, year, pos = 3, cex = .5)
})
Figure 1.4 Testing the Retrospective Voting Hypothesis in Two Different Contexts
# Figure 1.4: the retrospective-voting relationship estimated separately
# for elections with an incumbent running (openseat == 0) and for
# open-seat elections (openseat == 1).
par(mfrow=c(1,2))
# Left panel: incumbent-running elections only
plot(gdpvotes$gdpchng[gdpvotes$openseat==0],
gdpvotes$vote[gdpvotes$openseat==0], cex=.5,
ylim=c(40,65), main="Incumbent Running",
xlab="Change in GDP Per Capita, Q1 to Q3",
ylab="Incumbent Percent of Two party Vote",
sub = "(Correlation=.68)")
abline(lm(gdpvotes$vote[gdpvotes$openseat==0]~
gdpvotes$gdpchng[gdpvotes$openseat==0]))
# Label each point with its election year
text(gdpvotes$gdpchng[gdpvotes$openseat==0],
gdpvotes$vote[gdpvotes$openseat==0],
gdpvotes$year[gdpvotes$openseat==0], pos=3, cex=.5)
# Right panel: open-seat elections only
plot(gdpvotes$gdpchng[gdpvotes$openseat==1],
gdpvotes$vote[gdpvotes$openseat==1], cex=.5,
ylim=c(40,65), main="Open Seat",
xlab="Change in GDP Per Capita, Q1 to Q3",
ylab="Incumbent Percent of Two party Vote",
sub = "(Correlation=.16)")
abline(lm(gdpvotes$vote[gdpvotes$openseat==1]~
gdpvotes$gdpchng[gdpvotes$openseat==1]))
text(gdpvotes$gdpchng[gdpvotes$openseat==1],
gdpvotes$vote[gdpvotes$openseat==1],
gdpvotes$year[gdpvotes$openseat==1],
pos=3, cex=.5)
# Restore the single-panel layout
par(mfrow=c(1,1))
Figure 5.3 Illustrations of Different Levels of Skewness
# Figure 5.3: density plots illustrating positive skew, negative skew,
# and a symmetric (no-skew) distribution.
par(mfrow = c(2, 2))

# Positive (right) skew: squared standard normals pile up near zero.
set.seed(5)
right_skew <- rnorm(10000)^2
plot(density(right_skew), xlim = c(-.5, 6), cex.main = .8,
     main = "Positive (Right) Skew: Mean>Median",
     lwd = 2,
     xlab = "X")
abline(v = mean(right_skew))
abline(v = median(right_skew), lty = 2)
legend("topright", legend = c("Mean", "Median"),
       lty = 1:2, cex = .7)

# Negative (left) skew: mirror the same draws (same seed) and shift by 6.
set.seed(5)
left_skew <- 6 - rnorm(10000)^2
plot(density(left_skew), cex.main = .8,
     main = "Negative (Left) Skew: Mean<Median",
     xlab = "X", xlim = c(0, 6.5), lwd = 2)
abline(v = mean(left_skew))
abline(v = median(left_skew), lty = 2)
legend("topleft", legend = c("Mean", "Median"),
       lty = 1:2, cex = .7)

# No skew: a symmetric normal curve, so mean = median = 3.
set.seed(12)
x <- seq(0, 6, length = 1000)
y <- dnorm(x, mean = 3, sd = .75)
plot(x, y, type = "l", lwd = 2,
     main = "No Skew (Mean=Median)",
     cex.main = .8, xlab = "X",
     ylab = "Density")
abline(v = 3)
legend("topright", legend = c("Mean, Median"),
       lty = c(1), cex = .7)
par(mfrow = c(1, 1))
Figure 6.1 Distributions with Identical Central Tendencies but Different Levels of Dispersion
# Figure 6.1: three normal curves with the same center (mean = median
# = 10) but increasing spread (sd = .5, 1.1, 2.5).
par(mfrow = c(2, 2))
for (spread in c(.5, 1.1, 2.5)) {
  # Keep the names x and y so the default behavior (and the final
  # workspace state) matches the unrolled version of this code.
  x <- seq(4, 16, length = 500)
  y <- dnorm(x, mean = 10, sd = spread)
  plot(x, y, type = "l", ylab = "Density",
       lwd = 1, main = "Mean=10, Median=10",
       cex.axis = .7, cex.main = 1.3)
}
par(mfrow = c(1, 1))
Figure 6.4 Comparing the Empirical Histogram for Age with the Normal Curve
# Figure 6.4: histogram of respondent age with a normal curve overlay.
# First, create cces20$age from cces20$birthyr (survey fielded in 2020).
cces20$age <- 2020 - cces20$birthyr

# Density-scale histogram so the normal curve is comparable.
# Was `prob=T`: spelled out to avoid partial argument matching
# (hist()'s formal argument is `probability`).
hist(cces20$age, xlab = "Age of respondent",
     main = "",
     probability = TRUE,
     xlim = c(0, 100), ylim = c(0, .03))

# Dashed reference line at age 66
abline(v = c(66), lty = 2, lwd = 2)

# Overlay a normal curve using the sample mean and sd of age
curve(dnorm(x, mean = mean(cces20$age,
                           na.rm = TRUE),
            sd = sd(cces20$age, na.rm = TRUE)),
      lwd = 2, add = TRUE)
legend("topright", legend = c("Age=66", "Normal Curve"),
       lty = c(2, 1), lwd = c(2, 2), cex = .9)
Figure 7.2 Simulated Results from Large and Small Coin Toss Samples
# Figure 7.2: percentage of heads across simulated coin-toss samples of
# increasing size, converging on the expected 50%.

# Set seed to ensure identical results
set.seed(2020)
# Two possible outcomes for a single toss
coin <- c("Heads", "Tails")
# Take a random sample of outcomes from ten tosses.
# Was `rep=T`: spelled out to avoid partial argument matching.
# NOTE(review): coin10 is not used below — the plotted percentages are
# hard-coded; confirm it is shown elsewhere in the text before removing.
coin10 <- sample(coin, 10, replace = TRUE)

# Hard-coded simulation results: number of tosses and percent heads
tosses <- c(10, 20, 60, 100, 150, 300, 500, 1000,
            2000, 400, 750, 1500, 1250, 1750)
Heads <- c(30, 33.3, 40, 58, 50, 51, 52, 50.9, 50.5,
           50, 52, 49.7, 48.2, 52.6)
coin_tosses <- data.frame(tosses, Heads)

plot(coin_tosses$tosses, coin_tosses$Heads,
     xlab = "Number of Coin Tosses",
     ylab = "Percent Heads", ylim = c(25, 65))
# Reference line at the long-run expected percentage
abline(h = 50)
Figure 7.3 Simulated Results from Large and Small Samples of Die Rolls
# Figure 7.3: outcome proportions from 100 vs. 2000 simulated die
# rolls, each compared with the uniform expectation (1/6 ~ .167).
set.seed(2020)
# The six faces of a fair die
die <- c(1, 2, 3, 4, 5, 6)
die100 <- sample(die, 100, replace = TRUE)
die2000 <- sample(die, 2000, replace = TRUE)

par(mfrow = c(1, 2))
barplot(prop.table(table(die100)),
        main = "100 Rolls", ylab = "Proportion",  # fixed typo "Propotion"
        xlab = "Outcome",                         # fixed typo "Ouctome"
        ylim = c(0, .25))
abline(h = .167, lwd = 2)
barplot(prop.table(table(die2000)),
        main = "2000 Rolls", ylab = "Proportion",  # fixed typo "Propotion"
        xlab = "Outcome",
        ylim = c(0, .25))
abline(h = .167, lwd = 2)
par(mfrow = c(1, 1))
Figure 8.1 A Theoretical Sampling Distribution, mu=34.04, Sample Size=300
# Figure 8.1: theoretical sampling distribution of the mean for samples
# of 300 counties, centered on mu = 34.04.
means_grid <- seq(30, 38, by = .025)
# Normal density with sd equal to the standard error: sd / sqrt(n)
means_dens <- dnorm(means_grid, mean = 34.04,
                    sd = sd(county20$d2pty20)/sqrt(300))
plot(means_grid, means_dens, type = "l", ylab = "Density",
     lwd = 2, xlab = "Distribution of Sample Means")
axis(1, at = c(30:38))
# Vertical line at the population mean
abline(v = 34.04)
legend("topright",
       legend = c("Mu=34.04"), lty = 1, cex = .8)
Figure 8.2 Normal Distribution and Simulated Sampling Distribution, 50 Samples of 50 Counties
# Figure 8.2: simulated sampling distribution of the mean (50 samples
# of 50 counties) compared with a normal curve having the same mean
# and sd as the simulated means.
set.seed(251)

# Preallocate storage for the fifty sample means
sample_means50 <- rep(NA, 50)

# Draw 50 samples of 50 counties each; store each sample mean
for (i in seq_len(50)) {
  samp <- sample(county20$d2pty20, 50)
  sample_means50[i] <- mean(samp, na.rm = TRUE)
}

# Kernel density of the simulated means vs. the fitted normal curve
d50 <- density(sample_means50)
plot(d50, ylim = c(0, .19), xlab = "Sample Means",
     lwd = 2, xlim = c(26, 42), xaxt = "n", main = "")
curve(dnorm(x, mean = mean(sample_means50),
            sd = sd(sample_means50)),
      lwd = 2, lty = 2, add = TRUE)
axis(1, at = c(26:42))
legend("topright", legend = c("Sampling Dist", "Normal Dist"),
       lty = 1:2, cex = .8)
Figure 8.3 Normal Distribution and Sampling Distribution, 500 Samples of 50 Counties
# Figure 8.3: simulated sampling distribution from 500 samples of 50
# counties, compared with the fitted normal curve.
set.seed(251)
# Preallocate all 500 slots. BUG FIX: the original allocated only 50
# (rep(NA, 50)) and silently relied on R growing the vector inside the
# 500-iteration loop.
sample_means500 <- rep(NA, 500)
for (i in seq_len(500)) {
  samp <- sample(county20$d2pty20, 50)
  sample_means500[i] <- mean(samp, na.rm = TRUE)
}

# Kernel density of the simulated means vs. the fitted normal curve
d500 <- density(sample_means500)
plot(d500, xlab = "Sample Means", lwd = 2,
     main = "", ylim = c(0, .18),
     xlim = c(26, 42), xaxt = "n")
axis(1, at = c(26:42))
curve(dnorm(x, mean = mean(sample_means500),
            sd = sd(sample_means500)),
      lwd = 2, lty = 2, add = TRUE)
legend("topright", legend = c("Sampling Dist", "Normal Dist"),
       lty = 1:2, cex = .8)
Figure 8.4 Fifty 68 Percent Confidence Intervals, n=100, mu=34.04
# Figure 8.4: fifty 68% confidence intervals (mean +/- one standard
# error) from samples of n = 100 counties; filled points mark the
# intervals that miss the population mean (mu = 34.04).

# Storage for the mean and sd of each of the fifty samples
samp_mean <- rep(NA, 50)
samp_sd <- rep(NA, 50)
n <- 100

# Obtain fifty samples of size n = 100 from the population
set.seed(250)
for (i in seq_len(50)) {
  samp <- sample(county20$d2pty20, n)
  # NOTE(review): no na.rm here, unlike the other chunks that sample
  # d2pty20 — confirm the variable has no missing values.
  samp_mean[i] <- mean(samp)  # save sample mean of each sample
  samp_sd[i] <- sd(samp)      # save sample sd of each sample
}

# 68% CI: mean +/- one standard error
lower_vector <- samp_mean - samp_sd / sqrt(n)
upper_vector <- samp_mean + samp_sd / sqrt(n)
y <- samp_mean
cidata <- data.frame(y, lower_vector, upper_vector)

# Plot the fifty intervals
library(plotrix)
plotCI(1:50, y, ui = upper_vector, li = lower_vector,
       ylab = "Sample Estimates", xlab = "Sample")

# Mark the intervals that do not cover mu. points() is vectorized, so
# one call replaces the original twelve copy-pasted calls.
misses <- c(2, 4, 6, 11, 14, 17, 20, 21, 36, 39, 43, 48)
points(x = misses, y = rep(34.04, length(misses)), pch = 16)
abline(h = 34.04)
Figure 8.5 Width of a 95 Percent Confidence Interval at Different Sample Sizes (Std Dev.=15.89)
# Figure 8.5: width of a 95% confidence interval (sd = 15.89) as the
# sample size grows from 100 to 3000.
obs <- seq(100, 3000, 100)
# Full interval width = 2 * (critical value * standard error)
width <- 2 * (1.96 * (15.89 / sqrt(obs)))
plot(obs, width,
     ylab = "Width of Confidence Interval",
     xlab = "Sample Size", type = "l")
Figure 8.6 Normal and Sampling Distribution (Proportion), 500 Samples of 50 Counties
# Figure 8.6: sampling distribution of a proportion (share of counties
# won by the Democratic candidate) from 500 samples of 50 counties.
set.seed(251)
# Indicator: 1 if the Democratic two-party share exceeds 50
demwin <- as.numeric(county20$d2pty20 > 50)

# Preallocate all 500 slots. BUG FIX: the original allocated only 50
# (and its comment said 50) while the loop runs 500 times, relying on
# R silently growing the vector.
sample_prop500 <- rep(NA, 500)

# Run through the data 500 times, taking 50-county samples of 'demwin';
# the mean of each 0/1 sample is a sample proportion.
for (i in seq_len(500)) {
  samp <- sample(demwin, 50)
  sample_prop500[i] <- mean(samp, na.rm = TRUE)
}

# Kernel density of the sample proportions vs. the fitted normal curve
dwin500 <- density(sample_prop500)
plot(dwin500, lwd = 2, main = "",
     xlab = "Sample Proportions")
curve(dnorm(x, mean = mean(sample_prop500),
            sd = sd(sample_prop500)), lwd = 2, lty = 2, add = TRUE)
legend("topright",
       legend = c("Sampling Dist", "Normal Dist"),
       lty = 1:2, cex = .8)
Figure 9.1
# Figure 9.1: standard normal curve annotated to illustrate the logic of
# a hypothesis test (an observed z of -1.96 versus the null value mu).
x <- seq(-4,4,by=.05)
yz <- dnorm(x)
plot(x, yz, type="l",
ylab="Density", xlab="z-score",
ylim=c(0,.43))
# Dashed tick at the observed z-score (-1.96)
segments(x0=-1.96,y0=-.01,x1=-1.96,y1=.06,lty=2)
# Heavy line at the null-hypothesis center (z = 0, labeled mu)
segments(x0=0,y0=-.01,x1=0,y1=.396,lwd=3)
text(0,.415,expression(mu))
text(-1.3,0, expression(z[bar(x)]==-1.96), cex=.8)
# Explanatory captions; bty="n" suppresses the legend box
legend(-4.5, .35,legend=c("What is the probability",
"of getting a sample",
"outcome of this",
"value..."),
bty="n", cex=.85)
legend("topright",legend=c("...if the population value",
"is equal to this?"),
bty="n", cex=.85)
# Arrows from each caption to the line it refers to (hand-tuned
# coordinates)
arrows(x0 = -3,
x1 = -1.96,
y0 = .21,
y1 = .06, lwd=1.5)
arrows(x0 = 2.2,
x1 = 0,
y0 = .36,
y1 = .25, lwd=1.5)
Figure 9.2 An Illustration of Key Concepts in Hypothesis Testing
# Figure 9.2: normal curve illustrating key hypothesis-testing concepts
# for a test about average annual sick-leave hours.
# NOTE(review): the mean (59.2) and sd (1.538, presumably the standard
# error) are hard-coded from the textbook example — confirm there.
x <- seq(53, 65, length=1000)
y <- dnorm(x, mean=59.2, sd=1.538)
plot(x, y, type="l", lwd=1, main="",
xlab="Average Annual Sick Leave Hours",
ylab="Density", cex=.8, yaxs = "i")
# Thin solid line at the null mean (mu = 59.2)
segments(x0=59.2,y0=-.03,x1=59.2,y1=.2594)
# Dashed line at the critical value (56.19)
segments(x0=56.19,y0=-.07,x1=56.19,y1=.05, lty=2,lwd=3)
# Heavy solid line at the observed sample mean (54.8)
segments(x0=54.8,y0=-.05,x1=54.8,y1=.02, lwd = 3)
legend("topright",
legend=c("Mu", "C.V.","Sample Mean"),
lty =c(1, 2, 1),lwd =c(1, 2, 3), cex=.8)
Figure 9.3 Critical Values for One and Two-tailed Tests
# Figure 9.3: standard normal curve with critical values marked for
# one-tailed (z = -1.65) and two-tailed (z = +/-1.96) tests at p = .05.
x <- seq(-3.5, 3.5, length = 1000)
y <- dnorm(x, mean = 0, sd = 1)
plot(x, y, type = "l", lwd = 1, main = "",
     xlab = "Critical Values for p=.05, (one-tailed and two-tailed)", ylab = "Density", yaxs = "i")
legend("topright",
       legend = c("One-tailed, z= -1.65", "Two-tailed, z=+/-1.96"),
       lty = 1:2, cex = 0.8)
# One-tailed critical value (solid). BUG FIX: removed a stray trailing
# comma that left an empty argument in the original segments() call.
segments(x0 = -1.65, y0 = -.02, x1 = -1.65, y1 = .098)
# Two-tailed critical values (dashed)
segments(x0 = -1.96, y0 = -.02, x1 = -1.96, y1 = .06, lty = 2)
segments(x0 = 1.96, y0 = -.02, x1 = 1.96, y1 = .06, lty = 2)
Figure 9.4 Comparison of Normal and t-Distributions
# Figure 9.4: a t-distribution with 3 degrees of freedom against the
# standard normal; the t-distribution has heavier tails.
t_grid <- seq(-5, 5, by = .05)
normal_dens <- dnorm(t_grid)
t3_dens <- dt(t_grid, df = 3)
# Dashed curve: t with df = 3
plot(t_grid, t3_dens, type = "l", lty = 2, lwd = 2, ylim = c(0, .40),
     ylab = "Density",
     xlab = "t(z)-score")
# Solid curve: standard normal
lines(t_grid, normal_dens, lty = 1, lwd = 2)
legend("topright",
       legend = c("t-distribution", "Normal Distribution"),
       lty = c(2, 1), lwd = c(2, 2), cex = .75)
Figure 9.5 Degrees of Freedom and Resemblance of t-distribution to the Normal Distribution
# Figure 9.5: as degrees of freedom increase (3 -> 5 -> 7), the
# t-distribution converges toward the standard normal.
score <- seq(-3, 3, by = .05)
curves <- list(normal = dnorm(score),
               df7 = dt(score, df = 7),
               df5 = dt(score, df = 5),
               df3 = dt(score, df = 3))
plot(score, curves$df7, type = "l", lty = 2, ylim = c(0, .40),
     ylab = "Density", xlab = "t-score")
lines(score, curves$df5, lty = 3)
lines(score, curves$df3, lty = 4)
lines(score, curves$normal, lty = 1)
legend("topright",
       legend = c("Normal", "df=7", "df=5", "df=3"),
       lty = c(1, 2, 3, 4), cex = .8)
Figure 10.1
# Figure 10.1: t-distribution (df = 500) annotated to illustrate a
# two-sample difference-of-means test (observed t = -1.96 versus a null
# difference of mu1 - mu2).
x <- seq(-4,4,by=.05)
yt <- dt(x,df=500)
plot(x, yt, type="l",
ylab="Density", xlab="t-score",
ylim=c(0,.43))
# Dashed tick at the observed t-statistic (-1.96)
segments(x0=-1.96,y0=-.01,x1=-1.96,y1=.06,lty=2)
# Heavy line at the null-hypothesis difference (t = 0)
segments(x0=0,y0=-.01,x1=0,y1=.396,lwd=3)
text(0,.41,expression(mu["1"] - mu["2"]))
text(-1.1,0, expression(t[(bar(x)["1"]-bar(x)["2"])]==-1.96), cex=.8)
# Explanatory captions; bty="n" suppresses the legend box
legend(-4.5, .35,legend=c("What is the probability",
"of getting a sample",
"difference of this",
"size..."),
bty="n", cex=.85)
legend("topright",legend=c("...if the difference in the",
"population is equal to this?"),
bty="n", cex=.85)
# Arrows from each caption to the line it refers to (hand-tuned
# coordinates)
arrows(x0 = -3.2,
x1 = -1.96,
y0 = .21,
y1 = .06, lwd=1.5)
arrows(x0 = 2,
x1 = 0,
y0 = .36,
y1 = .25, lwd=1.5)
Figure 11.1 F-Distribution and Critical Value for dfw=179, dfb=3, p=.05
# Figure 11.1: F-distribution with 3 (between) and 179 (within) degrees
# of freedom; the vertical tick marks the p = .05 critical value (2.66).
df_between <- 3
df_within <- 179
curve(df(x, df1 = df_between, df2 = df_within),
      from = 0, to = 7, xlab = "F-Ratio",
      ylab = "Density", main = "",
      ylim = c(0, .75), cex.main = .8, yaxs = "i")
# Marker at the critical value
segments(x0 = 2.66, y0 = .00, x1 = 2.66, y1 = .067, lwd = 2)
legend("topright", legend = c("C.V. = 2.66"), lwd = 2, cex = .9)
Figure 12.1 Example of a Mosaic Plot
# Figure 12.1: mosaic plot of religious importance by region of
# residence, built with descr::crosstab().
library(descr)
#Create religious importance variable (copy of ANES item V201433)
anes20$relig_imp<-anes20$V201433
#Recode Religious Importance to three categories: the five original
#levels are collapsed, in order, to High/High/Moderate/Low/Low
levels(anes20$relig_imp)<-c("High","High","Moderate", "Low","Low")
#Change order to Low-Moderate-High (makes the factor ordered)
anes20$relig_imp<-ordered(anes20$relig_imp,
levels=c("Low","Moderate", "High"))
#Use "plot=T" to get mosaic plot, add graphing details.
#prop.c=T requests column proportions and chisq=T a chi-square test
#in the printed crosstab.
tab_plots<-crosstab(anes20$relig_imp, anes20$V203003, prop.c=T,
plot=T,
chisq=T,
xlab="Region of Residence",
ylab="Religious Importance",
main="Mosaic Plot: Religious Importance by region")
Figure 12.2 Chi-Square Distribution, df=6
# Figure 12.2: chi-square distribution with 6 degrees of freedom and
# its p = .05 critical value (12.592).
crit_value <- 12.592
curve(dchisq(x, df = 6), from = 0,
      to = 25, xlab = "Chi-square",
      ylab = "Density", yaxs = "i",
      xaxs = "i", cex.main = .8, ylim = c(0, .15))
# Vertical marker at the critical value
segments(x0 = crit_value, y0 = 0, x1 = crit_value, y1 = .017, lwd = 2)
legend("topright", legend = c("cv = 12.592"),
       lty = 1, lwd = 2, cex = .8)
Figure 14.1 Generic Positive and Negative Correlations and Scatterplot Patterns
# Figure 14.1: six simulated scatterplots illustrating generic negative
# and positive correlations (r = -/+ .25, .50, .75). Replaces six
# near-identical copy-pasted simulation/plot stanzas with two helpers.
library(MASS)  # for mvrnorm()
set.seed(35)
par(mfrow = c(2, 3))

# Draw n bivariate-normal points whose SAMPLE correlation is exactly r
# (empirical = TRUE rescales the draws to match the target moments).
sim_corr <- function(r, n = 200) {
  mvrnorm(n = n, mu = c(0, 0),
          Sigma = matrix(c(1, r, r, 1), nrow = 2),
          empirical = TRUE)
}

# Generate in the ORIGINAL order (-.5, -.25, -.75, .5, .25, .75) so the
# RNG stream — and therefore the plotted point clouds — are unchanged.
gen_order <- c(-.5, -.25, -.75, .5, .25, .75)
sims <- lapply(gen_order, sim_corr)
names(sims) <- as.character(gen_order)

# Draw one panel for the simulation with correlation r.
panel <- function(r, title) {
  xy <- sims[[as.character(r)]]
  plot(xy[, 1], xy[, 2], main = title,
       xlab = "X", ylab = "Y", cex = .8, cex.main = 1.3, pch = 20)
}

# Panels left-to-right, top-to-bottom: negative then positive, weakest
# to strongest (titles reproduced byte-for-byte from the original).
panel(-.25, "r = -.25")
panel(-.5, "r =-.50")
panel(-.75, "r = -.75")
panel(.25, "r = .25")
panel(.5, "r =.50")
panel(.75, "r = .75")
par(mfrow = c(1, 1))
Figure 15.1 The Regression Line Superimposed in the Scatterplot
# Figure 15.1: 2020 vs. 2016 Democratic two-party vote share with the
# fitted regression line superimposed.
plot(votes1620$vote16, votes1620$vote20,
xlab="Dem two-party % 2016",
ylab="Dem two-party % 2020", ylim=c(35,60))
abline(lm(votes1620$vote20~votes1620$vote16))
# Arrows trace a prediction: up from x = 39.3 to the line, then left
# toward the predicted value on the y-axis (hand-tuned coordinates)
arrows(x0 = 39.3,
x1 = 39.3,
y0 = 34.1,
y1 = 39.5, lwd=1.5)
arrows(x0 = 39.3,
x1 = 34.7,
y0 = 39.5,
y1 = 39.5, lwd=1.5)
# Regression equation label. NOTE(review): the coefficients are
# hard-coded — confirm they match the lm() fit above.
legend(38,50, legend=expression(hat(y)=="-4.289 + 1.115x"),
bty = "n", cex=.8)
Figure 15.2 Prediction Errors (Residuals) in the Regression Model
# Figure 15.2: the same scatterplot and regression line, with vertical
# segments marking residuals (prediction errors) for selected points.
plot(votes1620$vote16, votes1620$vote20,
xlab="Dem two-party % 2016",
ylab="Dem two-party % 2020", ylim=c(35,60))
abline(lm(votes1620$vote20~votes1620$vote16))
# Each segment runs from y0 (apparently the fitted value at that x —
# e.g. 39.55 for x = 39.3; confirm against the model) to y1 (the
# observed value). Coordinates are hard-coded.
segments(x0=35.6,y0=35.43,x1=35.6,y1=37.1 )
segments(x0=49.4,y0=50.81,x1=49.4,y1=48.3 )
segments(x0=51.5,y0=53.15,x1=51.5,y1=54.7 )
segments(x0=50.2,y0=51.70,x1=50.2,y1=53.8 )
segments(x0=58.3,y0=60.74,x1=58.3,y1=60.6 )
segments(x0=39.3,y0=39.55,x1=39.3,y1=37.6 )
segments(x0=49.6,y0=51.03,x1=49.6,y1=50.3 )
Figure 15.3 Prediction Error without a Regression Model
# Figure 15.3: prediction errors when predicting vote20 from its mean
# alone (no regression model); vertical segments show the distance
# from selected observations to the mean line.
plot(votes1620$vote16, votes1620$vote20,
     xlab = "Dem two-party % 2016",
     ylab = "Dem two-party % 2020", ylim = c(35, 60))
# Horizontal line at the mean of the dependent variable
abline(h = mean(votes1620$vote20))

# Selected observations (hard-coded x and observed y); the legend uses
# the rounded mean 48.914. segments() is vectorized, so one call
# replaces the original seven copy-pasted calls.
err_x <- c(35.6, 49.4, 51.5, 50.2, 58.3, 39.3, 49.6)
err_y <- c(37.1, 48.3, 54.7, 53.8, 60.6, 37.6, 50.3)
segments(x0 = err_x, y0 = 48.914, x1 = err_x, y1 = err_y)
legend(41, 52, legend = "Mean=48.914", bty = "n", cex = .85)
Figure 17.1 Alternative Models for the Relationship between Doctors per 10k and Life Expectancy
# Figure 17.1: linear vs. logarithmic models of life expectancy as a
# function of doctors per 10k population.
#Life expectancy by "docs10k" (linear model)
fit_raw<-lm(countries2$lifexp~countries2$docs10k,
na.action=na.exclude)
#Life expectancy by "log10(docs10k)" (logged model)
fit_log<-lm(countries2$lifexp~log10(countries2$docs10k),
na.action=na.exclude)
# Store fitted values (na.exclude keeps them aligned with the rows).
# NOTE(review): yhat and yhatlog are not used below — confirm they are
# needed elsewhere before removing.
countries2$yhat<-predict(fit_raw)
countries2$yhatlog<-predict(fit_log)
plot(countries2$docs10k,countries2$lifexp,
xlab="Doctors per 10k Population",
ylab="Life Expectancy")
# Logged fit drawn from hard-coded coefficients — confirm they match
# coef(fit_log)
curve(63.4 + 9.5824*log10(x), .01, 85,
lwd=2,add=T)
# Linear fit (dashed)
abline(fit_raw, lty=2, lwd=2)
legend("right",
legend=c(expression(hat(y)=="a + b*x"),
expression(hat(y)=="a + b*log10(x)")),
lty=c(2,1), lwd=c(2,2), cex=.8)
Figure 17.2 A Segmented View of a Curvilinear Relationship
# Figure 17.2: life expectancy vs. doctors per 10k, split at the median
# of docs10k (14.781) to show how the slope changes across the range.
below_median <- subset(countries2, docs10k <= 14.781)
above_median <- subset(countries2, docs10k > 14.781)
par(mfrow = c(1, 2))
# Left panel: countries at or below the median
plot(below_median$docs10k, below_median$lifexp,
     ylim = c(50, 85),
     xlab = "Doctors per 10k",
     ylab = "Life Expectancy", sub = " (r=.67)",
     main = "LT Median Docs/10k",
     cex.main = .8)
abline(lm(below_median$lifexp ~ below_median$docs10k))
# Right panel: countries above the median
plot(above_median$docs10k, above_median$lifexp,
     ylim = c(50, 85), xlab = "Doctors per 10k",
     ylab = "Life Expectancy", sub = "(r=.21)",
     main = "GT Median Docs/10k",
     cex.main = .8)
abline(lm(above_median$lifexp ~ above_median$docs10k))
par(mfrow = c(1, 1))
Figure 17.3 The Impact of Scale of the Independent Variable on the Size of Regression Coefficients
# Figure 17.3: the same relationship plotted twice; rescaling the
# predictor from percent (0-100) to proportion (0-1) multiplies the
# slope by 100 without changing the fit.
countries2$urban_prop <- countries2$urban / 100
par(mfrow = c(1, 2))
# Percent urban: slope .192
plot(countries2$urban, countries2$lifexp,
     xlab = "Percent Urban", ylab = "Life Expectancy",
     main = "y= 61.4 + .192x", cex.main = .9)
abline(lm(countries2$lifexp ~ countries2$urban), lwd = 2)
# Proportion urban: slope 19.2
plot(countries2$urban_prop, countries2$lifexp,
     xlab = "Proportion Urban",
     ylab = "Life Expectancy",
     main = "y= 61.4 + 19.2x", cex.main = .9)
abline(lm(countries2$lifexp ~ countries2$urban_prop), lwd = 2)
par(mfrow = c(1, 1))
Figure 18.1 Scatterplots Can be Used to Evaluate Linearity
# Figure 18.1: scatterplots of life expectancy against each candidate
# predictor, used to eyeball linearity before modeling.
par(mfrow = c(2,3))
# Named list of predictors; population is log-scaled before plotting
predictors <- list(
  "Fertility Rate" = countries2$fert1520,
  "Mean Years School" = countries2$mnschool,
  "Log of Population" = log10(countries2$pop19_M),
  "Percent Urban" = countries2$urban,
  "Doctors per 10k Population" = countries2$docs10k
)
for (label in names(predictors)) {
  plot(predictors[[label]], countries2$lifexp,
       ylab = "Life Expectancy",
       xlab = label)
}
par(mfrow = c(1,1))
Figure 18.2 A Non-constant Error Pattern