Exercise solutions
Chapter 2
Exercise 2.2
tab <- matrix(c(139, 443, 230, 502), nrow = 2, ncol = 2,
dimnames = list("Outcome" = c("Dead", "Alive"),
Smoker = c("Yes", "No")))
#### equivalent
tab <- matrix(c(139, 443, 230, 502), 2, 2)
dimnames(tab) <- list("Outcome" = c("Dead", "Alive"),
Smoker = c("Yes", "No"))
#### equivalent
tab <- matrix(c(139, 443, 230, 502), 2, 2)
rownames(tab) <- c("Dead", "Alive")
colnames(tab) <- c("Yes", "No")
names(dimnames(tab)) <- c("Outcome", "Smoker")
Exercise 2.3
Using the tab
object from previous solution, study and practice the
following R code to recreate Table 2.29.
rowt <- apply(tab, 1, sum)
tab2 <- cbind(tab, Total = rowt)
colt <- apply(tab2, 2, sum)
tab2 <- rbind(tab2, Total = colt)
names(dimnames(tab2)) <- c("Outcome", "Smoker")
Exercise 2.4
Study and execute the following R code:
rowt <- apply(tab, 1, sum) # row distrib
rowd <- sweep(tab, 1, rowt, "/")
colt <- apply(tab, 2, sum) # col distrib
cold <- sweep(tab, 2, colt, "/")
jtd <- tab/sum(tab) # joint distrib
ptabs <- list(row.distribution = rowd, col.distribution = cold,
joint.distribution = jtd)
Exercise 2.5
Using the tab2
object from previous solution, study and practice the
following R code to recreate Table 2.30. Note that the
column distributions could also have been used.
risk = tab2[1, 1:2]/tab2[3, 1:2]
risk.ratio <- risk/risk[2]
odds <- risk/(1 - risk)
odds.ratio <- odds/odds[2]
results <- rbind(risk, risk.ratio, odds, odds.ratio)
results.rounded <- round(results, 2)
Interpretation: The risk of death among non-smokers is higher than the risk of death among smokers, suggesting that there may be some confounding.
Exercise 2.6
wdat.oas <- xtabs(~outcome + age4 + smoker, data = wdat)
wdat.tot.oas <- apply(wdat.oas, c(2, 3), sum)
wdat.risk.oas <- sweep(wdat.oas, c(2, 3), wdat.tot.oas, "/")
wdat.risk.oas2 <- round(wdat.risk.oas, 2)
Interpretation: The risk of death is not larger in non-smokers, in fact it is larger among smokers in older age groups.
Chapter 3
Exercise 3.1
Approach 1: do the arithmetic using R as a calculator with number counts from Table 3.14.
pR1.D0 <- (234+55)/(270+80)
pR1.D1 <- (81+192)/(87+263)
pR1.D0Gm <- 234/270
pR1.D1Gm <- 81/87
pR1.D0Gw <- 55/80
pR1.D1Gw <- 192/263
Approach 2: do the arithmetic using R objects.
jp2 <-
read.csv('https://raw.githubusercontent.com/taragonmd/data/master/drugrx-pearl2.csv')
str(jp2)
tab2.rdg <- xtabs(~ Recovered + Drug + Gender, data = jp2)
tab2.rd <- apply(tab2.rdg, c(1,2), sum)
pR1.D0 <- tab2.rd['Yes','No']/sum(tab2.rd[,'No'])
pR1.D1 <- tab2.rd['Yes','Yes']/sum(tab2.rd[,'Yes'])
pR1.D0Gm <- tab2.rdg['Yes','No','Men']/sum(tab2.rdg[,'No','Men'])
pR1.D1Gm <- tab2.rdg['Yes','Yes','Men']/sum(tab2.rdg[,'Yes','Men'])
pR1.D0Gw <- tab2.rdg['Yes','No','Women']/sum(tab2.rdg[,'No','Women'])
pR1.D1Gw <- tab2.rdg['Yes','Yes','Women']/sum(tab2.rdg[,'Yes','Women'])
Approach 3: do the arithmetic using apply
and sweep
functions. Study and try this solution.
Approach 4: do the arithmetic using prop.table
functions. Study and
try this solution.
Exercise 3.2
- Evaluate the structure of data frame.
std89c <-
read.csv("https://raw.githubusercontent.com/taragonmd/data/master/syphilis89c.csv")
str(std89c)
head(std89c)
lapply(std89c, table)
- The
Age
variable levels are not ordered correctly. This is because R, by default, converts character vectors into factors with the levels in “alphabetical” order. One option is to setas.is=TRUE
when usingread.csv
, and set the factor levels after reading in the data.
std89c<-
read.csv("https://raw.githubusercontent.com/taragonmd/data/master/syphilis89c.csv",
as.is = c(FALSE, FALSE, TRUE))
str(std89c)
table(std89c$Age)
agelab<-c("<=14","15-19","20-24","25-29","30-34","35-44","45-54",">55")
std89c$Age <- factor(std89c$Age, levels = agelab)
table(std89c$Age)
- Create 3-dimensional arrays using both the
table
andxtabs
function, with and without attaching the data frame.
#> , , = Female
#>
#>
#> <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54
#> Black 165 92 2257 4503 3590 2628 1505 392
#> Other 11 15 158 307 283 167 149 40
#> White 14 24 253 475 433 316 243 55
#>
#> , , = Male
#>
#>
#> <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54
#> Black 31 823 1412 4059 4121 4453 3858 1619
#> Other 7 108 210 654 633 520 492 202
#> White 2 216 88 407 550 564 654 323
#> , , Sex = Female
#>
#> Age
#> Race <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54
#> Black 165 92 2257 4503 3590 2628 1505 392
#> Other 11 15 158 307 283 167 149 40
#> White 14 24 253 475 433 316 243 55
#>
#> , , Sex = Male
#>
#> Age
#> Race <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54
#> Black 31 823 1412 4059 4121 4453 3858 1619
#> Other 7 108 210 654 633 520 492 202
#> White 2 216 88 407 550 564 654 323
#> , , Sex = Female
#>
#> Age
#> Race <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54
#> Black 165 92 2257 4503 3590 2628 1505 392
#> Other 11 15 158 307 283 167 149 40
#> White 14 24 253 475 433 316 243 55
#>
#> , , Sex = Male
#>
#> Age
#> Race <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54
#> Black 31 823 1412 4059 4121 4453 3858 1619
#> Other 7 108 210 654 633 520 492 202
#> White 2 216 88 407 550 564 654 323
#> , , Sex = Female
#>
#> Age
#> Race <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54
#> Black 165 92 2257 4503 3590 2628 1505 392
#> Other 11 15 158 307 283 167 149 40
#> White 14 24 253 475 433 316 243 55
#>
#> , , Sex = Male
#>
#> Age
#> Race <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54
#> Black 31 823 1412 4059 4121 4453 3858 1619
#> Other 7 108 210 654 633 520 492 202
#> White 2 216 88 407 550 564 654 323
Exercise 3.3
Here we use the apply
function to get marginal totals for the
syphilis 3-dimensional array.
tab.ars <- xtabs(~ Age + Race + Sex, data = std89c)
#### 2-D tables
tab.ar <- apply(tab.ars, c(1, 2), sum); tab.ar
tab.as <- apply(tab.ars, c(1, 3), sum); tab.as
tab.rs <- apply(tab.ars, c(2, 3), sum); tab.rs
#### 1-D tables
tab.a <- apply(tab.ars, 1, sum); tab.a
tab.r <- apply(tab.ars, 2, sum); tab.r
tab.s <- apply(tab.ars, 3, sum); tab.s
Exercise 3.4
For this syphilis data example, we’ll choose one 3-D array.
tab.ars <- xtabs(~ Age + Race + Sex, data = std89c)
rowt <- apply(tab.ars, c(1, 3), sum) # row distrib
rowd <- sweep(tab.ars, c(1, 3), rowt, "/"); rowd
apply(rowd, c(1, 3), sum) # confirm
colt <- apply(tab.ars, c(2, 3), sum) # col distrib
cold <- sweep(tab.ars, c(2, 3), colt, "/"); cold
apply(cold, c(2, 3), sum) # confirm
jtt <- apply(tab.ars, 3, sum) # joint distrib
jtd <- sweep(tab.ars, 3, jtt, "/"); jtd
apply(jtd, 3, sum) # confirm
distr <- list(rowd, cold, jtd); distr
Exercise 3.5
Exercise 3.6
Use the rep
function on the data frame rows to recreate the
individual-level data frame with over 40,000 observations.
It’s a good idea to understand how the rep
function works
with two vectors:
#> [1] 1 1 1 1 2 2 2 2 2 3 3 3 3 3 3
We can see that the second vector determines the frequency of the first vector elements. Now use this understanding with the syphilis data.
std89b <-
read.csv("https://raw.githubusercontent.com/taragonmd/data/master/syphilis89b.csv",
as.is = c(FALSE, FALSE, TRUE, FALSE))
str(std89b)
std89b
expand.rows <- rep(1:nrow(std89b),std89b$Freq)
std89b.df <- std89b[expand.rows,]
agelab<-c("<=14","15-19","20-24","25-29","30-34","35-44","45-54",">55")
std89b.df$Age <- factor(std89b.df$Age, levels = agelab)
str(std89b.df)
table(std89b.df$Age)
Exercise 3.7
Chapter 5
Exercise 5.1
replicates <- 5000
num_of_events <- 289 # recovered
num_at_risk <- 350
risk <- num_of_events/num_at_risk
sim_events <- rbinom(n = replicates, size = num_at_risk, prob = risk)
conf.level <- 0.95
alpha <- 1 - conf.level
## 0.95 CI for binomial counts
quantile(sim_events, c(alpha/2, 1-alpha/2))
## 0.95 CI for binomial proportions
quantile(sim_events, c(alpha/2, 1-alpha/2))/num_at_risk
Exercise 5.2
risk.boot <- function(x, n, conf.level = 0.95, replicates = 5000){
## x = number of events (numerator)
## n = number at risk (denominator)
## conf.level = confidence level (default 0.95)
## replicates = number of simulations
## prepare inputs
num_of_events <- x
num_at_risk <- n
## do calculations
risk <- num_of_events/num_at_risk
sim_events <- rbinom(n = replicates, size = num_at_risk, prob = risk)
conf.level <- conf.level
alpha <- 1 - conf.level
ci <- quantile(sim_events, c(alpha/2, 1-alpha/2))
## collect results
list(x = num_of_events,
n = num_at_risk,
conf.int = ci,
conf.level = conf.level)
}
## test
risk.boot(10, 20)
Exercise 5.3
myRR <- function(x, conf.level = 0.95){
## x is 2x2 table with this layout
## |Outcome| Exposed |
## | | Yes | No | Sum |
## |-------+-----+-----+-----|
## | Yes | a | b | M_1 |
## | No | c | d | M_0 |
## |-------+-----+-----|-----|
## | Sum | N_1 | N_0 | T |
##
## Prepare input
tab <- addmargins(x)
aa <- tab[1, 1]
bb <- tab[1, 2]
N1 <- tab['Sum',1]
N0 <- tab['Sum',2]
## Do calculations
R1 <- aa/N1
R0 <- bb/N0
RR <- R1/R0
Z <- qnorm((1 + conf.level)/2)
logRR <- log(RR)
SE_logRR <- sqrt(1/aa - 1/N1 + 1/bb - 1/N0)
CI <- exp(logRR + c(-1, 1) * Z * SE_logRR)
pv <- fisher.test(tab[1:2,1:2])$p.value
## Collect results
list(data = tab,
risks = prop.table(x, 2),
risk.ratio = RR,
conf.level = conf.level,
conf.int = CI,
p.value = pv)
}
## test function
tab <- matrix(c(273, 289, 77, 61), 2, 2, byrow = TRUE,
dimnames = list (Recovered = c('Yes', 'No'),
Treatment = c('Yes', 'No')))
myRR(tab)
Exercise 5.4
url <- "https://raw.githubusercontent.com/taragonmd/data/master/exported/jp-drugrx-p2.txt"
drugrx <- read.csv(url)
str(drugrx)
########## ALL ##########
tab_all <- xtabs(~ Recovered + Drug, data = drugrx)[2:1,2:1]
myRR(tab_all)
########## MEN ##########
tab_men <- xtabs(~ Recovered + Drug + Gender, data = drugrx)[2:1,2:1,'Men']
myRR(tab_men)
########## WOMEN ##########
tab_women <- xtabs(~ Recovered + Drug + Gender, data = drugrx)[2:1,2:1,'Women']
myRR(tab_women)
Exercise 5.5
Exercise 5.6
Exercise 5.7
Exercise 5.8
Exercise 5.9