# Exercise solutions

## Chapter 2

Exercise 2.2

tab <- matrix(c(139, 443, 230, 502), nrow = 2, ncol = 2,
dimnames = list("Outcome" = c("Dead", "Alive"),
Smoker = c("Yes", "No")))
#### equivalent
tab <- matrix(c(139, 443, 230, 502), 2, 2)
dimnames(tab) <- list("Outcome" = c("Dead", "Alive"),
Smoker = c("Yes", "No"))
#### equivalent
tab <- matrix(c(139, 443, 230, 502), 2, 2)
colnames(tab) <- c("Yes", "No")
names(dimnames(tab)) <- c("Outcome", "Smoker")

Exercise 2.3

Using the tab object from previous solution, study and practice the following R code to recreate Table 2.29.

rowt <- apply(tab, 1, sum)
tab2 <- cbind(tab, Total = rowt)
colt <- apply(tab2, 2, sum)
tab2 <- rbind(tab2, Total = colt)
names(dimnames(tab2)) <- c("Outcome", "Smoker")

Exercise 2.4

Study and execute the following R code:

rowt <- apply(tab, 1, sum)          # row distrib
rowd <- sweep(tab, 1, rowt, "/")
colt <- apply(tab, 2, sum)          # col distrib
cold <- sweep(tab, 2, colt, "/")
jtd <- tab/sum(tab)                 # joint distrib
ptabs <- list(row.distribution = rowd, col.distribution = cold,
joint.distribution = jtd)

Exercise 2.5

Using the tab2 object from previous solution, study and practice the following R code to recreate Table 2.30. Note that the column distributions could also have been used.

risk = tab2[1, 1:2]/tab2[3, 1:2]
risk.ratio <- risk/risk[2]
odds <- risk/(1 - risk)
odds.ratio <- odds/odds[2]
results <- rbind(risk, risk.ratio, odds, odds.ratio)
results.rounded <- round(results, 2)

Interpretation: The risk of death among non-smokers is higher than the risk of death among smokers, suggesting that there may be some confounding.

Exercise 2.6

wdat.oas <- xtabs(~outcome + age4 + smoker, data = wdat)
wdat.tot.oas <- apply(wdat.oas, c(2, 3), sum)
wdat.risk.oas <- sweep(wdat.oas, c(2, 3), wdat.tot.oas, "/")
wdat.risk.oas2 <- round(wdat.risk.oas, 2)

Interpretation: The risk of death is not larger in non-smokers, in fact it is larger among smokers in older age groups.

## Chapter 3

Exercise 3.1

Approach 1: do the arithmetic using R as a calculator with number counts from Table 3.14.

pR1.D0 <- (234+55)/(270+80)
pR1.D1 <- (81+192)/(87+263)
pR1.D0Gm <- 234/270
pR1.D1Gm <- 81/87
pR1.D0Gw <- 55/80
pR1.D1Gw <- 192/263

Approach 2: do the arithmetic using R objects.

jp2 <-
str(jp2)
tab2.rdg <- xtabs(~ Recovered + Drug + Gender, data = jp2)
tab2.rd <- apply(tab2.rdg, c(1,2), sum)
pR1.D0 <- tab2.rd['Yes','No']/sum(tab2.rd[,'No'])
pR1.D1 <- tab2.rd['Yes','Yes']/sum(tab2.rd[,'Yes'])
pR1.D0Gm <- tab2.rdg['Yes','No','Men']/sum(tab2.rdg[,'No','Men'])
pR1.D1Gm <- tab2.rdg['Yes','Yes','Men']/sum(tab2.rdg[,'Yes','Men'])
pR1.D0Gw <- tab2.rdg['Yes','No','Women']/sum(tab2.rdg[,'No','Women'])
pR1.D1Gw <- tab2.rdg['Yes','Yes','Women']/sum(tab2.rdg[,'Yes','Women'])

Approach 3: do the arithmetic using apply and sweep functions. Study and try this solution.

sweep(tab2.rd, 2, apply(tab2.rd,2,sum), "/")
sweep(tab2.rdg, 2:3, apply(tab2.rdg, 2:3,sum), "/")

Approach 4: do the arithmetic using prop.table functions. Study and try this solution.

prop.table(tab2.rd, 2)
prop.table(tab2.rdg, 2:3)

Exercise 3.2

1. Evaluate the structure of data frame.
std89c <-
str(std89c)
lapply(std89c, table)
1. The Age variable levels are not ordered correctly. This is because R, by default, converts character vectors into factors with the levels in “alphabetical” order. One option is to set as.is=TRUE when using read.csv, and set the factor levels after reading in the data.
std89c<-
as.is = c(FALSE, FALSE, TRUE))
str(std89c)
table(std89c$Age) agelab<-c("<=14","15-19","20-24","25-29","30-34","35-44","45-54",">55") std89c$Age <- factor(std89c$Age, levels = agelab) table(std89c$Age)
1. Create 3-dimensional arrays using both the table and xtabs function, with and without attaching the data frame.
## Without attach
table(std89c$Race, std89c$Age, std89c$Sex) #> , , = Female #> #> #> <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54 #> Black 165 92 2257 4503 3590 2628 1505 392 #> Other 11 15 158 307 283 167 149 40 #> White 14 24 253 475 433 316 243 55 #> #> , , = Male #> #> #> <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54 #> Black 31 823 1412 4059 4121 4453 3858 1619 #> Other 7 108 210 654 633 520 492 202 #> White 2 216 88 407 550 564 654 323 xtabs(~ Race + Age + Sex, data = std89c) #> , , Sex = Female #> #> Age #> Race <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54 #> Black 165 92 2257 4503 3590 2628 1505 392 #> Other 11 15 158 307 283 167 149 40 #> White 14 24 253 475 433 316 243 55 #> #> , , Sex = Male #> #> Age #> Race <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54 #> Black 31 823 1412 4059 4121 4453 3858 1619 #> Other 7 108 210 654 633 520 492 202 #> White 2 216 88 407 550 564 654 323 ## With attach and detach attach(std89c) table(Race, Age, Sex) #> , , Sex = Female #> #> Age #> Race <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54 #> Black 165 92 2257 4503 3590 2628 1505 392 #> Other 11 15 158 307 283 167 149 40 #> White 14 24 253 475 433 316 243 55 #> #> , , Sex = Male #> #> Age #> Race <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54 #> Black 31 823 1412 4059 4121 4453 3858 1619 #> Other 7 108 210 654 633 520 492 202 #> White 2 216 88 407 550 564 654 323 xtabs(~ Race + Age + Sex) #> , , Sex = Female #> #> Age #> Race <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54 #> Black 165 92 2257 4503 3590 2628 1505 392 #> Other 11 15 158 307 283 167 149 40 #> White 14 24 253 475 433 316 243 55 #> #> , , Sex = Male #> #> Age #> Race <=14 >55 15-19 20-24 25-29 30-34 35-44 45-54 #> Black 31 823 1412 4059 4121 4453 3858 1619 #> Other 7 108 210 654 633 520 492 202 #> White 2 216 88 407 550 564 654 323 detach(std89c) Exercise 3.3 Here we use the apply function to get marginal totals for the syphilis 3-dimensional array. tab.ars <- xtabs(~ Age + Race + Sex, data = std89c) #### 2-D tables tab.ar <- apply(tab.ars, c(1, 2), sum); tab.ar tab.as <- apply(tab.ars, c(1, 3), sum); tab.as tab.rs <- apply(tab.ars, c(2, 3), sum); tab.rs #### 1-D tables tab.a <- apply(tab.ars, 1, sum); tab.a tab.r <- apply(tab.ars, 2, sum); tab.r tab.s <- apply(tab.ars, 3, sum); tab.s Exercise 3.4 For this syphilis data example, we’ll choose one 3-D array. tab.ars <- xtabs(~ Age + Race + Sex, data = std89c) rowt <- apply(tab.ars, c(1, 3), sum) # row distrib rowd <- sweep(tab.ars, c(1, 3), rowt, "/"); rowd apply(rowd, c(1, 3), sum) # confirm colt <- apply(tab.ars, c(2, 3), sum) # col distrib cold <- sweep(tab.ars, c(2, 3), colt, "/"); cold apply(cold, c(2, 3), sum) # confirm jtt <- apply(tab.ars, 3, sum) # joint distrib jtd <- sweep(tab.ars, 3, jtt, "/"); jtd apply(jtd, 3, sum) # confirm distr <- list(rowd, cold, jtd); distr  Exercise 3.5 table(std89c) data.frame(table(std89c)) Exercise 3.6 Use the rep function on the data frame rows to recreate the individual-level data frame with over 40,000 observations. It’s a good idea to understand how the rep function works with two vectors: rep(c(1,2,3), c(4,5,6)) #> [1] 1 1 1 1 2 2 2 2 2 3 3 3 3 3 3 We can see that the second vector determines the frequency of the first vector elements. Now use this understanding with the syphilis data. std89b <- read.csv("https://raw.githubusercontent.com/taragonmd/data/master/syphilis89b.csv", as.is = c(FALSE, FALSE, TRUE, FALSE)) str(std89b) std89b expand.rows <- rep(1:nrow(std89b),std89b$Freq)
std89b.df <- std89b[expand.rows,]
agelab<-c("<=14","15-19","20-24","25-29","30-34","35-44","45-54",">55")
std89b.df$Age <- factor(std89b.df$Age, levels = agelab)
str(std89b.df)
table(std89b.df$Age) Exercise 3.7 ## Chapter 5 Exercise 5.1 replicates <- 5000 num_of_events <- 289 # recovered num_at_risk <- 350 risk <- num_of_events/num_at_risk sim_events <- rbinom(n = replicates, size = num_at_risk, prob = risk) conf.level <- 0.95 alpha <- 1 - conf.level ## 0.95 CI for binomial counts quantile(sim_events, c(alpha/2, 1-alpha/2)) ## 0.95 CI for binomial proportions quantile(sim_events, c(alpha/2, 1-alpha/2))/num_at_risk Exercise 5.2 risk.boot <- function(x, n, conf.level = 0.95, replicates = 5000){ ## x = number of events (numerator) ## n = number at risk (denominator) ## conf.level = confidence level (default 0.95) ## replicates = number of simulations ## prepare inputs num_of_events <- x num_at_risk <- n ## do calculations risk <- num_of_events/num_at_risk sim_events <- rbinom(n = replicates, size = num_at_risk, prob = risk) conf.level <- conf.level alpha <- 1 - conf.level ci <- quantile(sim_events, c(alpha/2, 1-alpha/2)) ## collect results list(x = num_of_events, n = num_at_risk, conf.int = ci, conf.level = conf.level) } ## test risk.boot(10, 20) Exercise 5.3 myRR <- function(x, conf.level = 0.95){ ## x is 2x2 table with this layout ## |Outcome| Exposed | ## | | Yes | No | Sum | ## |-------+-----+-----+-----| ## | Yes | a | b | M_1 | ## | No | c | d | M_0 | ## |-------+-----+-----|-----| ## | Sum | N_1 | N_0 | T | ## ## Prepare input tab <- addmargins(x) aa <- tab[1, 1] bb <- tab[1, 2] N1 <- tab['Sum',1] N0 <- tab['Sum',2] ## Do calculations R1 <- aa/N1 R0 <- bb/N0 RR <- R1/R0 Z <- qnorm((1 + conf.level)/2) logRR <- log(RR) SE_logRR <- sqrt(1/aa - 1/N1 + 1/bb - 1/N0) CI <- exp(logRR + c(-1, 1) * Z * SE_logRR) pv <- fisher.test(tab[1:2,1:2])$p.value

## Collect results
list(data = tab,
risks = prop.table(x, 2),
risk.ratio = RR,
conf.level = conf.level,
conf.int = CI,
p.value = pv)
}

## test function
tab <- matrix(c(273, 289, 77, 61), 2, 2, byrow = TRUE,
dimnames = list (Recovered = c('Yes', 'No'),
Treatment = c('Yes', 'No')))
myRR(tab)

Exercise 5.4

url <- "https://raw.githubusercontent.com/taragonmd/data/master/exported/jp-drugrx-p2.txt"
str(drugrx)
########## ALL ##########
tab_all <- xtabs(~ Recovered + Drug, data = drugrx)[2:1,2:1]
myRR(tab_all)
########## MEN ##########
tab_men <- xtabs(~ Recovered + Drug + Gender, data = drugrx)[2:1,2:1,'Men']
myRR(tab_men)
########## WOMEN ##########
tab_women <- xtabs(~ Recovered + Drug + Gender, data = drugrx)[2:1,2:1,'Women']
myRR(tab_women)

Exercise 5.5

Exercise 5.6

Exercise 5.7

Exercise 5.8

Exercise 5.9