Exercise solutions

Chapter 2

Exercise 2.2

tab <- matrix(c(139, 443, 230, 502), nrow = 2, ncol = 2,
        dimnames = list("Outcome" = c("Dead", "Alive"),
                        Smoker = c("Yes", "No")))
#### equivalent
tab <- matrix(c(139, 443, 230, 502), 2, 2)
dimnames(tab) <- list("Outcome" = c("Dead", "Alive"),
                  Smoker = c("Yes", "No"))
#### equivalent
tab <- matrix(c(139, 443, 230, 502), 2, 2)
rownames(tab) <- c("Dead", "Alive")
colnames(tab) <- c("Yes", "No")
names(dimnames(tab)) <- c("Outcome", "Smoker")

Exercise 2.3

Using the tab object from previous solution, study and practice the following R code to recreate Table 2.29.

rowt <- apply(tab, 1, sum)
tab2 <- cbind(tab, Total = rowt)
colt <- apply(tab2, 2, sum)
tab2 <- rbind(tab2, Total = colt)
names(dimnames(tab2)) <- c("Outcome", "Smoker")

Exercise 2.4

Study and execute the following R code:

rowt <- apply(tab, 1, sum)          # row distrib
rowd <- sweep(tab, 1, rowt, "/")
colt <- apply(tab, 2, sum)          # col distrib
cold <- sweep(tab, 2, colt, "/")
jtd <- tab/sum(tab)                 # joint distrib
ptabs <- list(row.distribution = rowd, col.distribution = cold, 
              joint.distribution = jtd)

Exercise 2.5

Using the tab2 object from previous solution, study and practice the following R code to recreate Table 2.30. Note that the column distributions could also have been used.

risk = tab2[1, 1:2]/tab2[3, 1:2]
risk.ratio <- risk/risk[2]
odds <- risk/(1 - risk)
odds.ratio <- odds/odds[2]
results <- rbind(risk, risk.ratio, odds, odds.ratio)
results.rounded <- round(results, 2)

Interpretation: The risk of death among non-smokers is higher than the risk of death among smokers, suggesting that there may be some confounding.

Exercise 2.6

wdat.oas <- xtabs(~outcome + age4 + smoker, data = wdat) 
wdat.tot.oas <- apply(wdat.oas, c(2, 3), sum)
wdat.risk.oas <- sweep(wdat.oas, c(2, 3), wdat.tot.oas, "/")
wdat.risk.oas2 <- round(wdat.risk.oas, 2)

Interpretation: The risk of death is not larger in non-smokers, in fact it is larger among smokers in older age groups.

Chapter 3

Exercise 3.1

Approach 1: do the arithmetic using R as a calculator with number counts from Table 3.14.

pR1.D0 <- (234+55)/(270+80)
pR1.D1 <- (81+192)/(87+263)
pR1.D0Gm <- 234/270
pR1.D1Gm <- 81/87
pR1.D0Gw <- 55/80
pR1.D1Gw <- 192/263

Approach 2: do the arithmetic using R objects.

jp2 <- 
read.csv('https://raw.githubusercontent.com/taragonmd/data/master/drugrx-pearl2.csv')
str(jp2)
tab2.rdg <- xtabs(~ Recovered + Drug + Gender, data = jp2)
tab2.rd <- apply(tab2.rdg, c(1,2), sum)
pR1.D0 <- tab2.rd['Yes','No']/sum(tab2.rd[,'No'])
pR1.D1 <- tab2.rd['Yes','Yes']/sum(tab2.rd[,'Yes'])
pR1.D0Gm <- tab2.rdg['Yes','No','Men']/sum(tab2.rdg[,'No','Men'])
pR1.D1Gm <- tab2.rdg['Yes','Yes','Men']/sum(tab2.rdg[,'Yes','Men'])
pR1.D0Gw <- tab2.rdg['Yes','No','Women']/sum(tab2.rdg[,'No','Women'])
pR1.D1Gw <- tab2.rdg['Yes','Yes','Women']/sum(tab2.rdg[,'Yes','Women'])

Approach 3: do the arithmetic using apply and sweep functions. Study and try this solution.

sweep(tab2.rd, 2, apply(tab2.rd,2,sum), "/")
sweep(tab2.rdg, 2:3, apply(tab2.rdg, 2:3,sum), "/")

Approach 4: do the arithmetic using prop.table functions. Study and try this solution.

prop.table(tab2.rd, 2)
prop.table(tab2.rdg, 2:3)

Exercise 3.2

Evaluate the structure of data frame.

std89c <- 
read.csv("https://raw.githubusercontent.com/taragonmd/data/master/syphilis89c.csv")
str(std89c)
head(std89c)
lapply(std89c, table)

The Age variable levels are not ordered correctly. This is because R, by default, converts character vectors into factors with the levels in “alphabetical” order. One option is to set as.is=TRUE when using read.csv, and set the factor levels after reading in the data.

std89c<-
read.csv("https://raw.githubusercontent.com/taragonmd/data/master/syphilis89c.csv",
as.is = c(FALSE, FALSE, TRUE))
str(std89c)
table(std89c$Age)
agelab<-c("<=14","15-19","20-24","25-29","30-34","35-44","45-54",">55")
std89c$Age <- factor(std89c$Age, levels = agelab)
table(std89c$Age)

Create 3-dimensional arrays using both the table and xtabs function, with and without attaching the data frame.

## Without attach
table(std89c$Race, std89c$Age, std89c$Sex)

#> , ,  = Female
#> 
#>        
#>         <=14  >55 15-19 20-24 25-29 30-34 35-44 45-54
#>   Black  165   92  2257  4503  3590  2628  1505   392
#>   Other   11   15   158   307   283   167   149    40
#>   White   14   24   253   475   433   316   243    55
#> 
#> , ,  = Male
#> 
#>        
#>         <=14  >55 15-19 20-24 25-29 30-34 35-44 45-54
#>   Black   31  823  1412  4059  4121  4453  3858  1619
#>   Other    7  108   210   654   633   520   492   202
#>   White    2  216    88   407   550   564   654   323

xtabs(~ Race + Age + Sex, data = std89c)

#> , , Sex = Female
#> 
#>        Age
#> Race    <=14  >55 15-19 20-24 25-29 30-34 35-44 45-54
#>   Black  165   92  2257  4503  3590  2628  1505   392
#>   Other   11   15   158   307   283   167   149    40
#>   White   14   24   253   475   433   316   243    55
#> 
#> , , Sex = Male
#> 
#>        Age
#> Race    <=14  >55 15-19 20-24 25-29 30-34 35-44 45-54
#>   Black   31  823  1412  4059  4121  4453  3858  1619
#>   Other    7  108   210   654   633   520   492   202
#>   White    2  216    88   407   550   564   654   323

## With attach and detach
attach(std89c)
table(Race, Age, Sex)

#> , , Sex = Female
#> 
#>        Age
#> Race    <=14  >55 15-19 20-24 25-29 30-34 35-44 45-54
#>   Black  165   92  2257  4503  3590  2628  1505   392
#>   Other   11   15   158   307   283   167   149    40
#>   White   14   24   253   475   433   316   243    55
#> 
#> , , Sex = Male
#> 
#>        Age
#> Race    <=14  >55 15-19 20-24 25-29 30-34 35-44 45-54
#>   Black   31  823  1412  4059  4121  4453  3858  1619
#>   Other    7  108   210   654   633   520   492   202
#>   White    2  216    88   407   550   564   654   323

xtabs(~ Race + Age + Sex)

#> , , Sex = Female
#> 
#>        Age
#> Race    <=14  >55 15-19 20-24 25-29 30-34 35-44 45-54
#>   Black  165   92  2257  4503  3590  2628  1505   392
#>   Other   11   15   158   307   283   167   149    40
#>   White   14   24   253   475   433   316   243    55
#> 
#> , , Sex = Male
#> 
#>        Age
#> Race    <=14  >55 15-19 20-24 25-29 30-34 35-44 45-54
#>   Black   31  823  1412  4059  4121  4453  3858  1619
#>   Other    7  108   210   654   633   520   492   202
#>   White    2  216    88   407   550   564   654   323

detach(std89c)

Exercise 3.3

Here we use the apply function to get marginal totals for the syphilis 3-dimensional array.

tab.ars <- xtabs(~ Age + Race + Sex, data = std89c)
#### 2-D tables
tab.ar <- apply(tab.ars, c(1, 2), sum); tab.ar
tab.as <- apply(tab.ars, c(1, 3), sum); tab.as
tab.rs <- apply(tab.ars, c(2, 3), sum); tab.rs

#### 1-D tables
tab.a <- apply(tab.ars, 1, sum); tab.a
tab.r <- apply(tab.ars, 2, sum); tab.r
tab.s <- apply(tab.ars, 3, sum); tab.s

Exercise 3.4

For this syphilis data example, we’ll choose one 3-D array.

tab.ars <- xtabs(~ Age + Race + Sex, data = std89c)
rowt <- apply(tab.ars, c(1, 3), sum) # row distrib
rowd <- sweep(tab.ars, c(1, 3), rowt, "/"); rowd
apply(rowd, c(1, 3), sum) # confirm
colt <- apply(tab.ars, c(2, 3), sum) #  col distrib
cold <- sweep(tab.ars, c(2, 3), colt, "/"); cold
apply(cold, c(2, 3), sum) # confirm
jtt <- apply(tab.ars, 3, sum) # joint distrib
jtd <- sweep(tab.ars, 3, jtt, "/"); jtd
apply(jtd, 3, sum) # confirm
distr <- list(rowd, cold, jtd); distr

Exercise 3.5

table(std89c)
data.frame(table(std89c))

Exercise 3.6

Use the rep function on the data frame rows to recreate the individual-level data frame with over 40,000 observations.

It’s a good idea to understand how the rep function works with two vectors:

rep(c(1,2,3), c(4,5,6))

#>  [1] 1 1 1 1 2 2 2 2 2 3 3 3 3 3 3

We can see that the second vector determines the frequency of the first vector elements. Now use this understanding with the syphilis data.

std89b <- 
read.csv("https://raw.githubusercontent.com/taragonmd/data/master/syphilis89b.csv",
as.is = c(FALSE, FALSE, TRUE, FALSE))
str(std89b)
std89b
expand.rows <- rep(1:nrow(std89b),std89b$Freq)
std89b.df <- std89b[expand.rows,]
agelab<-c("<=14","15-19","20-24","25-29","30-34","35-44","45-54",">55")
std89b.df$Age <- factor(std89b.df$Age, levels = agelab)
str(std89b.df)
table(std89b.df$Age)

Exercise 3.7

Chapter 5

Exercise 5.1

replicates <- 5000
num_of_events <- 289 # recovered
num_at_risk <- 350
risk <- num_of_events/num_at_risk
sim_events <- rbinom(n = replicates, size = num_at_risk, prob = risk)
conf.level <- 0.95
alpha <- 1 - conf.level
## 0.95 CI for binomial counts
quantile(sim_events, c(alpha/2, 1-alpha/2))
## 0.95 CI for binomial proportions
quantile(sim_events, c(alpha/2, 1-alpha/2))/num_at_risk

Exercise 5.2

risk.boot <- function(x, n, conf.level = 0.95, replicates = 5000){
    ## x = number of events (numerator)
    ## n = number at risk (denominator)
    ## conf.level = confidence level (default 0.95)
    ## replicates = number of simulations
    ## prepare inputs
    num_of_events <- x
    num_at_risk <- n
    ## do calculations
    risk <- num_of_events/num_at_risk
    sim_events <- rbinom(n = replicates, size = num_at_risk, prob = risk)
    conf.level <- conf.level
    alpha <- 1 - conf.level
    ci <- quantile(sim_events, c(alpha/2, 1-alpha/2))
    ## collect results
    list(x = num_of_events,
         n = num_at_risk,
         conf.int = ci,
         conf.level = conf.level)
}

## test
risk.boot(10, 20)

Exercise 5.3

myRR <- function(x, conf.level = 0.95){
    ## x is 2x2 table with this layout
    ## |Outcome| Exposed   |
    ## |       | Yes | No  | Sum |
    ## |-------+-----+-----+-----|
    ## |   Yes |  a  |  b  | M_1 |
    ## |    No |  c  |  d  | M_0 |
    ## |-------+-----+-----|-----|
    ## |   Sum | N_1 | N_0 |  T  |
    ##
    ## Prepare input
    tab <- addmargins(x)
    aa <- tab[1, 1]
    bb <- tab[1, 2]
    N1 <- tab['Sum',1]
    N0 <- tab['Sum',2]
    
    ## Do calculations
    R1 <- aa/N1
    R0 <- bb/N0
    RR <- R1/R0
    Z <- qnorm((1 + conf.level)/2)
    logRR <- log(RR)
    SE_logRR <- sqrt(1/aa - 1/N1 + 1/bb - 1/N0)
    CI <- exp(logRR + c(-1, 1) * Z * SE_logRR)
    pv <- fisher.test(tab[1:2,1:2])$p.value

    ## Collect results
    list(data = tab,
         risks = prop.table(x, 2),
         risk.ratio = RR,
         conf.level = conf.level,
         conf.int = CI,
         p.value = pv)
}

## test function
tab <- matrix(c(273, 289, 77, 61), 2, 2, byrow = TRUE,
              dimnames = list (Recovered = c('Yes', 'No'),
                               Treatment = c('Yes', 'No')))
myRR(tab)

Exercise 5.4

url <- "https://raw.githubusercontent.com/taragonmd/data/master/exported/jp-drugrx-p2.txt"
drugrx <- read.csv(url)
str(drugrx)
########## ALL ##########
tab_all <- xtabs(~ Recovered + Drug, data = drugrx)[2:1,2:1]
myRR(tab_all)
########## MEN ##########
tab_men <- xtabs(~ Recovered + Drug + Gender, data = drugrx)[2:1,2:1,'Men']
myRR(tab_men)
########## WOMEN ##########
tab_women <- xtabs(~ Recovered + Drug + Gender, data = drugrx)[2:1,2:1,'Women']
myRR(tab_women)

Exercise 5.5

Exercise 5.6

Exercise 5.7

Exercise 5.8

Exercise 5.9