7.3 Binomial Regression

Binomial

Here, cancer case = successes, and control case = failures.

data("esoph")
head(esoph, n = 3)
##   agegp     alcgp    tobgp ncases ncontrols
## 1 25-34 0-39g/day 0-9g/day      0        40
## 2 25-34 0-39g/day    10-19      0        10
## 3 25-34 0-39g/day    20-29      0         6
plot(
  esoph$ncases / (esoph$ncases + esoph$ncontrols) ~ esoph$alcgp,
  ylab = "Proportion",
  xlab = 'Alcohol consumption',
  main = 'Esophageal Cancer data'
)

class(esoph$agegp) <- "factor"
class(esoph$alcgp) <- "factor"
class(esoph$tobgp) <- "factor"
#  only the alcohol consumption as a predictor
model <- glm(cbind(ncases, ncontrols) ~ alcgp, data = esoph, family = binomial)
summary(model)
## 
## Call:
## glm(formula = cbind(ncases, ncontrols) ~ alcgp, family = binomial, 
##     data = esoph)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6629  -1.0478  -0.0081   0.6307   3.0296  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.6610     0.1921 -13.854  < 2e-16 ***
## alcgp40-79    1.1064     0.2303   4.804 1.56e-06 ***
## alcgp80-119   1.6656     0.2525   6.597 4.20e-11 ***
## alcgp120+     2.2630     0.2721   8.317  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 227.24  on 87  degrees of freedom
## Residual deviance: 138.79  on 84  degrees of freedom
## AIC: 294.27
## 
## Number of Fisher Scoring iterations: 5
#Coefficient Odds
coefficients(model) %>% exp
## (Intercept)  alcgp40-79 alcgp80-119   alcgp120+ 
##  0.06987952  3.02331229  5.28860570  9.61142563
deviance(model)/df.residual(model)
## [1] 1.652253
model$aic
## [1] 294.27
# alcohol consumption and age as predictors
better_model <- glm(cbind(ncases, ncontrols) ~ agegp + alcgp, data = esoph, family = binomial)
summary(better_model)
## 
## Call:
## glm(formula = cbind(ncases, ncontrols) ~ agegp + alcgp, family = binomial, 
##     data = esoph)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8979  -0.5592  -0.1995   0.5029   2.6250  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.6180     1.0217  -5.499 3.82e-08 ***
## agegp35-44    1.5376     1.0646   1.444 0.148669    
## agegp45-54    2.9470     1.0217   2.884 0.003922 ** 
## agegp55-64    3.3116     1.0172   3.255 0.001132 ** 
## agegp65-74    3.5774     1.0209   3.504 0.000458 ***
## agegp75+      3.5858     1.0620   3.377 0.000734 ***
## alcgp40-79    1.1392     0.2367   4.814 1.48e-06 ***
## alcgp80-119   1.4951     0.2600   5.749 8.97e-09 ***
## alcgp120+     2.2228     0.2843   7.820 5.29e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 227.241  on 87  degrees of freedom
## Residual deviance:  64.572  on 79  degrees of freedom
## AIC: 230.05
## 
## Number of Fisher Scoring iterations: 6
better_model$aic #smaller AIC is better
## [1] 230.0526
coefficients(better_model) %>% exp
##  (Intercept)   agegp35-44   agegp45-54   agegp55-64   agegp65-74     agegp75+ 
##  0.003631855  4.653273722 19.047899816 27.428640745 35.780787582 36.082010052 
##   alcgp40-79  alcgp80-119    alcgp120+ 
##  3.124334222  4.459579378  9.233256747
pchisq(
    q = model$deviance - better_model$deviance,
    df = model$df.residual - better_model$df.residual,
    lower = FALSE
)
## [1] 1.354906e-14
# specify link function as probit
Prob_better_model <- glm(
    cbind(ncases, ncontrols) ~ agegp + alcgp,
    data = esoph,
    family = binomial(link = probit)
)
summary(Prob_better_model)
## 
## Call:
## glm(formula = cbind(ncases, ncontrols) ~ agegp + alcgp, family = binomial(link = probit), 
##     data = esoph)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8676  -0.5938  -0.1802   0.4852   2.6056  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.9800     0.4291  -6.945 3.79e-12 ***
## agegp35-44    0.6991     0.4491   1.557 0.119520    
## agegp45-54    1.4212     0.4292   3.311 0.000929 ***
## agegp55-64    1.6512     0.4262   3.874 0.000107 ***
## agegp65-74    1.8039     0.4297   4.198 2.69e-05 ***
## agegp75+      1.8025     0.4613   3.908 9.32e-05 ***
## alcgp40-79    0.6224     0.1247   4.990 6.03e-07 ***
## alcgp80-119   0.8256     0.1418   5.823 5.80e-09 ***
## alcgp120+     1.2839     0.1596   8.043 8.77e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 227.241  on 87  degrees of freedom
## Residual deviance:  61.938  on 79  degrees of freedom
## AIC: 227.42
## 
## Number of Fisher Scoring iterations: 6