25.2 Krippendorff’s Alpha
Krippendorff's alpha quantifies agreement among raters by relating the observed disagreement to the disagreement expected by chance:
\[ \alpha = 1 - \frac{D_o}{D_e} = 1 - \frac{\text{observed disagreement}}{\text{expected disagreement}} \]
Rules of thumb (Shelley and Krippendorff 1984):
\(\alpha \ge 0.8\) = good reliability
\(\alpha \ge 0.667\) = acceptable lower limit
library(irr) # kripp.alpha() and icc() come from the irr package

# ratings of 12 subjects by 4 raters; each group of four values is one
# subject (the matrix is filled column-wise), NA marks a missing rating
nmm <-
  matrix(
    c(
      1, 1, NA, 1,
      2, 2, 3, 2,
      3, 3, 3, 3,
      3, 3, 3, 3,
      2, 2, 2, 2,
      1, 2, 3, 4,
      4, 4, 4, 4,
      1, 1, 2, 1,
      2, 2, 2, 2,
      NA, 5, 5, 5,
      NA, NA, 1, 1,
      NA, NA, 3, NA
    ),
    nrow = 4
  )
# first assume the default nominal classification
kripp.alpha(nmm)
## Krippendorff's alpha
##
## Subjects = 12
## Raters = 4
## alpha = 0.743
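# the remaining results were presumably produced with the other metrics;
# next, the ordinal metric
kripp.alpha(nmm, method = "ordinal")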
## Krippendorff's alpha
##
## Subjects = 12
## Raters = 4
## alpha = 0.815
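# interval metric
kripp.alpha(nmm, method = "interval")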
## Krippendorff's alpha
##
## Subjects = 12
## Raters = 4
## alpha = 0.849
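# ratio metric
kripp.alpha(nmm, method = "ratio")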
## Krippendorff's alpha
##
## Subjects = 12
## Raters = 4
## alpha = 0.797
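The returned object keeps the coefficient itself, so the rule-of-thumb cutoffs above can be checked programmatically. A minimal sketch, assuming the list returned by kripp.alpha() stores the coefficient in its value element:
ka <- kripp.alpha(nmm, method = "nominal")
ka$value >= 0.8   # good reliability?
ka$value >= 0.667 # at or above the acceptable lower limit?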
25.2.1 Kendall’s W
Kendall's W measures concordance among raters for ordinal (rank-order) ratings.
rtr1 <- c(1, 6, 3, 2, 5, 4)
rtr2 <- c(1, 5, 6, 2, 4, 3)
rtr3 <- c(2, 3, 6, 5, 4, 1)
ratings <- cbind(rtr1, rtr2, rtr3) # 6 subjects (rows) x 3 raters (columns)
library(DescTools)
KendallW(ratings, test = TRUE)
##
## Kendall's coefficient of concordance W
##
## data: ratings
## Kendall chi-squared = 8.5238, df = 5, subjects = 6, raters = 3, p-value
## = 0.1296
## alternative hypothesis: W is greater 0
## sample estimates:
## W
## 0.568254
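As a cross-check, W can be reproduced from its definition for untied rankings,
\[ W = \frac{12S}{m^2(n^3 - n)} \]
where \(S\) is the sum of squared deviations of the subjects' rank sums from their mean, \(m\) the number of raters and \(n\) the number of subjects. A minimal sketch using the ratings above (no ties, so no tie correction is needed):
m <- ncol(ratings) # raters
n <- nrow(ratings) # subjects
rank.sums <- rowSums(ratings) # each rater's column already contains ranks 1..6
S <- sum((rank.sums - mean(rank.sums))^2)
W <- 12 * S / (m^2 * (n^3 - n))
W               # 0.568254, matching KendallW()
m * (n - 1) * W # 8.5238, the chi-squared statistic reported above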
25.2.2 Intraclass correlation coefficients
25.2.2.1 Continuous scales
Decisions when calling icc():
model:
  "oneway" when subjects are random effects
  "twoway" when subjects and raters are random effects
type:
  "agreement" when differences in mean ratings among raters are of interest
  "consistency" (the default) otherwise
unit:
  "single" for a single rating
  "average" for the mean of several ratings
data(anxiety) # example data from the irr package: 20 subjects, 3 raters
icc(anxiety,
model = "twoway", # can be "oneway"
type = "agreement", # can be "consistency"
unit = "single" # can be "average"
)
## Single Score Intraclass Correlation
##
## Model: twoway
## Type : agreement
##
## Subjects = 20
## Raters = 3
## ICC(A,1) = 0.198
##
## F-Test, H0: r0 = 0 ; H1: r0 > 0
## F(19,39.7) = 1.83 , p = 0.0543
##
## 95%-Confidence Interval for ICC Population Values:
## -0.039 < ICC < 0.494
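If the reported measurement will be the mean of the three raters' scores rather than a single rating, the same call with unit = "average" gives the agreement ICC for averaged ratings; a sketch (output not shown):
icc(anxiety,
    model = "twoway",
    type = "agreement",
    unit = "average" # reliability of the mean of the 3 ratings
)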
The ICC() function in the DescTools package reports all six Shrout and Fleiss (1979) variants at once:
rtr1 <- c(9, 6, 8, 7, 10, 6)
rtr2 <- c(2, 1, 4, 1, 5, 2)
rtr3 <- c(5, 3, 6, 2, 6, 4)
rtr4 <- c(8, 2, 8, 6, 9, 7)
ratings <- cbind(rtr1, rtr2, rtr3, rtr4) # 6 subjects (rows) x 4 raters (columns)
DescTools::ICC(ratings)
##
## Intraclass correlation coefficients
## type est F-val df1 df2 p-val lwr.ci upr.ci
## Single_raters_absolute ICC1 0.166 1.79 5 18 0.164769 NA NA
## Single_random_raters ICC2 0.290 11.03 5 15 0.000135 NA NA
## Single_fixed_raters ICC3 0.715 11.03 5 15 0.000135 NA NA
## Average_raters_absolute ICC1k 0.443 1.79 5 18 0.164769 NA NA
## Average_random_raters ICC2k 0.620 11.03 5 15 0.000135 NA NA
## Average_fixed_raters ICC3k 0.909 11.03 5 15 0.000135 NA NA
##
## Number of subjects = 6 Number of raters = 4
References
Shelley, Mack, and Klaus Krippendorff. 1984. “Content Analysis: An Introduction to Its Methodology.” Journal of the American Statistical Association 79 (385): 240. https://doi.org/10.2307/2288384.
Shrout, Patrick E., and Joseph L. Fleiss. 1979. “Intraclass Correlations: Uses in Assessing Rater Reliability.” Psychological Bulletin 86 (2): 420–28. https://doi.org/10.1037/0033-2909.86.2.420.