20.2 Stratified Sampling

A stratum is a subset of the population that has at least one common characteristic.

Steps:

  1. Identify relevant stratums and their representation in the population.
  2. Randomly sample to select a sufficient number of subjects from each stratum.

Stratified sampling reduces sampling error.

library(dplyr)
# by number of rows
sample_iris <- iris %>%
    group_by(Species) %>%
    sample_n(5)
sample_iris
#> # A tibble: 15 × 5
#> # Groups:   Species [3]
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
#>           <dbl>       <dbl>        <dbl>       <dbl> <fct>     
#>  1          4.4         3            1.3         0.2 setosa    
#>  2          5.2         3.5          1.5         0.2 setosa    
#>  3          5.1         3.8          1.5         0.3 setosa    
#>  4          5.2         3.4          1.4         0.2 setosa    
#>  5          4.5         2.3          1.3         0.3 setosa    
#>  6          5.5         2.5          4           1.3 versicolor
#>  7          7           3.2          4.7         1.4 versicolor
#>  8          6.7         3            5           1.7 versicolor
#>  9          6.1         2.9          4.7         1.4 versicolor
#> 10          5.5         2.4          3.8         1.1 versicolor
#> 11          6.4         2.7          5.3         1.9 virginica 
#> 12          6.4         2.8          5.6         2.1 virginica 
#> 13          6.4         3.2          5.3         2.3 virginica 
#> 14          6.8         3.2          5.9         2.3 virginica 
#> 15          7.2         3.6          6.1         2.5 virginica

# by fraction
sample_iris <- iris %>%
    group_by(Species) %>%
    sample_frac(size = .15)
sample_iris
#> # A tibble: 24 × 5
#> # Groups:   Species [3]
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
#>           <dbl>       <dbl>        <dbl>       <dbl> <fct>     
#>  1          5.5         4.2          1.4         0.2 setosa    
#>  2          5           3            1.6         0.2 setosa    
#>  3          5.2         4.1          1.5         0.1 setosa    
#>  4          4.6         3.1          1.5         0.2 setosa    
#>  5          5.1         3.7          1.5         0.4 setosa    
#>  6          4.8         3.4          1.9         0.2 setosa    
#>  7          5.1         3.3          1.7         0.5 setosa    
#>  8          5.5         3.5          1.3         0.2 setosa    
#>  9          5           2.3          3.3         1   versicolor
#> 10          5.6         2.9          3.6         1.3 versicolor
#> # ℹ 14 more rows
library(sampler)
# Stratified sample using proportional allocation without replacement
ssamp(df=albania, n=360, strata=qarku, over=0.1)
#> # A tibble: 395 × 45
#>    qarku  Q_ID bashkia   BAS_ID zaz   njesiaAdministrative COM_ID qvKod zgjedhes
#>    <fct> <int> <fct>      <int> <fct> <fct>                 <int> <fct>    <int>
#>  1 Berat     1 Berat         11 ZAZ … "Berat "               1101 "\"3…      558
#>  2 Berat     1 Berat         11 ZAZ … "Berat "               1101 "\"3…      815
#>  3 Berat     1 Berat         11 ZAZ … "Sinje"                1108 "\"3…      419
#>  4 Berat     1 Kucove        13 ZAZ … "Lumas"                1104 "\"3…      237
#>  5 Berat     1 Kucove        13 ZAZ … "Kucove"               1201 "\"3…      562
#>  6 Berat     1 Skrapar       17 ZAZ … "Corovode"             1303 "\"3…      829
#>  7 Berat     1 Berat         11 ZAZ … "Roshnik"              1107 "\"3…      410
#>  8 Berat     1 Ura Vajg…     19 ZAZ … "Ura Vajgurore"        1110 "\"3…      708
#>  9 Berat     1 Kucove        13 ZAZ … "Perondi"              1203 "\"3…      835
#> 10 Berat     1 Kucove        13 ZAZ … "Kucove"               1201 "\"3…      907
#> # ℹ 385 more rows
#> # ℹ 36 more variables: meshkuj <int>, femra <int>, totalSeats <int>,
#> #   vendndodhja <fct>, ambienti <fct>, totalVoters <int>, femVoters <int>,
#> #   maleVoters <int>, unusedBallots <int>, damagedBallots <int>,
#> #   ballotsCast <int>, invalidVotes <int>, validVotes <int>, lsi <int>,
#> #   ps <int>, pkd <int>, sfida <int>, pr <int>, pd <int>, pbdksh <int>,
#> #   adk <int>, psd <int>, ad <int>, frd <int>, pds <int>, pdiu <int>, …

Identify number of missing points by strata between sample and collected data

alsample <- rsamp(df = albania, 544)
alreceived <- rsamp(df = alsample, 390)
smissing(
    sampdf = alsample,
    colldf = alreceived,
    strata = qarku,
    col_name = qvKod
)