5.3 Changing the number of levels

5.3.1 Lumping levels

To demonstrate how to lump multiple levels of a factor, we will start with fct_count(), which counts factor levels. It is essentially a variant of dplyr::count() that takes a factor as its first argument instead of a data frame, which makes it a nice function in mutate().

starwars$skin_color %>%
  fct_count(
    sort = TRUE, # order rows from the most to the least frequent level
    prop = TRUE  # add column p: each level's share of all observations
  )
#> # A tibble: 31 x 3
#>   f         n      p
#>   <fct> <int>  <dbl>
#> 1 fair     17 0.195 
#> 2 light    11 0.126 
#> 3 dark      6 0.0690
#> 4 green     6 0.0690
#> 5 grey      6 0.0690
#> 6 pale      5 0.0575
#> # ... with 25 more rows

skin_color has 31 levels overall, and the top 5 to 6 levels account for more than 50% of the occurrences. In fact, there are 24 levels whose frequency is less than 3%.

# Keep only the levels whose proportion p is below 3%
starwars$skin_color %>%
  fct_count(prop = TRUE) %>%
  filter(p < 0.03)
#> # A tibble: 24 x 3
#>   f                       n      p
#>   <fct>               <int>  <dbl>
#> 1 blue                    2 0.0230
#> 2 blue, grey              2 0.0230
#> 3 brown mottle            1 0.0115
#> 4 brown, white            1 0.0115
#> 5 fair, green, yellow     1 0.0115
#> 6 gold                    1 0.0115
#> # ... with 18 more rows

In this case, we may want to collapse some of the less frequent levels into one, say, a level called “Other”.

forcats provides a family of functions that lump together the factor levels meeting some criterion into a new level, named “Other” by default.

fct_lump_min(): lumps levels that appear fewer than min times
fct_lump_prop(): lumps levels that appear fewer than (or equal to) prop * n times
fct_lump_n(): lumps all levels except for the n most frequent (or least frequent if n < 0)
fct_lump_lowfreq(): lumps together the least frequent levels, ensuring that “Other” is still the smallest level.

# Collapse every level that appears fewer than 5 times into "Other",
# then tally the resulting levels from most to least frequent
starwars %>%
  count(
    skin_color = fct_lump_min(skin_color, min = 5),
    sort = TRUE
  )
#> # A tibble: 7 x 2
#>   skin_color     n
#>   <fct>      <int>
#> 1 Other         36
#> 2 fair          17
#> 3 light         11
#> 4 dark           6
#> 5 green          6
#> 6 grey           6
#> # ... with 1 more row

# Keep the 5 most common levels; all remaining levels become "Other"
count(
  starwars,
  skin_color = fct_lump_n(skin_color, n = 5),
  sort = TRUE
)
#> # A tibble: 6 x 2
#>   skin_color     n
#>   <fct>      <int>
#> 1 Other         41
#> 2 fair          17
#> 3 light         11
#> 4 dark           6
#> 5 green          6
#> 6 grey           6

# Keep the 5 least common levels (negative n); all more common levels are
# lumped into "Other". fct_lump_n() is the dedicated function for lumping by
# rank; fct_lump() is the older catch-all that dispatches on its arguments.
starwars %>%
  mutate(skin_color = fct_lump_n(skin_color, n = -5)) %>%
  count(skin_color, sort = TRUE)
#> # A tibble: 17 x 2
#>   skin_color              n
#>   <fct>               <int>
#> 1 Other                  71
#> 2 brown mottle            1
#> 3 brown, white            1
#> 4 fair, green, yellow     1
#> 5 gold                    1
#> 6 green-tan, brown        1
#> # ... with 11 more rows

Similarly, positive prop preserves values that appear at least prop of the time. Negative prop preserves values that appear at most -prop of the time.

Use the other_level argument to change the default name “Other”:

# Lump by proportion (prop = 0.1) and name the lumped level "extra"
# instead of using the default name
starwars %>%
  count(
    skin_color = fct_lump_prop(skin_color, prop = 0.1, other_level = "extra"),
    sort = TRUE
  )
#> # A tibble: 3 x 2
#>   skin_color     n
#>   <fct>      <int>
#> 1 extra         59
#> 2 fair          17
#> 3 light         11

fct_other(f, keep, drop, other_level) provides a way of manually replacing values with “Other”. Pick one of keep and drop:

keep will preserve listed levels, replacing all others with other_level
drop will replace listed levels with other_level, keeping all others as is.

# A factor with nine levels, A to I, with very uneven counts
x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))

# Keep A and B as they are; every other level is replaced by "I don't care"
fct_count(
  fct_other(x, keep = c("A", "B"), other_level = "I don't care")
)
#> # A tibble: 3 x 2
#>   f                n
#>   <fct>        <int>
#> 1 A               40
#> 2 B               10
#> 3 I don't care    37

5.3.2 Expanding levels

fct_expand() adds additional levels to a factor:

# Draw 20 letters from a, b and c with replacement. No seed is set, so the
# counts printed below change from run to run.
f <- factor(sample(letters[1:3], 20, replace = TRUE))
fct_count(f)
#> # A tibble: 3 x 2
#>   f         n
#>   <fct> <int>
#> 1 a         8
#> 2 b         9
#> 3 c         3

# Add three new levels to f
f <- f %>% fct_expand("d", "e", "f")
levels(f)
#> [1] "a" "b" "c" "d" "e" "f"
# The added levels have no observations, so their count is zero
f %>% fct_count()
#> # A tibble: 6 x 2
#>   f         n
#>   <fct> <int>
#> 1 a         8
#> 2 b         9
#> 3 c         3
#> 4 d         0
#> 5 e         0
#> 6 f         0

fct_cross() combines levels of multiple input factors in a parallel manner:

# Two factors of equal length, combined element by element
fruit <- factor(c("apple", "kiwi", "apple", "apple"))
color <- factor(c("green", "green", "red", "green"))

# The level names are joined with ":" by default
fruit %>% fct_cross(color)
#> [1] apple:green kiwi:green  apple:red   apple:green
#> Levels: apple:green kiwi:green apple:red
# Use a different separator
fruit %>% fct_cross(color, sep = "|")
#> [1] apple|green kiwi|green  apple|red   apple|green
#> Levels: apple|green kiwi|green apple|red

By default, fct_cross() does not regard combinations with no observations as valid levels, so kiwi:red didn’t appear in the output. Use keep_empty = TRUE so that fct_cross() keeps combinations with no observations as levels:

# Also keep combinations that never occur (here kiwi:red) as levels
fruit %>% fct_cross(color, keep_empty = TRUE)
#> [1] apple:green kiwi:green  apple:red   apple:green
#> Levels: apple:green kiwi:green apple:red kiwi:red

5.3.3 Dropping levels

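Sometimes we take a subset of the data, which can leave some factor levels with a frequency of 0 in the subset; R does not drop these zero-frequency levels automatically: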

## In the original data, hair_color has 12 levels in total
starwars$hair_color %>%
  factor() %>%
  nlevels()
#> [1] 12

starwars$hair_color %>% fct_count()
#> # A tibble: 13 x 2
#>   f                 n
#>   <fct>         <int>
#> 1 auburn            1
#> 2 auburn, grey      1
#> 3 auburn, white     1
#> 4 black            13
#> 5 blond             3
#> 6 blonde            1
#> # ... with 7 more rows

## Keep the characters whose mass is between 70 and 135 to obtain a subset,
## then print it
starwars_sub <- starwars %>%
  filter(between(mass, 70, 135))
starwars_sub
#> # A tibble: 34 x 13
#>   name  height  mass hair_color skin_color eye_color birth_year gender homeworld
#>   <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr>  <chr>    
#> 1 Luke~    172    77 blond      fair       blue              19 male   Tatooine 
#> 2 C-3PO    167    75 <NA>       gold       yellow           112 <NA>   Tatooine 
#> 3 Owen~    178   120 brown, gr~ light      blue              52 male   Tatooine 
#> 4 Beru~    165    75 brown      light      blue              47 female Tatooine 
#> 5 Bigg~    183    84 black      light      brown             24 male   Tatooine 
#> 6 Obi-~    182    77 auburn, w~ fair       blue-gray         57 male   Stewjon  
#> # ... with 28 more rows, and 4 more variables: species <chr>, films <list>,
#> #   vehicles <list>, starships <list>

## hair_color now has observations in only 8 levels, but the total number of
## levels has not changed
## NOTE(review): the nlevels() call below is evaluated on the full `starwars`
## data, not on `starwars_sub`, and hair_color is printed as <chr> above, so
## confirm that this demonstrates unused levels surviving the filter.
nlevels(factor(starwars$hair_color))
#> [1] 12
fct_count(starwars_sub$hair_color)
#> # A tibble: 9 x 2
#>   f                 n
#>   <fct>         <int>
#> 1 auburn, white     1
#> 2 black             5
#> 3 blond             2
#> 4 brown             7
#> 5 brown, grey       1
#> 6 grey              1
#> # ... with 3 more rows

## Use fct_drop() to discard the levels whose frequency is 0
nlevels(fct_drop(starwars_sub$hair_color))
#> [1] 8

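You can also pass a vector to the only argument to specify which levels may be dropped; a level is dropped only if its frequency is 0 and it is included in that vector: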

# "c" is declared as a level but has no observations
f <- factor(c("a", "b"), levels = c("a", "b", "c"))
f %>% fct_drop()
#> [1] a b
#> Levels: a b

# Set `only` to restrict which levels may be dropped.
# "a" has observations, so it is not dropped; "c" has none, but it is not
# listed in `only`, so it is not dropped either.
f %>% fct_drop(only = "a")
#> [1] a b
#> Levels: a b c
f %>% fct_drop(only = "c")
#> [1] a b
#> Levels: a b

5.3.4 Transforming NA levels

When a factor has missing values, these NAs are not listed as a valid level. In some cases, however, NA in a factor is meaningful. If so, we can use fct_explicit_na() to turn the missing values into an explicit level.

# A factor in which three of the eleven values are missing
f <- factor(
  c("a", "a", NA, NA, "a", "b", NA, "c", "a", "c", "b")
)

# NA is not counted among the levels
levels(f)
#> [1] "a" "b" "c"
nlevels(f)
#> [1] 3

fct_explicit_na() gives an explicit factor level, na_level, to the NAs:

# With no na_level given, the missing values are labelled "(Missing)"
# NOTE(review): newer forcats releases deprecate fct_explicit_na() in favour
# of fct_na_value_to_level() - confirm against the installed version.
f %>% fct_explicit_na()
#>  [1] a         a         (Missing) (Missing) a         b         (Missing)
#>  [8] c         a         c         b        
#> Levels: a b c (Missing)

# Choose the label of the new level with na_level
f %>% fct_explicit_na(na_level = "Unknown")
#>  [1] a       a       Unknown Unknown a       b       Unknown c       a      
#> [10] c       b      
#> Levels: a b c Unknown