3.3 Collapse a categorical variable into fewer levels

Suppose you decide that for your age group variable you would like just 2 levels instead of 5, and that you want the two levels to represent ages 40-59y and 60-90y. The code below collapses the 5 AgeGp categories into two. In this example, we will give the new variable a new name, so we do not need to make a copy of the original variable.

# Initialize a new variable in the data.frame.
# Rows that match neither assignment below keep this NA
# (%in% never returns NA, so a missing AgeGp is simply not matched).
mydat$AgeGp2 <- NA

# Set the new levels: 1 = the two youngest groups, 2 = the three oldest
mydat$AgeGp2[mydat$AgeGp %in% c("40-49y", "50-59y")          ] <- 1
mydat$AgeGp2[mydat$AgeGp %in% c("60-69y", "70-79y", "80-90y")] <- 2

# Convert to a factor, attaching a label to each numeric code
mydat$AgeGp2 <- factor(mydat$AgeGp2,
                       levels = 1:2,
                       labels = c("40-59y", "60-90y"))

# Compare before and after
# (exclude = NULL keeps NA as a category so unassigned rows would be visible)
table(mydat$AgeGp, mydat$AgeGp2, exclude = NULL)
##          40-59y 60-90y
##   40-49y     64      0
##   50-59y    203      0
##   60-69y      0    183
##   70-79y      0     40
##   80-90y      0     40
# Examine the new levels
levels(mydat$AgeGp2)
## [1] "40-59y" "60-90y"

The tidyverse version of this task is as follows. If the variable is already a factor, you can use fct_collapse(). Otherwise, use case_when() and then convert to a factor if desired.

# If the old variable is already a factor

# List the levels
table(mydat_tibble$AgeGp, exclude = NULL)

# Each new level name is mapped to the vector of old levels it absorbs
mydat_tibble <- mydat_tibble %>%
  mutate(AgeGp2 = fct_collapse(AgeGp,
                               "40-59y" = c("40-49y", "50-59y"),
                               "60-90y" = c("60-69y", "70-79y", "80-90y")))

# Compare before and after
mydat_tibble %>% 
  count(AgeGp, AgeGp2)

# Examine the new levels
levels(mydat_tibble$AgeGp2)

# If the old variable is not a factor...

# List the levels
table(mydat_tibble$AgeGp_orig, exclude = NULL)

# Recode the numeric codes 1-5 into 1-2, then convert to a labeled factor.
# The TRUE fallback passes any other value (including NA) through unchanged;
# such values are not in levels = 1:2, so factor() turns them into NA.
mydat_tibble <- mydat_tibble %>%
  mutate(AgeGp2 = case_when(AgeGp_orig %in% 1:2 ~ 1,
                            AgeGp_orig %in% 3:5 ~ 2,
                            TRUE                ~ AgeGp_orig)) %>% 
  mutate(AgeGp2 = factor(AgeGp2,
                         levels = 1:2,
                         labels = c("40-59y", "60-90y")))

# Compare before and after
mydat_tibble %>% 
  count(AgeGp_orig, AgeGp2)

# Examine the new levels
levels(mydat_tibble$AgeGp2)