3.3 Collapse a categorical variable into fewer levels
Suppose you decide that for your age group variable you would like just 2 levels instead of 5, and that you want the two levels to represent ages 40-59y and 60-90y. The code below collapses the 5 AgeGp
categories into two. In this example, we will give the new variable a new name, so we do not need to make a copy of the original variable.
# Recode the five AgeGp levels into a numeric code:
#   1 = 40-59y, 2 = 60-90y, NA = anything not listed below
mydat$AgeGp2 <- ifelse(
  mydat$AgeGp %in% c("40-49y", "50-59y"),
  1,
  ifelse(mydat$AgeGp %in% c("60-69y", "70-79y", "80-90y"), 2, NA)
)
# Turn the numeric code into a labelled factor
mydat$AgeGp2 <- factor(
  mydat$AgeGp2,
  levels = 1:2,
  labels = c("40-59y", "60-90y")
)
# Cross-tabulate old vs. new to confirm every old level landed in the
# intended new level (useNA = "ifany" also reveals any unmatched rows)
table(mydat$AgeGp, mydat$AgeGp2, useNA = "ifany")
##
##          40-59y 60-90y
##   40-49y     64      0
##   50-59y    203      0
##   60-69y      0    183
##   70-79y      0     40
##   80-90y      0     40
## [1] "40-59y" "60-90y"
The tidyverse version of this task is as follows. If the variable is already a factor, you can use `fct_collapse()`. Otherwise, use `case_when()` and then convert to a factor if desired.
# Case 1: the existing variable is already a factor
# Confirm that it is a factor
is.factor(mydat_tibble$AgeGp)
# Tabulate its current levels (including NA, if present)
table(mydat_tibble$AgeGp, useNA = "ifany")
# Merge the five old levels into two new ones; each argument to
# fct_collapse() is written as: new level name = vector of old levels
mydat_tibble <- mutate(
  mydat_tibble,
  AgeGp2 = fct_collapse(
    AgeGp,
    `40-59y` = c("40-49y", "50-59y"),
    `60-90y` = c("60-69y", "70-79y", "80-90y")
  )
)
# Check the mapping: one row per observed (old, new) combination
count(mydat_tibble, AgeGp, AgeGp2)
# Levels of the collapsed factor
levels(mydat_tibble$AgeGp2)
# If the old variable is not a factor...
is.factor(mydat_tibble$AgeGp_orig)
# List the values it takes (including NA, if present)
table(mydat_tibble$AgeGp_orig, useNA = "ifany")
# Collapse the original codes into two groups, then label them.
# NOTE(review): assumes AgeGp_orig codes 1-5 correspond, in order, to the
# five age groups 40-49y ... 80-90y -- confirm against the data dictionary.
mydat_tibble <- mydat_tibble %>%
  mutate(AgeGp2 = case_when(AgeGp_orig %in% 1:2 ~ 1,
                            AgeGp_orig %in% 3:5 ~ 2,
                            # Anything else (including NA) is set to missing
                            # explicitly. Falling back to AgeGp_orig would copy
                            # raw source codes into the new 1/2 coding and mix
                            # RHS types, which older dplyr versions reject.
                            TRUE                ~ NA_real_)) %>%
  mutate(AgeGp2 = factor(AgeGp2,
                         levels = 1:2,
                         labels = c("40-59y", "60-90y")))
# Compare before and after: one row per observed (old, new) combination
mydat_tibble %>%
  count(AgeGp_orig, AgeGp2)
# Examine the new levels
levels(mydat_tibble$AgeGp2)