# 第 18 章 因子型变量

## 18.1 什么是因子

• 存储类别的数据类型
• 离散变量
• 因子的层级是有限的，只能取因子层级中的值或缺失(NA)

## 18.2 创建因子

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'readr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.2
## Warning: package 'lubridate' was built under R version 4.2.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(palmerpenguins)
# Seven income observations drawn from three categories.
income <- c(
  "low", "high", "medium", "medium",
  "low", "high", "high"
)

# Without `levels`, the levels come out in sorted order (high, low, medium).
factor(income)
## [1] low    high   medium medium low    high   high
## Levels: high low medium

# An explicit `levels` vector fixes the level order.
factor(income, levels = c("low", "high", "medium"))
## [1] low    high   medium medium low    high   high
## Levels: low high medium

# Values that are not listed in `levels` ("medium" here) become NA.
factor(income, levels = c("low", "high"))
## [1] low  high <NA> <NA> low  high high
## Levels: low high

library(forcats)

## 18.3 调整因子顺序

# Start from a factor that carries the default level order.
x <- factor(income)
x
## [1] low    high   medium medium low    high   high
## Levels: high low medium

# Spell out the complete level order by hand.
x %>%
  fct_relevel(c("high", "medium", "low"))
## [1] low    high   medium medium low    high   high
## Levels: high medium low

# Naming a single level moves it to the front.
x %>%
  fct_relevel("medium")
## [1] low    high   medium medium low    high   high
## Levels: medium high low

# `after = Inf` sends the named level to the end instead.
x %>%
  fct_relevel("medium", after = Inf)
## [1] low    high   medium medium low    high   high
## Levels: high low medium

# Levels follow the order in which values first appear in the data.
x %>%
  fct_inorder()
## [1] low    high   medium medium low    high   high
## Levels: low high medium

# Levels are ordered by the median of the paired values 1..7.
x %>%
  fct_reorder(1:7, .fun = median)
## [1] low    high   medium medium low    high   high
## Levels: low medium high

## 18.4 应用

# Toy data: three groups (a, b, c) with two y values each.
d <- tibble(
  x = c("a", "a", "b", "b", "c", "c"),
  y = c(2, 2, 1, 5, 0, 3)
)
d
## # A tibble: 6 × 2
##   x         y
##   <chr> <dbl>
## 1 a         2
## 2 a         2
## 3 b         1
## 4 b         5
## 5 c         0
## 6 c         3

# Baseline scatter plot: x is still a plain character column here,
# so no custom ordering has been applied yet.
d %>%
  ggplot(aes(x = x, y = y)) +
  geom_point()

### 18.4.1 fct_reorder()

fct_reorder()可以让x的各个类别按照其对应y值的中位数升序排序，具体为

• a对应的y值c(2, 2) 中位数是median(c(2, 2)) = 2
• b对应的y值c(1, 5) 中位数是median(c(1, 5)) = 3
• c对应的y值c(0, 3) 中位数是median(c(0, 3)) = 1.5
• 因此按中位数升序排列后，x的顺序为 c、a、b

# Order x by the median of y within each group, reordering inside aes().
d %>%
  ggplot(aes(x = fct_reorder(x, y, .fun = median), y = y)) +
  geom_point()

# Same summary, but with the order reversed via `.desc = TRUE`.
d %>%
  ggplot(aes(x = fct_reorder(x, y, .fun = median, .desc = TRUE), y = y)) +
  geom_point()

# Reorder in a mutate() step first, then plot the already-ordered column.
d %>%
  mutate(x = fct_reorder(x, y, .fun = median, .desc = TRUE)) %>%
  ggplot(aes(x = x, y = y)) +
  geom_point()

# Use the group minimum instead of the median as the ordering summary.
d %>%
  mutate(x = fct_reorder(x, y, .fun = min, .desc = TRUE)) %>%
  ggplot(aes(x = x, y = y)) +
  geom_point()

### 18.4.2 fct_rev()

# Reverse the level order of x before plotting.
d %>%
  mutate(x = fct_rev(x)) %>%
  ggplot(aes(x = x, y = y)) +
  geom_point()

### 18.4.3 fct_relevel()

# Set the level order of x by hand (c, a, b) before plotting.
d %>%
  mutate(x = fct_relevel(x, c("c", "a", "b"))) %>%
  ggplot(aes(x = x, y = y)) +
  geom_point()

## 18.5 可视化中应用

# Bar chart of the number of penguins per species.
ggplot(penguins, aes(y = species)) +
  geom_bar()

# Same chart with the species level order reversed.
ggplot(penguins, aes(y = fct_rev(species))) +
  geom_bar()

# Inspect the species column (and its levels) after counting.
penguins %>%
  count(species) %>%
  pull(species)

# Same inspection after setting the level order by hand.
penguins %>%
  count(species) %>%
  mutate(species = fct_relevel(species, "Chinstrap", "Gentoo", "Adelie")) %>%
  pull(species)
# Move "Chinstrap" to the front; the other levels keep their relative order
ggplot(penguins, aes(y = fct_relevel(species, "Chinstrap"))) +
  geom_bar()

# Use the explicit order "Chinstrap", "Gentoo", "Adelie"
ggplot(
  penguins,
  aes(y = fct_relevel(species, "Chinstrap", "Gentoo", "Adelie"))
) +
  geom_bar()

# Same order, applied in a mutate() step before plotting
penguins %>%
  mutate(species = fct_relevel(species, "Chinstrap", "Gentoo", "Adelie")) %>%
  ggplot(aes(y = species)) +
  geom_bar()

# Move "Adelie" to the end with `after = Inf`
ggplot(penguins, aes(y = fct_relevel(species, "Adelie", after = Inf))) +
  geom_bar()
# Order the levels by how many penguins each species has:
# fct_infreq() puts the most frequent level first, the least frequent last.
penguins %>%
  mutate(species = fct_infreq(species)) %>%
  ggplot(aes(y = species)) +
  geom_bar()

# Reverse that frequency-based order.
penguins %>%
  mutate(species = fct_rev(fct_infreq(species))) %>%
  ggplot(aes(y = species)) +
  geom_bar()

# Reorder based on a numeric column: the per-species count `n`.
penguins %>%
  count(species) %>%
  mutate(species = fct_reorder(species, n)) %>%
  ggplot(aes(n, species)) +
  geom_col()

## 18.6 作业

• 画出2007年美洲各国人口寿命的柱状图，要求从高到低排序
library(gapminder)
## Warning: package 'gapminder' was built under R version 4.2.2
# Keep only the 2007 records for countries in the Americas.
gapminder %>%
  filter(
    year == 2007,
    continent == "Americas"
  )
## # A tibble: 25 × 6
##    country            continent  year lifeExp       pop gdpPercap
##    <fct>              <fct>     <int>   <dbl>     <int>     <dbl>
##  1 Argentina          Americas   2007    75.3  40301927    12779.
##  2 Bolivia            Americas   2007    65.6   9119152     3822.
##  3 Brazil             Americas   2007    72.4 190010647     9066.
##  4 Canada             Americas   2007    80.7  33390141    36319.
##  5 Chile              Americas   2007    78.6  16284741    13172.
##  6 Colombia           Americas   2007    72.9  44227550     7007.
##  7 Costa Rica         Americas   2007    78.8   4133884     9645.
##  8 Cuba               Americas   2007    78.3  11416987     8948.
##  9 Dominican Republic Americas   2007    72.2   9319622     6025.
## 10 Ecuador            Americas   2007    75.0  13755680     6873.
## # ℹ 15 more rows
• 这是四个国家人口寿命的变化图
# Life expectancy over time for four countries, one facet per country.
gapminder %>%
  filter(country %in% c("Norway", "Portugal", "Spain", "Austria")) %>%
  ggplot(aes(year, lifeExp)) +
  geom_line() +
  facet_wrap(vars(country), nrow = 1)
• 要求给四个分面排序，按每个国家寿命的中位数

• 要求给四个分面排序，按每个国家寿命差（最大值减去最小值）