17.2 Wrangling
17.2.1 tidyr
tidyr
provides a handful of tools for converting between implicit (absent rows) and explicit (NA
) missing values, and for handling explicit NA
s.
17.2.1.1 drop_na()
drop rows containing missing values
# df %>% drop_na(Ozone, Solar.R, Wind, Temp, Month, Day)
airquality <- airquality %>% as_tibble()
airquality %>% drop_na()
#> # A tibble: 111 x 6
#> Ozone Solar.R Wind Temp Month Day
#> <int> <int> <dbl> <int> <int> <int>
#> 1 41 190 7.4 67 5 1
#> 2 36 118 8 72 5 2
#> 3 12 149 12.6 74 5 3
#> 4 18 313 11.5 62 5 4
#> 5 23 299 8.6 65 5 7
#> 6 19 99 13.8 59 5 8
#> # ... with 105 more rows
airquality %>% drop_na(Solar.R)
#> # A tibble: 146 x 6
#> Ozone Solar.R Wind Temp Month Day
#> <int> <int> <dbl> <int> <int> <int>
#> 1 41 190 7.4 67 5 1
#> 2 36 118 8 72 5 2
#> 3 12 149 12.6 74 5 3
#> 4 18 313 11.5 62 5 4
#> 5 23 299 8.6 65 5 7
#> 6 19 99 13.8 59 5 8
#> # ... with 140 more rows
17.2.1.2 replace_na()
replace missing values
# NULL are the list-col equivalent of NAs
df <- tibble(x = c(1, 2, NA),
y = c("a", NA, "b"),
z = list(1:5, NULL, 10:20))
df %>% mutate(x = replace_na(x, 0))
#> # A tibble: 3 x 3
#> x y z
#> <dbl> <chr> <list>
#> 1 1 a <int [5]>
#> 2 2 <NA> <NULL>
#> 3 0 b <int [11]>
df %>% replace_na(list(x = 0, y = "unknown"))
#> # A tibble: 3 x 3
#> x y z
#> <dbl> <chr> <list>
#> 1 1 a <int [5]>
#> 2 2 unknown <NULL>
#> 3 0 b <int [11]>
df %>% replace_na(list(z = 5))
#> # A tibble: 3 x 3
#> x y z
#> <dbl> <chr> <list>
#> 1 1 a <int [5]>
#> 2 2 <NA> <dbl [1]>
#> 3 NA b <int [11]>
17.2.1.3 fill()
fill in missing values with previous or next value
df <- tibble(Month = rep(1:12, 2),
Year = c(2000, rep(NA, 11),
2001, rep(NA, 11)))
df %>% fill(Year) # .direction = "down"
#> # A tibble: 24 x 2
#> Month Year
#> <int> <dbl>
#> 1 1 2000
#> 2 2 2000
#> 3 3 2000
#> 4 4 2000
#> 5 5 2000
#> 6 6 2000
#> # ... with 18 more rows
df %>% fill(Year, .direction = "up")
#> # A tibble: 24 x 2
#> Month Year
#> <int> <dbl>
#> 1 1 2000
#> 2 2 2001
#> 3 3 2001
#> 4 4 2001
#> 5 5 2001
#> 6 6 2001
#> # ... with 18 more rows
df %>% fill(Year, .direction = "updown")
#> # A tibble: 24 x 2
#> Month Year
#> <int> <dbl>
#> 1 1 2000
#> 2 2 2001
#> 3 3 2001
#> 4 4 2001
#> 5 5 2001
#> 6 6 2001
#> # ... with 18 more rows
df %>% fill(Year, .direction = "downup")
#> # A tibble: 24 x 2
#> Month Year
#> <int> <dbl>
#> 1 1 2000
#> 2 2 2000
#> 3 3 2000
#> 4 4 2000
#> 5 5 2000
#> 6 6 2000
#> # ... with 18 more rows
17.2.1.4 full_seq()
create the full sequence of values in a vector
This is useful if you want to fill in missing values that should have been observed but weren’t. For example, full_seq(c(1, 2, 4, 6), 1)
will return 1:6.
17.2.1.5 expand()
expand data frame to include all combinations of values
expand()
creates a data frame containing all conbinations of specified columns, often used in conjunction with left_join()
to convert implicit missing values to explicit missing values, with anti_join()
to figure out which combinations are missing.
To find all unique combinations of x
, y
and z
, including those not found in the data, supply each variable as a separate argument. To find only the combinations that occur in the data, use nest: expand(df, nesting(x, y, z))
.
You can combine the two forms. For example, expand(df, nesting(school_id, student_id), date)
would produce a row for every student for each date.
For factors, the full set of levels (not just those that appear in the data) are used. For continuous variables, you may need to fill in values that don’t appear in the data: to do so use expressions like year = 2010:2020
or year = full_seq(year, 1)
.
# All possible combinations of vs & cyl, even those that aren't
# present in the data
expand(mtcars, vs, cyl)
#> # A tibble: 6 x 2
#> vs cyl
#> <dbl> <dbl>
#> 1 0 4
#> 2 0 6
#> 3 0 8
#> 4 1 4
#> 5 1 6
#> 6 1 8
# Only combinations of vs and cyl that appear in the data
expand(mtcars, nesting(vs, cyl))
#> # A tibble: 5 x 2
#> vs cyl
#> <dbl> <dbl>
#> 1 0 4
#> 2 0 6
#> 3 0 8
#> 4 1 4
#> 5 1 6
# Implicit missings ---------------------------------------------------------
df <- tibble(
year = c(2010, 2010, 2010, 2010, 2012, 2012, 2012),
qtr = c( 1, 2, 3, 4, 1, 2, 3),
return = rnorm(7)
)
df %>% expand(year, qtr)
#> # A tibble: 8 x 2
#> year qtr
#> <dbl> <dbl>
#> 1 2010 1
#> 2 2010 2
#> 3 2010 3
#> 4 2010 4
#> 5 2012 1
#> 6 2012 2
#> # ... with 2 more rows
df %>% expand(year = 2010:2012, qtr)
#> # A tibble: 12 x 2
#> year qtr
#> <int> <dbl>
#> 1 2010 1
#> 2 2010 2
#> 3 2010 3
#> 4 2010 4
#> 5 2011 1
#> 6 2011 2
#> # ... with 6 more rows
df %>% expand(year = full_seq(year, 1), qtr)
#> # A tibble: 12 x 2
#> year qtr
#> <dbl> <dbl>
#> 1 2010 1
#> 2 2010 2
#> 3 2010 3
#> 4 2010 4
#> 5 2011 1
#> 6 2011 2
#> # ... with 6 more rows
# Each person was given one of two treatments, repeated three times
# But some of the replications haven't happened yet, so we have
# incomplete data:
experiment <- tibble(
name = rep(c("Alex", "Robert", "Sam"), c(3, 2, 1)),
trt = rep(c("a", "b", "a"), c(3, 2, 1)),
rep = c(1, 2, 3, 1, 2, 1),
measurement_1 = runif(6),
measurement_2 = runif(6)
)
# We can figure out the complete set of data with expand()
# Each person only gets one treatment, so we nest name and trt together:
all <- experiment %>% expand(nesting(name, trt), rep)
all
#> # A tibble: 9 x 3
#> name trt rep
#> <chr> <chr> <dbl>
#> 1 Alex a 1
#> 2 Alex a 2
#> 3 Alex a 3
#> 4 Robert b 1
#> 5 Robert b 2
#> 6 Robert b 3
#> # ... with 3 more rows
# use left_join to convert implicit missing values to explicit missing values
all %>% left_join(experiment)
#> # A tibble: 9 x 5
#> name trt rep measurement_1 measurement_2
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 Alex a 1 0.614 0.379
#> 2 Alex a 2 0.915 0.811
#> 3 Alex a 3 0.838 0.475
#> 4 Robert b 1 0.855 0.838
#> 5 Robert b 2 0.501 0.739
#> 6 Robert b 3 NA NA
#> # ... with 3 more rows
# can use anti_join to figure out which observations are missing
all %>% anti_join(experiment)
#> # A tibble: 3 x 3
#> name trt rep
#> <chr> <chr> <dbl>
#> 1 Robert b 3
#> 2 Sam a 2
#> 3 Sam a 3
# And use right_join to add in the appropriate missing values to the
# original data
experiment %>% right_join(all)
#> # A tibble: 9 x 5
#> name trt rep measurement_1 measurement_2
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 Alex a 1 0.614 0.379
#> 2 Alex a 2 0.915 0.811
#> 3 Alex a 3 0.838 0.475
#> 4 Robert b 1 0.855 0.838
#> 5 Robert b 2 0.501 0.739
#> 6 Robert b 3 NA NA
#> # ... with 3 more rows
complete()
is a short hand function around expand() + left_join()
: firt create specified combinations and then left join original data to convert implicit missing values to explicit missing values:
experiment %>%
complete(rep, nesting(name, trt))
#> # A tibble: 9 x 5
#> rep name trt measurement_1 measurement_2
#> <dbl> <chr> <chr> <dbl> <dbl>
#> 1 1 Alex a 0.614 0.379
#> 2 1 Robert b 0.855 0.838
#> 3 1 Sam a 0.158 0.278
#> 4 2 Alex a 0.915 0.811
#> 5 2 Robert b 0.501 0.739
#> 6 2 Sam a NA NA
#> # ... with 3 more rows
17.2.1.6 expand_grid
: create a tibble from all combinations of inputs
expand_grid()
is analogus to a (atomic) vector version if expand()
. Instead of taking in a data frame, expand_grid()
use multiple name-value pairs to generate all combinations :
expand_grid(x = c(3, 2, 1), y = c(10, 5), z = letters[1:3])
#> # A tibble: 18 x 3
#> x y z
#> <dbl> <dbl> <chr>
#> 1 3 10 a
#> 2 3 10 b
#> 3 3 10 c
#> 4 3 5 a
#> 5 3 5 b
#> 6 3 5 c
#> # ... with 12 more rows
crossing()
is a wrapper around expand_grid()
that deduplicates and sorts each input.
17.2.2 janitor
remove_empty(dat, which = c("rows", "cols"))
(df <- tibble(x = c(1, NA, 3),
y = rep(NA, 3),
z = c(4, NA, 5)))
#> # A tibble: 3 x 3
#> x y z
#> <dbl> <lgl> <dbl>
#> 1 1 NA 4
#> 2 NA NA NA
#> 3 3 NA 5
df %>% remove_empty()
#> # A tibble: 2 x 2
#> x z
#> <dbl> <dbl>
#> 1 1 4
#> 2 3 5
df %>% remove_empty("rows")
#> # A tibble: 2 x 3
#> x y z
#> <dbl> <lgl> <dbl>
#> 1 1 NA 4
#> 2 3 NA 5
df %>% remove_empty("cols")
#> # A tibble: 3 x 2
#> x z
#> <dbl> <dbl>
#> 1 1 4
#> 2 NA NA
#> 3 3 5