17.2 Wrangling

17.2.1 tidyr

tidyr provides a handful of tools for converting between implicit (absent rows) and explicit (NA) missing values, and for handling explicit NAs.

17.2.1.1 `drop_na()` drop rows containing missing values

# df %>% drop_na(Ozone, Solar.R, Wind, Temp, Month, Day)
airquality <- airquality %>% as_tibble()
airquality %>% drop_na() 
#> # A tibble: 111 x 6
#>   Ozone Solar.R  Wind  Temp Month   Day
#>   <int>   <int> <dbl> <int> <int> <int>
#> 1    41     190   7.4    67     5     1
#> 2    36     118   8      72     5     2
#> 3    12     149  12.6    74     5     3
#> 4    18     313  11.5    62     5     4
#> 5    23     299   8.6    65     5     7
#> 6    19      99  13.8    59     5     8
#> # ... with 105 more rows

airquality %>% drop_na(Solar.R)
#> # A tibble: 146 x 6
#>   Ozone Solar.R  Wind  Temp Month   Day
#>   <int>   <int> <dbl> <int> <int> <int>
#> 1    41     190   7.4    67     5     1
#> 2    36     118   8      72     5     2
#> 3    12     149  12.6    74     5     3
#> 4    18     313  11.5    62     5     4
#> 5    23     299   8.6    65     5     7
#> 6    19      99  13.8    59     5     8
#> # ... with 140 more rows

17.2.1.2 `replace_na()` replace missing values

# NULL are the list-col equivalent of NAs
df <- tibble(x = c(1, 2, NA), 
             y = c("a", NA, "b"), 
             z = list(1:5, NULL, 10:20)) 


df %>% mutate(x = replace_na(x, 0))
#> # A tibble: 3 x 3
#>       x y     z         
#>   <dbl> <chr> <list>    
#> 1     1 a     <int [5]> 
#> 2     2 <NA>  <NULL>    
#> 3     0 b     <int [11]>

df %>% replace_na(list(x = 0, y = "unknown"))
#> # A tibble: 3 x 3
#>       x y       z         
#>   <dbl> <chr>   <list>    
#> 1     1 a       <int [5]> 
#> 2     2 unknown <NULL>    
#> 3     0 b       <int [11]>

df %>% replace_na(list(z = 5))
#> # A tibble: 3 x 3
#>       x y     z         
#>   <dbl> <chr> <list>    
#> 1     1 a     <int [5]> 
#> 2     2 <NA>  <dbl [1]> 
#> 3    NA b     <int [11]>

17.2.1.3 `fill()` fill in missing values with previous or next value

df <- tibble(Month = rep(1:12, 2), 
             Year = c(2000, rep(NA, 11), 
                      2001, rep(NA, 11)))

df %>% fill(Year) # .direction = "down"
#> # A tibble: 24 x 2
#>   Month  Year
#>   <int> <dbl>
#> 1     1  2000
#> 2     2  2000
#> 3     3  2000
#> 4     4  2000
#> 5     5  2000
#> 6     6  2000
#> # ... with 18 more rows
df %>% fill(Year, .direction = "up")
#> # A tibble: 24 x 2
#>   Month  Year
#>   <int> <dbl>
#> 1     1  2000
#> 2     2  2001
#> 3     3  2001
#> 4     4  2001
#> 5     5  2001
#> 6     6  2001
#> # ... with 18 more rows
df %>% fill(Year, .direction = "updown")
#> # A tibble: 24 x 2
#>   Month  Year
#>   <int> <dbl>
#> 1     1  2000
#> 2     2  2001
#> 3     3  2001
#> 4     4  2001
#> 5     5  2001
#> 6     6  2001
#> # ... with 18 more rows
df %>% fill(Year, .direction = "downup")
#> # A tibble: 24 x 2
#>   Month  Year
#>   <int> <dbl>
#> 1     1  2000
#> 2     2  2000
#> 3     3  2000
#> 4     4  2000
#> 5     5  2000
#> 6     6  2000
#> # ... with 18 more rows

17.2.1.4 `full_seq()` create the full sequence of values in a vector

This is useful if you want to fill in missing values that should have been observed but weren’t. For example, full_seq(c(1, 2, 4, 6), 1) will return 1:6.

# `period` specify step length
full_seq(c(1, 3, 5, 11, 13), period = 2)
#> [1]  1  3  5  7  9 11 13

# Works with dates, too!
y <- lubridate::ymd(c("2020-01-01", "2020-01-03"))

full_seq(y, period = 1)
#> [1] "2020-01-01" "2020-01-02" "2020-01-03"

17.2.1.5 `expand()` expand data frame to include all combinations of values

expand() creates a data frame containing all conbinations of specified columns, often used in conjunction with left_join() to convert implicit missing values to explicit missing values, with anti_join() to figure out which combinations are missing.

To find all unique combinations of x, y and z, including those not found in the data, supply each variable as a separate argument. To find only the combinations that occur in the data, use nest: expand(df, nesting(x, y, z)).

You can combine the two forms. For example, expand(df, nesting(school_id, student_id), date) would produce a row for every student for each date.

For factors, the full set of levels (not just those that appear in the data) are used. For continuous variables, you may need to fill in values that don’t appear in the data: to do so use expressions like year = 2010:2020or year = full_seq(year, 1).

# All possible combinations of vs & cyl, even those that aren't
# present in the data
expand(mtcars, vs, cyl)
#> # A tibble: 6 x 2
#>      vs   cyl
#>   <dbl> <dbl>
#> 1     0     4
#> 2     0     6
#> 3     0     8
#> 4     1     4
#> 5     1     6
#> 6     1     8

# Only combinations of vs and cyl that appear in the data
expand(mtcars, nesting(vs, cyl))
#> # A tibble: 5 x 2
#>      vs   cyl
#>   <dbl> <dbl>
#> 1     0     4
#> 2     0     6
#> 3     0     8
#> 4     1     4
#> 5     1     6

# Implicit missings ---------------------------------------------------------
df <- tibble(
  year   = c(2010, 2010, 2010, 2010, 2012, 2012, 2012),
  qtr    = c(   1,    2,    3,    4,    1,    2,    3),
  return = rnorm(7)
)

df %>% expand(year, qtr)
#> # A tibble: 8 x 2
#>    year   qtr
#>   <dbl> <dbl>
#> 1  2010     1
#> 2  2010     2
#> 3  2010     3
#> 4  2010     4
#> 5  2012     1
#> 6  2012     2
#> # ... with 2 more rows

df %>% expand(year = 2010:2012, qtr)
#> # A tibble: 12 x 2
#>    year   qtr
#>   <int> <dbl>
#> 1  2010     1
#> 2  2010     2
#> 3  2010     3
#> 4  2010     4
#> 5  2011     1
#> 6  2011     2
#> # ... with 6 more rows

df %>% expand(year = full_seq(year, 1), qtr)
#> # A tibble: 12 x 2
#>    year   qtr
#>   <dbl> <dbl>
#> 1  2010     1
#> 2  2010     2
#> 3  2010     3
#> 4  2010     4
#> 5  2011     1
#> 6  2011     2
#> # ... with 6 more rows

# Each person was given one of two treatments, repeated three times
# But some of the replications haven't happened yet, so we have
# incomplete data:
experiment <- tibble(
  name = rep(c("Alex", "Robert", "Sam"), c(3, 2, 1)),
  trt  = rep(c("a", "b", "a"), c(3, 2, 1)),
  rep = c(1, 2, 3, 1, 2, 1),
  measurement_1 = runif(6),
  measurement_2 = runif(6)
)

# We can figure out the complete set of data with expand()
# Each person only gets one treatment, so we nest name and trt together:
all <- experiment %>% expand(nesting(name, trt), rep)
all
#> # A tibble: 9 x 3
#>   name   trt     rep
#>   <chr>  <chr> <dbl>
#> 1 Alex   a         1
#> 2 Alex   a         2
#> 3 Alex   a         3
#> 4 Robert b         1
#> 5 Robert b         2
#> 6 Robert b         3
#> # ... with 3 more rows

# use left_join to convert implicit missing values to explicit missing values
all %>% left_join(experiment)
#> # A tibble: 9 x 5
#>   name   trt     rep measurement_1 measurement_2
#>   <chr>  <chr> <dbl>         <dbl>         <dbl>
#> 1 Alex   a         1         0.614         0.379
#> 2 Alex   a         2         0.915         0.811
#> 3 Alex   a         3         0.838         0.475
#> 4 Robert b         1         0.855         0.838
#> 5 Robert b         2         0.501         0.739
#> 6 Robert b         3        NA            NA    
#> # ... with 3 more rows
# can use anti_join to figure out which observations are missing
all %>% anti_join(experiment)
#> # A tibble: 3 x 3
#>   name   trt     rep
#>   <chr>  <chr> <dbl>
#> 1 Robert b         3
#> 2 Sam    a         2
#> 3 Sam    a         3


# And use right_join to add in the appropriate missing values to the
# original data
experiment %>% right_join(all)
#> # A tibble: 9 x 5
#>   name   trt     rep measurement_1 measurement_2
#>   <chr>  <chr> <dbl>         <dbl>         <dbl>
#> 1 Alex   a         1         0.614         0.379
#> 2 Alex   a         2         0.915         0.811
#> 3 Alex   a         3         0.838         0.475
#> 4 Robert b         1         0.855         0.838
#> 5 Robert b         2         0.501         0.739
#> 6 Robert b         3        NA            NA    
#> # ... with 3 more rows

complete() is a short hand function around expand() + left_join(): firt create specified combinations and then left join original data to convert implicit missing values to explicit missing values:

experiment %>% 
  complete(rep, nesting(name, trt))
#> # A tibble: 9 x 5
#>     rep name   trt   measurement_1 measurement_2
#>   <dbl> <chr>  <chr>         <dbl>         <dbl>
#> 1     1 Alex   a             0.614         0.379
#> 2     1 Robert b             0.855         0.838
#> 3     1 Sam    a             0.158         0.278
#> 4     2 Alex   a             0.915         0.811
#> 5     2 Robert b             0.501         0.739
#> 6     2 Sam    a            NA            NA    
#> # ... with 3 more rows

17.2.1.6 `expand_grid`: create a tibble from all combinations of inputs

expand_grid() is analogus to a (atomic) vector version if expand(). Instead of taking in a data frame, expand_grid() use multiple name-value pairs to generate all combinations :

expand_grid(x = c(3, 2, 1), y = c(10, 5), z = letters[1:3])
#> # A tibble: 18 x 3
#>       x     y z    
#>   <dbl> <dbl> <chr>
#> 1     3    10 a    
#> 2     3    10 b    
#> 3     3    10 c    
#> 4     3     5 a    
#> 5     3     5 b    
#> 6     3     5 c    
#> # ... with 12 more rows

crossing() is a wrapper around expand_grid() that deduplicates and sorts each input.

crossing(x = c(3, 2, 1), y = c(10, 5), z = letters[1:3])
#> # A tibble: 18 x 3
#>       x     y z    
#>   <dbl> <dbl> <chr>
#> 1     1     5 a    
#> 2     1     5 b    
#> 3     1     5 c    
#> 4     1    10 a    
#> 5     1    10 b    
#> 6     1    10 c    
#> # ... with 12 more rows

17.2.2 janitor

library(janitor)

remove_empty(dat, which = c("rows", "cols"))

(df <- tibble(x = c(1, NA, 3),
             y = rep(NA, 3),
             z = c(4, NA, 5)))
#> # A tibble: 3 x 3
#>       x y         z
#>   <dbl> <lgl> <dbl>
#> 1     1 NA        4
#> 2    NA NA       NA
#> 3     3 NA        5

df %>% remove_empty()
#> # A tibble: 2 x 2
#>       x     z
#>   <dbl> <dbl>
#> 1     1     4
#> 2     3     5
df %>% remove_empty("rows")
#> # A tibble: 2 x 3
#>       x y         z
#>   <dbl> <lgl> <dbl>
#> 1     1 NA        4
#> 2     3 NA        5
df %>% remove_empty("cols")
#> # A tibble: 3 x 2
#>       x     z
#>   <dbl> <dbl>
#> 1     1     4
#> 2    NA    NA
#> 3     3     5

17.2.3 visdat

visdat::vis_missing()

library(visdat)
vis_miss(df)

Advanced: mice, Amelia

17.2 Wrangling

17.2.1 tidyr

17.2.1.1 drop_na() drop rows containing missing values

17.2.1.2 replace_na() replace missing values

17.2.1.3 fill() fill in missing values with previous or next value

17.2.1.4 full_seq() create the full sequence of values in a vector

17.2.1.5 expand() expand data frame to include all combinations of values

17.2.1.6 expand_grid: create a tibble from all combinations of inputs