Chapter 6 Tidy Data with Tidyr

Topics covered:

gather
spread
separate
complete
fill

6.1 visualizing data

#rate per 10,000
table1%>%
  mutate(rate=cases/population*10000)

## # A tibble: 6 x 5
##   country      year  cases population  rate
##   <chr>       <int>  <int>      <int> <dbl>
## 1 Afghanistan  1999    745   19987071 0.373
## 2 Afghanistan  2000   2666   20595360 1.29 
## 3 Brazil       1999  37737  172006362 2.19 
## 4 Brazil       2000  80488  174504898 4.61 
## 5 China        1999 212258 1272915272 1.67 
## 6 China        2000 213766 1280428583 1.67

#cases per year
table1%>%
  count(year, wt=cases)

## # A tibble: 2 x 2
##    year      n
## * <int>  <int>
## 1  1999 250740
## 2  2000 296920

library(ggplot2)
ggplot(table1, aes(year, cases))+
  geom_line(aes(group=country), color='grey50')+
  geom_point(aes(color=country))

6.2 gather

#gathering
(tidy_4a <- table4a%>%
  gather('1999', '2000', key = 'year', value = 'cases'))

## # A tibble: 6 x 3
##   country     year   cases
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Brazil      1999   37737
## 3 China       1999  212258
## 4 Afghanistan 2000    2666
## 5 Brazil      2000   80488
## 6 China       2000  213766

(tidy_4b <- table4b%>%
  gather('1999', '2000', key = 'year', value = 'population'))

## # A tibble: 6 x 3
##   country     year  population
##   <chr>       <chr>      <int>
## 1 Afghanistan 1999    19987071
## 2 Brazil      1999   172006362
## 3 China       1999  1272915272
## 4 Afghanistan 2000    20595360
## 5 Brazil      2000   174504898
## 6 China       2000  1280428583

left_join(tidy_4a, tidy_4b)

## Joining, by = c("country", "year")

## # A tibble: 6 x 4
##   country     year   cases population
##   <chr>       <chr>  <int>      <int>
## 1 Afghanistan 1999     745   19987071
## 2 Brazil      1999   37737  172006362
## 3 China       1999  212258 1272915272
## 4 Afghanistan 2000    2666   20595360
## 5 Brazil      2000   80488  174504898
## 6 China       2000  213766 1280428583

6.3 spread

#spreading
#key is the variable we aim to spread
table2

## # A tibble: 12 x 4
##    country      year type            count
##    <chr>       <int> <chr>           <int>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583

spread(table2, key = type, value = count)

## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

6.4 separate

#separate
table3

## # A tibble: 6 x 3
##   country      year rate             
## * <chr>       <int> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583

table3 %>%
  separate(rate, into = c("cases", 'population'))

## # A tibble: 6 x 4
##   country      year cases  population
##   <chr>       <int> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

#by default, separate at non-alphanumeric character

table3%>%
  separate(rate, into = c('cases', 'population'), sep = '/')

## # A tibble: 6 x 4
##   country      year cases  population
##   <chr>       <int> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

table3%>%
  separate(rate, into = c('cases', 'population'), sep = '/',
           convert = T)

## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

table3%>%
  separate(year, into = c('century', 'year'), sep = 2)

## # A tibble: 6 x 4
##   country     century year  rate             
##   <chr>       <chr>   <chr> <chr>            
## 1 Afghanistan 19      99    745/19987071     
## 2 Afghanistan 20      00    2666/20595360    
## 3 Brazil      19      99    37737/172006362  
## 4 Brazil      20      00    80488/174504898  
## 5 China       19      99    212258/1272915272
## 6 China       20      00    213766/1280428583

6.5 unite

#unite
table5%>%
  unite(new, century, year)

## # A tibble: 6 x 3
##   country     new   rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 19_99 745/19987071     
## 2 Afghanistan 20_00 2666/20595360    
## 3 Brazil      19_99 37737/172006362  
## 4 Brazil      20_00 80488/174504898  
## 5 China       19_99 212258/1272915272
## 6 China       20_00 213766/1280428583

#by default sep=underscore
table5%>%
  unite(new, century, year, sep = '')

## # A tibble: 6 x 3
##   country     new   rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 1999  745/19987071     
## 2 Afghanistan 2000  2666/20595360    
## 3 Brazil      1999  37737/172006362  
## 4 Brazil      2000  80488/174504898  
## 5 China       1999  212258/1272915272
## 6 China       2000  213766/1280428583

#missing value
(stocks <- tibble(
  year=rep(c(2015,2016), each=4)[-8],
  qtr=rep(1:4, times=2)[-5],
  return=c(1.88, 0.59, 0.35, NA, 0.92, 0.17, 2.66)
))

## # A tibble: 7 x 3
##    year   qtr return
##   <dbl> <int>  <dbl>
## 1  2015     1   1.88
## 2  2015     2   0.59
## 3  2015     3   0.35
## 4  2015     4  NA   
## 5  2016     2   0.92
## 6  2016     3   0.17
## 7  2016     4   2.66

stocks%>%
  spread(key=year, value=return)

## # A tibble: 4 x 3
##     qtr `2015` `2016`
##   <int>  <dbl>  <dbl>
## 1     1   1.88  NA   
## 2     2   0.59   0.92
## 3     3   0.35   0.17
## 4     4  NA      2.66

stocks%>%
  spread(year, return)%>%
  gather('2015', '2016', key = 'year', value = 'return', na.rm = T)

## # A tibble: 6 x 3
##     qtr year  return
##   <int> <chr>  <dbl>
## 1     1 2015    1.88
## 2     2 2015    0.59
## 3     3 2015    0.35
## 4     2 2016    0.92
## 5     3 2016    0.17
## 6     4 2016    2.66

#stocks%>%
#  spread(year, return)%>%
#  gather(year, return, '2015','2016', na.rm = T)

#complete cases
#identify unique combinations 
stocks%>%
  complete(year, qtr)

## # A tibble: 8 x 3
##    year   qtr return
##   <dbl> <int>  <dbl>
## 1  2015     1   1.88
## 2  2015     2   0.59
## 3  2015     3   0.35
## 4  2015     4  NA   
## 5  2016     1  NA   
## 6  2016     2   0.92
## 7  2016     3   0.17
## 8  2016     4   2.66

#fill
(treatment <- tribble(
  ~person, ~treatment, ~response,
  "Derrick Whitemore", 1, 7,
  NA, 2, 10,
  NA, 3, 9,
  "Katherine Burke", 1, 4
))

## # A tibble: 4 x 3
##   person            treatment response
##   <chr>                 <dbl>    <dbl>
## 1 Derrick Whitemore         1        7
## 2 <NA>                      2       10
## 3 <NA>                      3        9
## 4 Katherine Burke           1        4

#last observation carried forward
treatment %>%
  fill(person)

## # A tibble: 4 x 3
##   person            treatment response
##   <chr>                 <dbl>    <dbl>
## 1 Derrick Whitemore         1        7
## 2 Derrick Whitemore         2       10
## 3 Derrick Whitemore         3        9
## 4 Katherine Burke           1        4

#an example
who

## # A tibble: 7,240 x 60
##    country iso2  iso3   year new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544 new_sp_m4554 new_sp_m5564 new_sp_m65
##    <chr>   <chr> <chr> <int>       <int>        <int>        <int>        <int>        <int>        <int>      <int>
##  1 Afghan… AF    AFG    1980          NA           NA           NA           NA           NA           NA         NA
##  2 Afghan… AF    AFG    1981          NA           NA           NA           NA           NA           NA         NA
##  3 Afghan… AF    AFG    1982          NA           NA           NA           NA           NA           NA         NA
##  4 Afghan… AF    AFG    1983          NA           NA           NA           NA           NA           NA         NA
##  5 Afghan… AF    AFG    1984          NA           NA           NA           NA           NA           NA         NA
##  6 Afghan… AF    AFG    1985          NA           NA           NA           NA           NA           NA         NA
##  7 Afghan… AF    AFG    1986          NA           NA           NA           NA           NA           NA         NA
##  8 Afghan… AF    AFG    1987          NA           NA           NA           NA           NA           NA         NA
##  9 Afghan… AF    AFG    1988          NA           NA           NA           NA           NA           NA         NA
## 10 Afghan… AF    AFG    1989          NA           NA           NA           NA           NA           NA         NA
## # … with 7,230 more rows, and 49 more variables: new_sp_f014 <int>, new_sp_f1524 <int>, new_sp_f2534 <int>,
## #   new_sp_f3544 <int>, new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>, new_sn_m014 <int>, new_sn_m1524 <int>,
## #   new_sn_m2534 <int>, new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>, new_sn_m65 <int>, new_sn_f014 <int>,
## #   new_sn_f1524 <int>, new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>, new_sn_f5564 <int>, new_sn_f65 <int>,
## #   new_ep_m014 <int>, new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>, new_ep_m4554 <int>,
## #   new_ep_m5564 <int>, new_ep_m65 <int>, new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>, new_ep_f3544 <int>,
## #   new_ep_f4554 <int>, new_ep_f5564 <int>, new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>, newrel_m2534 <int>,
## #   newrel_m3544 <int>, newrel_m4554 <int>, newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>, newrel_f1524 <int>,
## #   newrel_f2534 <int>, newrel_f3544 <int>, newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>

names(who)

##  [1] "country"      "iso2"         "iso3"         "year"         "new_sp_m014"  "new_sp_m1524" "new_sp_m2534"
##  [8] "new_sp_m3544" "new_sp_m4554" "new_sp_m5564" "new_sp_m65"   "new_sp_f014"  "new_sp_f1524" "new_sp_f2534"
## [15] "new_sp_f3544" "new_sp_f4554" "new_sp_f5564" "new_sp_f65"   "new_sn_m014"  "new_sn_m1524" "new_sn_m2534"
## [22] "new_sn_m3544" "new_sn_m4554" "new_sn_m5564" "new_sn_m65"   "new_sn_f014"  "new_sn_f1524" "new_sn_f2534"
## [29] "new_sn_f3544" "new_sn_f4554" "new_sn_f5564" "new_sn_f65"   "new_ep_m014"  "new_ep_m1524" "new_ep_m2534"
## [36] "new_ep_m3544" "new_ep_m4554" "new_ep_m5564" "new_ep_m65"   "new_ep_f014"  "new_ep_f1524" "new_ep_f2534"
## [43] "new_ep_f3544" "new_ep_f4554" "new_ep_f5564" "new_ep_f65"   "newrel_m014"  "newrel_m1524" "newrel_m2534"
## [50] "newrel_m3544" "newrel_m4554" "newrel_m5564" "newrel_m65"   "newrel_f014"  "newrel_f1524" "newrel_f2534"
## [57] "newrel_f3544" "newrel_f4554" "newrel_f5564" "newrel_f65"

(who1<-who%>%
    gather(new_sp_m014:newrel_f65, key = 'key', value = 'cases', na.rm = T))

## # A tibble: 76,046 x 6
##    country     iso2  iso3   year key         cases
##    <chr>       <chr> <chr> <int> <chr>       <int>
##  1 Afghanistan AF    AFG    1997 new_sp_m014     0
##  2 Afghanistan AF    AFG    1998 new_sp_m014    30
##  3 Afghanistan AF    AFG    1999 new_sp_m014     8
##  4 Afghanistan AF    AFG    2000 new_sp_m014    52
##  5 Afghanistan AF    AFG    2001 new_sp_m014   129
##  6 Afghanistan AF    AFG    2002 new_sp_m014    90
##  7 Afghanistan AF    AFG    2003 new_sp_m014   127
##  8 Afghanistan AF    AFG    2004 new_sp_m014   139
##  9 Afghanistan AF    AFG    2005 new_sp_m014   151
## 10 Afghanistan AF    AFG    2006 new_sp_m014   193
## # … with 76,036 more rows

who1%>%
  count(key)

## # A tibble: 56 x 2
##    key              n
##  * <chr>        <int>
##  1 new_ep_f014   1032
##  2 new_ep_f1524  1021
##  3 new_ep_f2534  1021
##  4 new_ep_f3544  1021
##  5 new_ep_f4554  1017
##  6 new_ep_f5564  1017
##  7 new_ep_f65    1014
##  8 new_ep_m014   1038
##  9 new_ep_m1524  1026
## 10 new_ep_m2534  1020
## # … with 46 more rows

(who2 <- who1%>%
  mutate(key=stringr::str_replace(key, "newrel", "new_rel")))

## # A tibble: 76,046 x 6
##    country     iso2  iso3   year key         cases
##    <chr>       <chr> <chr> <int> <chr>       <int>
##  1 Afghanistan AF    AFG    1997 new_sp_m014     0
##  2 Afghanistan AF    AFG    1998 new_sp_m014    30
##  3 Afghanistan AF    AFG    1999 new_sp_m014     8
##  4 Afghanistan AF    AFG    2000 new_sp_m014    52
##  5 Afghanistan AF    AFG    2001 new_sp_m014   129
##  6 Afghanistan AF    AFG    2002 new_sp_m014    90
##  7 Afghanistan AF    AFG    2003 new_sp_m014   127
##  8 Afghanistan AF    AFG    2004 new_sp_m014   139
##  9 Afghanistan AF    AFG    2005 new_sp_m014   151
## 10 Afghanistan AF    AFG    2006 new_sp_m014   193
## # … with 76,036 more rows

(who3 <- who2%>%
    separate(key, c("new", 'type', 'sexage'), sep = '_'))

## # A tibble: 76,046 x 8
##    country     iso2  iso3   year new   type  sexage cases
##    <chr>       <chr> <chr> <int> <chr> <chr> <chr>  <int>
##  1 Afghanistan AF    AFG    1997 new   sp    m014       0
##  2 Afghanistan AF    AFG    1998 new   sp    m014      30
##  3 Afghanistan AF    AFG    1999 new   sp    m014       8
##  4 Afghanistan AF    AFG    2000 new   sp    m014      52
##  5 Afghanistan AF    AFG    2001 new   sp    m014     129
##  6 Afghanistan AF    AFG    2002 new   sp    m014      90
##  7 Afghanistan AF    AFG    2003 new   sp    m014     127
##  8 Afghanistan AF    AFG    2004 new   sp    m014     139
##  9 Afghanistan AF    AFG    2005 new   sp    m014     151
## 10 Afghanistan AF    AFG    2006 new   sp    m014     193
## # … with 76,036 more rows

who3 %>%count(new)

## # A tibble: 1 x 2
##   new       n
## * <chr> <int>
## 1 new   76046

(who4<- who3%>%
    select(-new, -iso2, -iso3))

## # A tibble: 76,046 x 5
##    country      year type  sexage cases
##    <chr>       <int> <chr> <chr>  <int>
##  1 Afghanistan  1997 sp    m014       0
##  2 Afghanistan  1998 sp    m014      30
##  3 Afghanistan  1999 sp    m014       8
##  4 Afghanistan  2000 sp    m014      52
##  5 Afghanistan  2001 sp    m014     129
##  6 Afghanistan  2002 sp    m014      90
##  7 Afghanistan  2003 sp    m014     127
##  8 Afghanistan  2004 sp    m014     139
##  9 Afghanistan  2005 sp    m014     151
## 10 Afghanistan  2006 sp    m014     193
## # … with 76,036 more rows

(who5 <- who4%>%
    separate(sexage, c('sex', 'age'), sep = 1))

## # A tibble: 76,046 x 6
##    country      year type  sex   age   cases
##    <chr>       <int> <chr> <chr> <chr> <int>
##  1 Afghanistan  1997 sp    m     014       0
##  2 Afghanistan  1998 sp    m     014      30
##  3 Afghanistan  1999 sp    m     014       8
##  4 Afghanistan  2000 sp    m     014      52
##  5 Afghanistan  2001 sp    m     014     129
##  6 Afghanistan  2002 sp    m     014      90
##  7 Afghanistan  2003 sp    m     014     127
##  8 Afghanistan  2004 sp    m     014     139
##  9 Afghanistan  2005 sp    m     014     151
## 10 Afghanistan  2006 sp    m     014     193
## # … with 76,036 more rows

who%>%
  gather(code, value, new_sp_m014:newrel_f65, na.rm = T)%>%
  mutate(code=stringr::str_replace(code, 'newrel', 'new_rel'))%>%
      separate(code, c('new', 'var', 'sexage'))%>%
        select(-new, -iso2, -iso3)%>%
        separate(sexage, c('sex', 'age'), sep = 1)

## # A tibble: 76,046 x 6
##    country      year var   sex   age   value
##    <chr>       <int> <chr> <chr> <chr> <int>
##  1 Afghanistan  1997 sp    m     014       0
##  2 Afghanistan  1998 sp    m     014      30
##  3 Afghanistan  1999 sp    m     014       8
##  4 Afghanistan  2000 sp    m     014      52
##  5 Afghanistan  2001 sp    m     014     129
##  6 Afghanistan  2002 sp    m     014      90
##  7 Afghanistan  2003 sp    m     014     127
##  8 Afghanistan  2004 sp    m     014     139
##  9 Afghanistan  2005 sp    m     014     151
## 10 Afghanistan  2006 sp    m     014     193
## # … with 76,036 more rows

who%>%
  gather(code, value, new_sp_m014:newrel_f65, na.rm=TRUE)

## # A tibble: 76,046 x 6
##    country     iso2  iso3   year code        value
##    <chr>       <chr> <chr> <int> <chr>       <int>
##  1 Afghanistan AF    AFG    1997 new_sp_m014     0
##  2 Afghanistan AF    AFG    1998 new_sp_m014    30
##  3 Afghanistan AF    AFG    1999 new_sp_m014     8
##  4 Afghanistan AF    AFG    2000 new_sp_m014    52
##  5 Afghanistan AF    AFG    2001 new_sp_m014   129
##  6 Afghanistan AF    AFG    2002 new_sp_m014    90
##  7 Afghanistan AF    AFG    2003 new_sp_m014   127
##  8 Afghanistan AF    AFG    2004 new_sp_m014   139
##  9 Afghanistan AF    AFG    2005 new_sp_m014   151
## 10 Afghanistan AF    AFG    2006 new_sp_m014   193
## # … with 76,036 more rows