Part 6 Week 2 Synchronous

6.1 Goal

  • Warm-up
  • Transform data in action
  • Tidy data in action

Examples are from https://r4ds.had.co.nz/index.html

6.2 Warm-up

6.2.1 Introduce short cut

insert chunk
option + command + I (MAC)
control + alt + I (Windows)

%>%
cmd/control + shift + M

<-
option + - (MAC)
alt + - (Windows)

help in R
?function_name

6.2.2 Subsetting

library(tidyverse)
set.seed(1234)
df <- tibble(
  x = runif(5),
  y = rnorm(5)
)

df
#> # A tibble: 5 x 2
#>       x       y
#>   <dbl>   <dbl>
#> 1 0.114  0.359 
#> 2 0.622 -0.730 
#> 3 0.609  0.0357
#> 4 0.623  0.113 
#> 5 0.861  1.43

6.2.2.1 I want to extract column x.

# Extract by name
df$x
#> [1] 0.1137 0.6223 0.6093 0.6234 0.8609
df[["x"]]
#> [1] 0.1137 0.6223 0.6093 0.6234 0.8609
df %>% .$x
#> [1] 0.1137 0.6223 0.6093 0.6234 0.8609
df %>% .[["x"]]
#> [1] 0.1137 0.6223 0.6093 0.6234 0.8609
df %>% pull(x)
#> [1] 0.1137 0.6223 0.6093 0.6234 0.8609
# Extract by position
df[[1]]
#> [1] 0.1137 0.6223 0.6093 0.6234 0.8609

6.3 Transform data with dplyr

We will use data from the nycflights13 package.

install.packages("nycflights13")

After we installed it, let’s import it.

library(nycflights13)

Here is the data.

flights
#> # A tibble: 336,776 x 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>
#> 1  2013     1     1      517            515         2      830
#> 2  2013     1     1      533            529         4      850
#> 3  2013     1     1      542            540         2      923
#> 4  2013     1     1      544            545        -1     1004
#> 5  2013     1     1      554            600        -6      812
#> 6  2013     1     1      554            558        -4      740
#> # … with 336,770 more rows, and 12 more variables:
#> #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#> #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#> #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> #   time_hour <dttm>

This data frame contains all 336,776 flights that departed from New York City in 2013.

To know more about it, use help function.

?flights

int stands for integers
dbl stands for doubles, or real numbers
chr stands for character vectors, or strings
dttm stands for date-times (a date + a time)

6.3.1 Pick rows (observations): filter()

?filter

6.3.1.1 What are all flights on January 1st?

flights %>% 
  filter(month == 1, day == 1)
#> # A tibble: 842 x 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>
#> 1  2013     1     1      517            515         2      830
#> 2  2013     1     1      533            529         4      850
#> 3  2013     1     1      542            540         2      923
#> 4  2013     1     1      544            545        -1     1004
#> 5  2013     1     1      554            600        -6      812
#> 6  2013     1     1      554            558        -4      740
#> # … with 836 more rows, and 12 more variables: sched_arr_time <int>,
#> #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#> #   hour <dbl>, minute <dbl>, time_hour <dttm>

6.3.1.2 What are all flights on December 25th?

flights %>% 
  filter(month == 12, day ==25)
#> # A tibble: 719 x 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>
#> 1  2013    12    25      456            500        -4      649
#> 2  2013    12    25      524            515         9      805
#> 3  2013    12    25      542            540         2      832
#> 4  2013    12    25      546            550        -4     1022
#> 5  2013    12    25      556            600        -4      730
#> 6  2013    12    25      557            600        -3      743
#> # … with 713 more rows, and 12 more variables: sched_arr_time <int>,
#> #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#> #   hour <dbl>, minute <dbl>, time_hour <dttm>

6.3.1.3 What are all flights that departed in November or December?

Here we need some logical operators

& is “and,” | is “or,” and ! is “not.”

flights %>% 
  filter(month == 11 | month == 12)
#> # A tibble: 55,403 x 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>
#> 1  2013    11     1        5           2359         6      352
#> 2  2013    11     1       35           2250       105      123
#> 3  2013    11     1      455            500        -5      641
#> 4  2013    11     1      539            545        -6      856
#> 5  2013    11     1      542            545        -3      831
#> 6  2013    11     1      549            600       -11      912
#> # … with 55,397 more rows, and 12 more variables:
#> #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#> #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#> #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> #   time_hour <dttm>

Another way is to use %in%, this can save some writing。

flights %>% 
  filter(month %in% c(11, 12))
#> # A tibble: 55,403 x 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>
#> 1  2013    11     1        5           2359         6      352
#> 2  2013    11     1       35           2250       105      123
#> 3  2013    11     1      455            500        -5      641
#> 4  2013    11     1      539            545        -6      856
#> 5  2013    11     1      542            545        -3      831
#> 6  2013    11     1      549            600       -11      912
#> # … with 55,397 more rows, and 12 more variables:
#> #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#> #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#> #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> #   time_hour <dttm>

6.3.2 Reorder rows: arrange()

arrange() works similarly to filter() except that instead of selecting rows, it changes their order.

6.3.2.1 I want the table ordered by date (year, month, day).

flights %>% 
  arrange(year, month, day)
#> # A tibble: 336,776 x 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>
#> 1  2013     1     1      517            515         2      830
#> 2  2013     1     1      533            529         4      850
#> 3  2013     1     1      542            540         2      923
#> 4  2013     1     1      544            545        -1     1004
#> 5  2013     1     1      554            600        -6      812
#> 6  2013     1     1      554            558        -4      740
#> # … with 336,770 more rows, and 12 more variables:
#> #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#> #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#> #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> #   time_hour <dttm>

6.3.2.2 I want the table ordered by ascending departure delay.

flights %>% 
  arrange(dep_delay)
#> # A tibble: 336,776 x 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>
#> 1  2013    12     7     2040           2123       -43       40
#> 2  2013     2     3     2022           2055       -33     2240
#> 3  2013    11    10     1408           1440       -32     1549
#> 4  2013     1    11     1900           1930       -30     2233
#> 5  2013     1    29     1703           1730       -27     1947
#> 6  2013     8     9      729            755       -26     1002
#> # … with 336,770 more rows, and 12 more variables:
#> #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#> #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#> #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> #   time_hour <dttm>

6.3.2.3 I want the table ordered by descending departure delay.

flights %>% 
  arrange(desc(dep_delay))
#> # A tibble: 336,776 x 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>
#> 1  2013     1     9      641            900      1301     1242
#> 2  2013     6    15     1432           1935      1137     1607
#> 3  2013     1    10     1121           1635      1126     1239
#> 4  2013     9    20     1139           1845      1014     1457
#> 5  2013     7    22      845           1600      1005     1044
#> 6  2013     4    10     1100           1900       960     1342
#> # … with 336,770 more rows, and 12 more variables:
#> #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#> #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#> #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
#> #   time_hour <dttm>

6.3.3 Pick columns (variables): select()

6.3.3.1 I only need the column of year, month, day, and origin, dest

flights %>% 
  select(year, month, day, origin, dest)
#> # A tibble: 336,776 x 5
#>    year month   day origin dest 
#>   <int> <int> <int> <chr>  <chr>
#> 1  2013     1     1 EWR    IAH  
#> 2  2013     1     1 LGA    IAH  
#> 3  2013     1     1 JFK    MIA  
#> 4  2013     1     1 JFK    BQN  
#> 5  2013     1     1 LGA    ATL  
#> 6  2013     1     1 EWR    ORD  
#> # … with 336,770 more rows
flights %>% 
  select(year:day, origin, dest)
#> # A tibble: 336,776 x 5
#>    year month   day origin dest 
#>   <int> <int> <int> <chr>  <chr>
#> 1  2013     1     1 EWR    IAH  
#> 2  2013     1     1 LGA    IAH  
#> 3  2013     1     1 JFK    MIA  
#> 4  2013     1     1 JFK    BQN  
#> 5  2013     1     1 LGA    ATL  
#> 6  2013     1     1 EWR    ORD  
#> # … with 336,770 more rows

6.3.3.2 I want the time_hour to be the first column

flights %>% 
  select(time_hour, everything())
#> # A tibble: 336,776 x 19
#>   time_hour            year month   day dep_time sched_dep_time
#>   <dttm>              <int> <int> <int>    <int>          <int>
#> 1 2013-01-01 05:00:00  2013     1     1      517            515
#> 2 2013-01-01 05:00:00  2013     1     1      533            529
#> 3 2013-01-01 05:00:00  2013     1     1      542            540
#> 4 2013-01-01 05:00:00  2013     1     1      544            545
#> 5 2013-01-01 06:00:00  2013     1     1      554            600
#> 6 2013-01-01 05:00:00  2013     1     1      554            558
#> # … with 336,770 more rows, and 13 more variables: dep_delay <dbl>,
#> #   arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
#> #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
#> #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> #   minute <dbl>

6.3.4 Make new columns (variables): mutate()

Besides selecting sets of existing columns, it’s often useful to add new columns that are functions of existing columns. That’s the job of mutate().

6.3.4.1 I want the speed of each flight.

flights %>% 
  select(distance, air_time) %>% 
  mutate(speed_miles_minute = distance / air_time,
         speed_miles_hour = speed_miles_minute * 60)
#> # A tibble: 336,776 x 4
#>   distance air_time speed_miles_minute speed_miles_hour
#>      <dbl>    <dbl>              <dbl>            <dbl>
#> 1     1400      227               6.17             370.
#> 2     1416      227               6.24             374.
#> 3     1089      160               6.81             408.
#> 4     1576      183               8.61             517.
#> 5      762      116               6.57             394.
#> 6      719      150               4.79             288.
#> # … with 336,770 more rows

6.3.5 Collapse many values down to a single summary: summarise()

6.3.5.1 What is the average departure delay over the year?

flights %>% 
  summarise(delay = mean(dep_delay))
#> # A tibble: 1 x 1
#>   delay
#>   <dbl>
#> 1    NA
flights %>% 
  summarise(delay = mean(dep_delay, na.rm = TRUE))
#> # A tibble: 1 x 1
#>   delay
#>   <dbl>
#> 1  12.6

6.3.5.2 What is the average departure delay for each month?

Functions will be automatically applied “by group.”

flights %>% 
  group_by(month) %>% 
  summarise(delay_month = mean(dep_delay))
#> # A tibble: 12 x 2
#>   month delay_month
#>   <int>       <dbl>
#> 1     1          NA
#> 2     2          NA
#> 3     3          NA
#> 4     4          NA
#> 5     5          NA
#> 6     6          NA
#> # … with 6 more rows
flights %>% 
  group_by(month) %>% 
  summarise(delay_month = mean(dep_delay, na.rm = TRUE))
#> # A tibble: 12 x 2
#>   month delay_month
#>   <int>       <dbl>
#> 1     1        10.0
#> 2     2        10.8
#> 3     3        13.2
#> 4     4        13.9
#> 5     5        13.0
#> 6     6        20.8
#> # … with 6 more rows

6.3.5.3 What is the number of flights for each month?

flights %>% 
  group_by(month) %>% 
  summarise(n = n())
#> # A tibble: 12 x 2
#>   month     n
#>   <int> <int>
#> 1     1 27004
#> 2     2 24951
#> 3     3 28834
#> 4     4 28330
#> 5     5 28796
#> 6     6 28243
#> # … with 6 more rows

6.3.5.4 Exercise: What is the number of canceled flights for each month?

Missing values in dep_delay, arr_delay represent cancelled flights.

flights %>%
  group_by(month) %>%
  summarise(all = n(),
            canceled = sum(is.na(dep_delay) & is.na(arr_delay)))
#> # A tibble: 12 x 3
#>   month   all canceled
#>   <int> <int>    <int>
#> 1     1 27004      521
#> 2     2 24951     1261
#> 3     3 28834      861
#> 4     4 28330      668
#> 5     5 28796      563
#> 6     6 28243     1009
#> # … with 6 more rows

6.4 Tidy data with tidyr

  1. Each variable must have its own column.
  2. Each observation must have its own row.
  3. Each value must have its own cell.
table1
#> # A tibble: 6 x 4
#>   country      year  cases population
#>   <chr>       <int>  <int>      <int>
#> 1 Afghanistan  1999    745   19987071
#> 2 Afghanistan  2000   2666   20595360
#> 3 Brazil       1999  37737  172006362
#> 4 Brazil       2000  80488  174504898
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583

6.4.1 Reshape data

6.4.1.1 pivot_longer()

Look at table4a

  • The column names 1999 and 2000 represent values of the year variable.
  • The values in the 1999 and 2000 columns represent values of the cases variable, and each row represents two observations, not one.
table4a
#> # A tibble: 3 x 3
#>   country     `1999` `2000`
#> * <chr>        <int>  <int>
#> 1 Afghanistan    745   2666
#> 2 Brazil       37737  80488
#> 3 China       212258 213766
table4a %>% 
  pivot_longer(cols = c(`1999`, `2000`),
               names_to = "year",
               values_to = "cases")
#> # A tibble: 6 x 3
#>   country     year   cases
#>   <chr>       <chr>  <int>
#> 1 Afghanistan 1999     745
#> 2 Afghanistan 2000    2666
#> 3 Brazil      1999   37737
#> 4 Brazil      2000   80488
#> 5 China       1999  212258
#> 6 China       2000  213766
table4b
#> # A tibble: 3 x 3
#>   country         `1999`     `2000`
#> * <chr>            <int>      <int>
#> 1 Afghanistan   19987071   20595360
#> 2 Brazil       172006362  174504898
#> 3 China       1272915272 1280428583
table4b %>% 
  pivot_longer(cols = c(`1999`, `2000`),
               names_to = "year",
               values_to = "population")
#> # A tibble: 6 x 3
#>   country     year  population
#>   <chr>       <chr>      <int>
#> 1 Afghanistan 1999    19987071
#> 2 Afghanistan 2000    20595360
#> 3 Brazil      1999   172006362
#> 4 Brazil      2000   174504898
#> 5 China       1999  1272915272
#> 6 China       2000  1280428583

6.4.1.2 pivot_wider()

One observation is scattered across multiple rows.

Look table2: an observation is a country in a year, but each observation is spread across two rows.

table2
#> # A tibble: 12 x 4
#>   country      year type           count
#>   <chr>       <int> <chr>          <int>
#> 1 Afghanistan  1999 cases            745
#> 2 Afghanistan  1999 population  19987071
#> 3 Afghanistan  2000 cases           2666
#> 4 Afghanistan  2000 population  20595360
#> 5 Brazil       1999 cases          37737
#> 6 Brazil       1999 population 172006362
#> # … with 6 more rows
?pivot_wider
table2 %>% 
  pivot_wider(names_from = type, 
              values_from = count)
#> # A tibble: 6 x 4
#>   country      year  cases population
#>   <chr>       <int>  <int>      <int>
#> 1 Afghanistan  1999    745   19987071
#> 2 Afghanistan  2000   2666   20595360
#> 3 Brazil       1999  37737  172006362
#> 4 Brazil       2000  80488  174504898
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583

They are complement

6.4.2 Split cells

6.4.2.1 separate()

?separate

table3: one column (rate) that contains two variables (cases and population).

table3
#> # A tibble: 6 x 3
#>   country      year rate             
#> * <chr>       <int> <chr>            
#> 1 Afghanistan  1999 745/19987071     
#> 2 Afghanistan  2000 2666/20595360    
#> 3 Brazil       1999 37737/172006362  
#> 4 Brazil       2000 80488/174504898  
#> 5 China        1999 212258/1272915272
#> 6 China        2000 213766/1280428583

separate() pulls apart one column into multiple columns, by splitting wherever a separator character appears.

table3 %>% 
  separate(col = rate,
           into = c("cases", "population"))
#> # A tibble: 6 x 4
#>   country      year cases  population
#>   <chr>       <int> <chr>  <chr>     
#> 1 Afghanistan  1999 745    19987071  
#> 2 Afghanistan  2000 2666   20595360  
#> 3 Brazil       1999 37737  172006362 
#> 4 Brazil       2000 80488  174504898 
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583
table3 %>% 
  separate(col = rate,
           into = c("cases", "population"),
           sep = '/')
#> # A tibble: 6 x 4
#>   country      year cases  population
#>   <chr>       <int> <chr>  <chr>     
#> 1 Afghanistan  1999 745    19987071  
#> 2 Afghanistan  2000 2666   20595360  
#> 3 Brazil       1999 37737  172006362 
#> 4 Brazil       2000 80488  174504898 
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583

Convert chr to int

table3 %>% 
  separate(col = rate,
           into = c("cases", "population"),
           sep = '/',
           convert = TRUE)
#> # A tibble: 6 x 4
#>   country      year  cases population
#>   <chr>       <int>  <int>      <int>
#> 1 Afghanistan  1999    745   19987071
#> 2 Afghanistan  2000   2666   20595360
#> 3 Brazil       1999  37737  172006362
#> 4 Brazil       2000  80488  174504898
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583

6.4.2.2 unite()

?unite
table5
#> # A tibble: 6 x 4
#>   country     century year  rate             
#> * <chr>       <chr>   <chr> <chr>            
#> 1 Afghanistan 19      99    745/19987071     
#> 2 Afghanistan 20      00    2666/20595360    
#> 3 Brazil      19      99    37737/172006362  
#> 4 Brazil      20      00    80488/174504898  
#> 5 China       19      99    212258/1272915272
#> 6 China       20      00    213766/1280428583
table5 %>% 
  unite(new, century, year)
#> # A tibble: 6 x 3
#>   country     new   rate             
#>   <chr>       <chr> <chr>            
#> 1 Afghanistan 19_99 745/19987071     
#> 2 Afghanistan 20_00 2666/20595360    
#> 3 Brazil      19_99 37737/172006362  
#> 4 Brazil      20_00 80488/174504898  
#> 5 China       19_99 212258/1272915272
#> 6 China       20_00 213766/1280428583

Here we don’t want separator "_" so we use "":

table5 %>% 
  unite(new, century, year, sep = "")
#> # A tibble: 6 x 3
#>   country     new   rate             
#>   <chr>       <chr> <chr>            
#> 1 Afghanistan 1999  745/19987071     
#> 2 Afghanistan 2000  2666/20595360    
#> 3 Brazil      1999  37737/172006362  
#> 4 Brazil      2000  80488/174504898  
#> 5 China       1999  212258/1272915272
#> 6 China       2000  213766/1280428583

6.4.3 Missing values

A value can be missing in one of two possible ways:

Explicitly, i.e. flagged with NA.
Implicitly, i.e. simply not present in the data.

stocks <- tibble(
  year   = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
  qtr    = c(   1,    2,    3,    4,    2,    3,    4),
  return = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
)

stocks
#> # A tibble: 7 x 3
#>    year   qtr return
#>   <dbl> <dbl>  <dbl>
#> 1  2015     1   1.88
#> 2  2015     2   0.59
#> 3  2015     3   0.35
#> 4  2015     4  NA   
#> 5  2016     2   0.92
#> 6  2016     3   0.17
#> # … with 1 more row

6.4.3.1 drop_na()

stocks %>% drop_na()
#> # A tibble: 6 x 3
#>    year   qtr return
#>   <dbl> <dbl>  <dbl>
#> 1  2015     1   1.88
#> 2  2015     2   0.59
#> 3  2015     3   0.35
#> 4  2016     2   0.92
#> 5  2016     3   0.17
#> 6  2016     4   2.66

6.4.3.2 fill()

?fill
treatment <- tribble(
  ~ person,           ~ treatment, ~response,
  "Derrick Whitmore", 1,           7,
  NA,                 2,           10,
  NA,                 3,           9,
  "Katherine Burke",  1,           4
)

treatment
#> # A tibble: 4 x 3
#>   person           treatment response
#>   <chr>                <dbl>    <dbl>
#> 1 Derrick Whitmore         1        7
#> 2 <NA>                     2       10
#> 3 <NA>                     3        9
#> 4 Katherine Burke          1        4
treatment %>% 
  fill(person)
#> # A tibble: 4 x 3
#>   person           treatment response
#>   <chr>                <dbl>    <dbl>
#> 1 Derrick Whitmore         1        7
#> 2 Derrick Whitmore         2       10
#> 3 Derrick Whitmore         3        9
#> 4 Katherine Burke          1        4

6.4.4 Case study

The who dataset contains tuberculosis (TB) cases broken down by year, country, age, gender, and diagnosis method. The data comes from the 2014 World Health Organization Global Tuberculosis Report.

6.4.4.1 Look at data

who
#> # A tibble: 7,240 x 60
#>   country     iso2  iso3   year new_sp_m014 new_sp_m1524 new_sp_m2534
#>   <chr>       <chr> <chr> <int>       <int>        <int>        <int>
#> 1 Afghanistan AF    AFG    1980          NA           NA           NA
#> 2 Afghanistan AF    AFG    1981          NA           NA           NA
#> 3 Afghanistan AF    AFG    1982          NA           NA           NA
#> 4 Afghanistan AF    AFG    1983          NA           NA           NA
#> 5 Afghanistan AF    AFG    1984          NA           NA           NA
#> 6 Afghanistan AF    AFG    1985          NA           NA           NA
#> # … with 7,234 more rows, and 53 more variables: new_sp_m3544 <int>,
#> #   new_sp_m4554 <int>, new_sp_m5564 <int>, new_sp_m65 <int>,
#> #   new_sp_f014 <int>, new_sp_f1524 <int>, new_sp_f2534 <int>,
#> #   new_sp_f3544 <int>, new_sp_f4554 <int>, new_sp_f5564 <int>,
#> #   new_sp_f65 <int>, new_sn_m014 <int>, new_sn_m1524 <int>,
#> #   new_sn_m2534 <int>, new_sn_m3544 <int>, new_sn_m4554 <int>,
#> #   new_sn_m5564 <int>, new_sn_m65 <int>, new_sn_f014 <int>,
#> #   new_sn_f1524 <int>, new_sn_f2534 <int>, new_sn_f3544 <int>,
#> #   new_sn_f4554 <int>, new_sn_f5564 <int>, new_sn_f65 <int>,
#> #   new_ep_m014 <int>, new_ep_m1524 <int>, new_ep_m2534 <int>,
#> #   new_ep_m3544 <int>, new_ep_m4554 <int>, new_ep_m5564 <int>,
#> #   new_ep_m65 <int>, new_ep_f014 <int>, new_ep_f1524 <int>,
#> #   new_ep_f2534 <int>, new_ep_f3544 <int>, new_ep_f4554 <int>,
#> #   new_ep_f5564 <int>, new_ep_f65 <int>, newrel_m014 <int>,
#> #   newrel_m1524 <int>, newrel_m2534 <int>, newrel_m3544 <int>,
#> #   newrel_m4554 <int>, newrel_m5564 <int>, newrel_m65 <int>,
#> #   newrel_f014 <int>, newrel_f1524 <int>, newrel_f2534 <int>,
#> #   newrel_f3544 <int>, newrel_f4554 <int>, newrel_f5564 <int>,
#> #   newrel_f65 <int>
?who
colnames(who) # look at all column names
#>  [1] "country"      "iso2"         "iso3"         "year"        
#>  [5] "new_sp_m014"  "new_sp_m1524" "new_sp_m2534" "new_sp_m3544"
#>  [9] "new_sp_m4554" "new_sp_m5564" "new_sp_m65"   "new_sp_f014" 
#> [13] "new_sp_f1524" "new_sp_f2534" "new_sp_f3544" "new_sp_f4554"
#> [17] "new_sp_f5564" "new_sp_f65"   "new_sn_m014"  "new_sn_m1524"
#> [21] "new_sn_m2534" "new_sn_m3544" "new_sn_m4554" "new_sn_m5564"
#> [25] "new_sn_m65"   "new_sn_f014"  "new_sn_f1524" "new_sn_f2534"
#> [29] "new_sn_f3544" "new_sn_f4554" "new_sn_f5564" "new_sn_f65"  
#> [33] "new_ep_m014"  "new_ep_m1524" "new_ep_m2534" "new_ep_m3544"
#> [37] "new_ep_m4554" "new_ep_m5564" "new_ep_m65"   "new_ep_f014" 
#> [41] "new_ep_f1524" "new_ep_f2534" "new_ep_f3544" "new_ep_f4554"
#> [45] "new_ep_f5564" "new_ep_f65"   "newrel_m014"  "newrel_m1524"
#> [49] "newrel_m2534" "newrel_m3544" "newrel_m4554" "newrel_m5564"
#> [53] "newrel_m65"   "newrel_f014"  "newrel_f1524" "newrel_f2534"
#> [57] "newrel_f3544" "newrel_f4554" "newrel_f5564" "newrel_f65"

6.4.4.2 Gather together the columns that are not variables

who %>% 
  pivot_longer(cols = new_sp_m014:newrel_f65,
               names_to = "key",
               values_to = "cases")
#> # A tibble: 405,440 x 6
#>   country     iso2  iso3   year key          cases
#>   <chr>       <chr> <chr> <int> <chr>        <int>
#> 1 Afghanistan AF    AFG    1980 new_sp_m014     NA
#> 2 Afghanistan AF    AFG    1980 new_sp_m1524    NA
#> 3 Afghanistan AF    AFG    1980 new_sp_m2534    NA
#> 4 Afghanistan AF    AFG    1980 new_sp_m3544    NA
#> 5 Afghanistan AF    AFG    1980 new_sp_m4554    NA
#> 6 Afghanistan AF    AFG    1980 new_sp_m5564    NA
#> # … with 405,434 more rows

There are a lot of missing values in the current representation, so for now we’ll use values_drop_na just so we can focus on the values that are present.

who1 <- who %>% 
  pivot_longer(cols = new_sp_m014:newrel_f65,
               names_to = "key",
               values_to = "cases",
               values_drop_na = TRUE)

who1
#> # A tibble: 76,046 x 6
#>   country     iso2  iso3   year key          cases
#>   <chr>       <chr> <chr> <int> <chr>        <int>
#> 1 Afghanistan AF    AFG    1997 new_sp_m014      0
#> 2 Afghanistan AF    AFG    1997 new_sp_m1524    10
#> 3 Afghanistan AF    AFG    1997 new_sp_m2534     6
#> 4 Afghanistan AF    AFG    1997 new_sp_m3544     3
#> 5 Afghanistan AF    AFG    1997 new_sp_m4554     5
#> 6 Afghanistan AF    AFG    1997 new_sp_m5564     2
#> # … with 76,040 more rows

6.4.4.3 Formalize key column

who2 <- who1 %>% 
  mutate(key = str_replace(key, "newrel", "new_rel"))
who2
#> # A tibble: 76,046 x 6
#>   country     iso2  iso3   year key          cases
#>   <chr>       <chr> <chr> <int> <chr>        <int>
#> 1 Afghanistan AF    AFG    1997 new_sp_m014      0
#> 2 Afghanistan AF    AFG    1997 new_sp_m1524    10
#> 3 Afghanistan AF    AFG    1997 new_sp_m2534     6
#> 4 Afghanistan AF    AFG    1997 new_sp_m3544     3
#> 5 Afghanistan AF    AFG    1997 new_sp_m4554     5
#> 6 Afghanistan AF    AFG    1997 new_sp_m5564     2
#> # … with 76,040 more rows

6.4.4.4 Separate key column

who3 <- who2 %>% 
  separate(key, 
           c("new", "type", "sexage"), 
           sep = "_")

who3
#> # A tibble: 76,046 x 8
#>   country     iso2  iso3   year new   type  sexage cases
#>   <chr>       <chr> <chr> <int> <chr> <chr> <chr>  <int>
#> 1 Afghanistan AF    AFG    1997 new   sp    m014       0
#> 2 Afghanistan AF    AFG    1997 new   sp    m1524     10
#> 3 Afghanistan AF    AFG    1997 new   sp    m2534      6
#> 4 Afghanistan AF    AFG    1997 new   sp    m3544      3
#> 5 Afghanistan AF    AFG    1997 new   sp    m4554      5
#> 6 Afghanistan AF    AFG    1997 new   sp    m5564      2
#> # … with 76,040 more rows

6.4.4.5 Remove useless columns

who4 <- who3 %>% 
  select(-c(iso2, iso3, new))
who4
#> # A tibble: 76,046 x 5
#>   country      year type  sexage cases
#>   <chr>       <int> <chr> <chr>  <int>
#> 1 Afghanistan  1997 sp    m014       0
#> 2 Afghanistan  1997 sp    m1524     10
#> 3 Afghanistan  1997 sp    m2534      6
#> 4 Afghanistan  1997 sp    m3544      3
#> 5 Afghanistan  1997 sp    m4554      5
#> 6 Afghanistan  1997 sp    m5564      2
#> # … with 76,040 more rows

6.4.4.6 Separate sex and age

who4 %>% 
  separate(sexage, c("sex", "age"), sep = 1)
#> # A tibble: 76,046 x 6
#>   country      year type  sex   age   cases
#>   <chr>       <int> <chr> <chr> <chr> <int>
#> 1 Afghanistan  1997 sp    m     014       0
#> 2 Afghanistan  1997 sp    m     1524     10
#> 3 Afghanistan  1997 sp    m     2534      6
#> 4 Afghanistan  1997 sp    m     3544      3
#> 5 Afghanistan  1997 sp    m     4554      5
#> 6 Afghanistan  1997 sp    m     5564      2
#> # … with 76,040 more rows
# pipeline
who %>% 
  pivot_longer(cols = new_sp_m014:newrel_f65,
               names_to = "key",
               values_to = "cases",
               values_drop_na = TRUE) %>% 
  mutate(key = str_replace(key, "newrel", "new_rel")) %>% 
  separate(key, c("new", "type", "sexage"), sep = "_") %>% 
  select(-c(iso2, iso3, new)) %>% 
  separate(sexage, c("sex", "age"), sep = 1)
#> # A tibble: 76,046 x 6
#>   country      year type  sex   age   cases
#>   <chr>       <int> <chr> <chr> <chr> <int>
#> 1 Afghanistan  1997 sp    m     014       0
#> 2 Afghanistan  1997 sp    m     1524     10
#> 3 Afghanistan  1997 sp    m     2534      6
#> 4 Afghanistan  1997 sp    m     3544      3
#> 5 Afghanistan  1997 sp    m     4554      5
#> 6 Afghanistan  1997 sp    m     5564      2
#> # … with 76,040 more rows