2.5 Data Manipulation

# load packages
library(tidyverse)
library(lubridate)


x <- c(1, 4, 23, 4, 45)
n <- c(1, 3, 5)
g <- c("M", "M", "F")
df <- data.frame(n, g)
df
#>   n g
#> 1 1 M
#> 2 3 M
#> 3 5 F
str(df)
#> 'data.frame':    3 obs. of  2 variables:
#>  $ n: num  1 3 5
#>  $ g: chr  "M" "M" "F"

#Similarly
df <- tibble(n, g)
df
#> # A tibble: 3 × 2
#>       n g    
#>   <dbl> <chr>
#> 1     1 M    
#> 2     3 M    
#> 3     5 F
str(df)
#> tibble [3 × 2] (S3: tbl_df/tbl/data.frame)
#>  $ n: num [1:3] 1 3 5
#>  $ g: chr [1:3] "M" "M" "F"

# list form
lst <- list(x, n, g, df)
lst
#> [[1]]
#> [1]  1  4 23  4 45
#> 
#> [[2]]
#> [1] 1 3 5
#> 
#> [[3]]
#> [1] "M" "M" "F"
#> 
#> [[4]]
#> # A tibble: 3 × 2
#>       n g    
#>   <dbl> <chr>
#> 1     1 M    
#> 2     3 M    
#> 3     5 F

# Or
lst2 <- list(num = x, size = n, sex = g, data = df)
lst2
#> $num
#> [1]  1  4 23  4 45
#> 
#> $size
#> [1] 1 3 5
#> 
#> $sex
#> [1] "M" "M" "F"
#> 
#> $data
#> # A tibble: 3 × 2
#>       n g    
#>   <dbl> <chr>
#> 1     1 M    
#> 2     3 M    
#> 3     5 F

# Or
lst3 <- list(x = c(1, 3, 5, 7),
             y = c(2, 2, 2, 4, 5, 5, 5, 6),
             z = c(22, 3, 3, 3, 5, 10))
lst3
#> $x
#> [1] 1 3 5 7
#> 
#> $y
#> [1] 2 2 2 4 5 5 5 6
#> 
#> $z
#> [1] 22  3  3  3  5 10

# find the means of x, y, z.

# can do one at a time
mean(lst3$x)
#> [1] 4
mean(lst3$y)
#> [1] 3.875
mean(lst3$z)
#> [1] 7.666667

# list apply
lapply(lst3, mean)
#> $x
#> [1] 4
#> 
#> $y
#> [1] 3.875
#> 
#> $z
#> [1] 7.666667

# OR
sapply(lst3, mean)
#>        x        y        z 
#> 4.000000 3.875000 7.666667

# Or, tidyverse function map() 
map(lst3, mean)
#> $x
#> [1] 4
#> 
#> $y
#> [1] 3.875
#> 
#> $z
#> [1] 7.666667

# The tidyverse requires a modified map function called map_dbl()
map_dbl(lst3, mean)
#>        x        y        z 
#> 4.000000 3.875000 7.666667


# Binding 
dat01 <- tibble(x = 1:5, y = 5:1)
dat01
#> # A tibble: 5 × 2
#>       x     y
#>   <int> <int>
#> 1     1     5
#> 2     2     4
#> 3     3     3
#> 4     4     2
#> 5     5     1
dat02 <- tibble(x = 10:16, y = x/2)
dat02
#> # A tibble: 7 × 2
#>       x     y
#>   <int> <dbl>
#> 1    10   5  
#> 2    11   5.5
#> 3    12   6  
#> 4    13   6.5
#> 5    14   7  
#> 6    15   7.5
#> 7    16   8
dat03 <- tibble(z = runif(5)) # 5 random numbers from interval (0,1)
dat03
#> # A tibble: 5 × 1
#>        z
#>    <dbl>
#> 1 0.508 
#> 2 0.0889
#> 3 0.207 
#> 4 0.584 
#> 5 0.640

# row binding
bind_rows(dat01, dat02, dat01)
#> # A tibble: 17 × 2
#>        x     y
#>    <int> <dbl>
#>  1     1   5  
#>  2     2   4  
#>  3     3   3  
#>  4     4   2  
#>  5     5   1  
#>  6    10   5  
#>  7    11   5.5
#>  8    12   6  
#>  9    13   6.5
#> 10    14   7  
#> 11    15   7.5
#> 12    16   8  
#> 13     1   5  
#> 14     2   4  
#> 15     3   3  
#> 16     4   2  
#> 17     5   1

# use ".id" argument to create a new column 
# that contains an identifier for the original data.
bind_rows(dat01, dat02, .id = "id")
#> # A tibble: 12 × 3
#>    id        x     y
#>    <chr> <int> <dbl>
#>  1 1         1   5  
#>  2 1         2   4  
#>  3 1         3   3  
#>  4 1         4   2  
#>  5 1         5   1  
#>  6 2        10   5  
#>  7 2        11   5.5
#>  8 2        12   6  
#>  9 2        13   6.5
#> 10 2        14   7  
#> 11 2        15   7.5
#> 12 2        16   8

# with name
bind_rows("dat01" = dat01, "dat02" = dat02, .id = "id")
#> # A tibble: 12 × 3
#>    id        x     y
#>    <chr> <int> <dbl>
#>  1 dat01     1   5  
#>  2 dat01     2   4  
#>  3 dat01     3   3  
#>  4 dat01     4   2  
#>  5 dat01     5   1  
#>  6 dat02    10   5  
#>  7 dat02    11   5.5
#>  8 dat02    12   6  
#>  9 dat02    13   6.5
#> 10 dat02    14   7  
#> 11 dat02    15   7.5
#> 12 dat02    16   8

# bind_rows() also works on lists of data frames
list01 <- list("dat01" = dat01, "dat02" = dat02)
list01
#> $dat01
#> # A tibble: 5 × 2
#>       x     y
#>   <int> <int>
#> 1     1     5
#> 2     2     4
#> 3     3     3
#> 4     4     2
#> 5     5     1
#> 
#> $dat02
#> # A tibble: 7 × 2
#>       x     y
#>   <int> <dbl>
#> 1    10   5  
#> 2    11   5.5
#> 3    12   6  
#> 4    13   6.5
#> 5    14   7  
#> 6    15   7.5
#> 7    16   8
bind_rows(list01)
#> # A tibble: 12 × 2
#>        x     y
#>    <int> <dbl>
#>  1     1   5  
#>  2     2   4  
#>  3     3   3  
#>  4     4   2  
#>  5     5   1  
#>  6    10   5  
#>  7    11   5.5
#>  8    12   6  
#>  9    13   6.5
#> 10    14   7  
#> 11    15   7.5
#> 12    16   8
bind_rows(list01, .id = "source")
#> # A tibble: 12 × 3
#>    source     x     y
#>    <chr>  <int> <dbl>
#>  1 dat01      1   5  
#>  2 dat01      2   4  
#>  3 dat01      3   3  
#>  4 dat01      4   2  
#>  5 dat01      5   1  
#>  6 dat02     10   5  
#>  7 dat02     11   5.5
#>  8 dat02     12   6  
#>  9 dat02     13   6.5
#> 10 dat02     14   7  
#> 11 dat02     15   7.5
#> 12 dat02     16   8

# The extended example below demonstrates how this can be very handy.

# column binding
bind_cols(dat01, dat03)
#> # A tibble: 5 × 3
#>       x     y      z
#>   <int> <int>  <dbl>
#> 1     1     5 0.508 
#> 2     2     4 0.0889
#> 3     3     3 0.207 
#> 4     4     2 0.584 
#> 5     5     1 0.640


# Regular expressions -----------------------------------------------------
names <- c("Ford, MS", "Jones, PhD", "Martin, Phd", "Huck, MA, MLS")

# pattern: first comma and everything after it
str_remove(names, pattern = ", [[:print:]]+")
#> [1] "Ford"   "Jones"  "Martin" "Huck"

# [[:print:]]+ = one or more printable characters


# Reshaping ---------------------------------------------------------------

# Example of a wide data frame. Notice each person has multiple test scores
# that span columns.
wide <- data.frame(name=c("Clay","Garrett","Addison"), 
                   test1=c(78, 93, 90), 
                   test2=c(87, 91, 97),
                   test3=c(88, 99, 91))
wide
#>      name test1 test2 test3
#> 1    Clay    78    87    88
#> 2 Garrett    93    91    99
#> 3 Addison    90    97    91

# Example of a long data frame. This is the same data as above, but in long
# format. We have one row per person per test.
long <- data.frame(name=rep(c("Clay","Garrett","Addison"),each=3),
                   test=rep(1:3, 3),
                   score=c(78, 87, 88, 93, 91, 99, 90, 97, 91))
long
#>      name test score
#> 1    Clay    1    78
#> 2    Clay    2    87
#> 3    Clay    3    88
#> 4 Garrett    1    93
#> 5 Garrett    2    91
#> 6 Garrett    3    99
#> 7 Addison    1    90
#> 8 Addison    2    97
#> 9 Addison    3    91

# mean score per student
aggregate(score ~ name, data = long, mean)
#>      name    score
#> 1 Addison 92.66667
#> 2    Clay 84.33333
#> 3 Garrett 94.33333
aggregate(score ~ test, data = long, mean)
#>   test    score
#> 1    1 87.00000
#> 2    2 91.66667
#> 3    3 92.66667

# line plot of scores over test, grouped by name
ggplot(long, aes(x = factor(test), y = score, color = name, group = name)) +
  geom_point() +
  geom_line() +
  xlab("Test")



#### reshape wide to long
pivot_longer(wide, test1:test3, names_to = "test", values_to = "score")
#> # A tibble: 9 × 3
#>   name    test  score
#>   <chr>   <chr> <dbl>
#> 1 Clay    test1    78
#> 2 Clay    test2    87
#> 3 Clay    test3    88
#> 4 Garrett test1    93
#> 5 Garrett test2    91
#> 6 Garrett test3    99
#> 7 Addison test1    90
#> 8 Addison test2    97
#> 9 Addison test3    91

# Or
pivot_longer(wide, -name, names_to = "test", values_to = "score")
#> # A tibble: 9 × 3
#>   name    test  score
#>   <chr>   <chr> <dbl>
#> 1 Clay    test1    78
#> 2 Clay    test2    87
#> 3 Clay    test3    88
#> 4 Garrett test1    93
#> 5 Garrett test2    91
#> 6 Garrett test3    99
#> 7 Addison test1    90
#> 8 Addison test2    97
#> 9 Addison test3    91

# drop "test" from the test column with names_prefix argument
pivot_longer(wide, -name, names_to = "test", values_to = "score", 
             names_prefix = "test")
#> # A tibble: 9 × 3
#>   name    test  score
#>   <chr>   <chr> <dbl>
#> 1 Clay    1        78
#> 2 Clay    2        87
#> 3 Clay    3        88
#> 4 Garrett 1        93
#> 5 Garrett 2        91
#> 6 Garrett 3        99
#> 7 Addison 1        90
#> 8 Addison 2        97
#> 9 Addison 3        91

#### reshape long to wide 
pivot_wider(long, name, names_from = test, values_from = score)
#> # A tibble: 3 × 4
#>   name      `1`   `2`   `3`
#>   <chr>   <dbl> <dbl> <dbl>
#> 1 Clay       78    87    88
#> 2 Garrett    93    91    99
#> 3 Addison    90    97    91

# using the names_prefix argument lets us prepend text to the column names.
pivot_wider(long, name, names_from = test, values_from = score,
            names_prefix = "test")
#> # A tibble: 3 × 4
#>   name    test1 test2 test3
#>   <chr>   <dbl> <dbl> <dbl>
#> 1 Clay       78    87    88
#> 2 Garrett    93    91    99
#> 3 Addison    90    97    91

The verbs of data manipulation

  • select: selecting (or not selecting) columns based on their names (eg: select columns Q1 through Q25)
  • slice: selecting (or not selecting) rows based on their position (eg: select rows 1:10)
  • mutate: add or derive new columns (or variables) based on existing columns (eg: create a new column that expresses measurement in cm based on existing measure in inches)
  • rename: rename variables or change column names (eg: change “GraduationRate100” to “grad100”)
  • filter: selecting rows based on a condition (eg: all rows where gender = Male)
  • arrange: ordering rows based on variable(s) numeric or alphabetical order (eg: sort in descending order of Income)
  • sample: take random samples of data (eg: sample 80% of data to create a “training” set)
  • summarize: condense or aggregate multiple values into single summary values (eg: calculate median income by age group)
  • group_by: convert a tbl into a grouped tbl so that operations are performed “by group”; allows us to summarize data or apply verbs to data by groups (eg, by gender or treatment)
  • the pipe: %>%
    • Use Ctrl + Shift + M (Win) or Cmd + Shift + M (Mac) to enter in RStudio

    • The pipe takes the output of a function and “pipes” into the first argument of the next function.

    • new pipe is |> It should be identical to the old one, except for certain special cases.

  • := (Walrus operator): similar to = , but for cases where you want to use the glue package (i.e., dynamic changes in the variable name in the left-hand side)

Writing function in R

Tunneling

{{ (called curly-curly) allows you to tunnel data-variables through arg-variables (i.e., function arguments)

library(tidyverse)

get_mean <- function(data, group_var, var_to_mean){
    data %>% 
        group_by({{group_var}}) %>% 
        summarize(mean = mean({{var_to_mean}}))
}

data("mtcars")
head(mtcars)
#>                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
#> Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
#> Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
#> Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
#> Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
#> Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
#> Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

mtcars %>% 
    get_mean(group_var = cyl, var_to_mean = mpg)
#> # A tibble: 3 × 2
#>     cyl  mean
#>   <dbl> <dbl>
#> 1     4  26.7
#> 2     6  19.7
#> 3     8  15.1

# to change the resulting variable name dynamically, 
# you can use the glue interpolation (i.e., `{{`) and Walrus operator (`:=`)
get_mean <- function(data, group_var, var_to_mean, prefix = "mean_of"){
    data %>% 
        group_by({{group_var}}) %>% 
        summarize("{prefix}_{{var_to_mean}}" := mean({{var_to_mean}}))
}

mtcars %>% 
    get_mean(group_var = cyl, var_to_mean = mpg)
#> # A tibble: 3 × 2
#>     cyl mean_of_mpg
#>   <dbl>       <dbl>
#> 1     4        26.7
#> 2     6        19.7
#> 3     8        15.1