#Chapter 9 Group Manipulation

#Lander's chapter 11 Group Manipulation

###############################
#chapter 11 group manipulation#
###############################
#package setup for the group-manipulation notes
#plyr has to be attached BEFORE dplyr/tidyverse: whichever package is attached
#last wins name clashes, and plyr's summarise(), mutate(), arrange(), ... would
#otherwise mask the dplyr functions of the same name
library(plyr)
library(tidyverse)
library(dplyr)
#notes: pay attention to the order of the arguments
#in aggregate: aggregate(variable ~ group, data, function)
#in ddply: ddply(data, .variables, .fun)
#(the former rm(list = ls()) call was dropped: it silently deletes every object
#in the workspace of whoever runs or sources this script, and it does not
#detach packages or reset options, so it never gave a clean session anyway)
#tapply, lapply, mapply

#apply() must be used on a matrix, i.e. all elements share one type
#MARGIN = 1: operate over the rows; MARGIN = 2: operate over the columns
theMatrix <- matrix(1:9, nrow = 3)
theMatrix
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
apply(theMatrix, MARGIN = 1, FUN = sum)
## [1] 12 15 18
apply(theMatrix, MARGIN = 2, FUN = sum)
## [1]  6 15 24
#rowSums()/colSums() give the same totals as the two apply() calls above
rowSums(theMatrix)
## [1] 12 15 18
colSums(theMatrix)
## [1]  6 15 24
#introduce a missing value: the sum of any row containing NA becomes NA
theMatrix[2, 1] <- NA
apply(theMatrix, MARGIN = 1, FUN = sum)
## [1] 12 NA 18
#arguments after FUN are forwarded to FUN (here: sum(..., na.rm = TRUE))
#TRUE is spelled out because T is an ordinary variable that can be reassigned
apply(theMatrix, MARGIN = 1, FUN = sum, na.rm = TRUE)
## [1] 12 13 18
rowSums(theMatrix)
## [1] 12 NA 18
rowSums(theMatrix, na.rm = TRUE)
## [1] 12 13 18
#example used in my research (can write your own function)
#ties$noofsenders <- apply(ties[ ,11:15], MARGIN = 1,
#                          FUN = function(x) length(x[!is.na(x)]))

#lapply and sapply
#lapply() walks over a list and applies the function to each element;
#the result is again a list with one entry per input element
theList <- list(
  A = matrix(1:9, nrow = 3),
  B = 1:5,
  C = matrix(1:4, nrow = 2),
  D = 2
)
lapply(X = theList, FUN = sum)
## $A
## [1] 45
## 
## $B
## [1] 15
## 
## $C
## [1] 10
## 
## $D
## [1] 2
#sapply() runs the same computation but hands the result back as a vector
sapply(X = theList, FUN = sum)
##  A  B  C  D 
## 45 15 10  2
#a plain vector is treated element by element as well
theNames <- c("Jared", "Deb", "Paul")
lapply(X = theNames, FUN = nchar)
## [[1]]
## [1] 5
## 
## [[2]]
## [1] 3
## 
## [[3]]
## [1] 4
#mapply() steps through several lists in parallel and calls the function on
#the matching elements (A with A, B with B, C with C)
firstList <- list(
  A = matrix(1:16, nrow = 4),
  B = matrix(1:16, nrow = 2),
  C = 1:5
)
secondList <- list(
  A = matrix(1:16, nrow = 4),
  B = matrix(1:16, nrow = 8),
  C = 15:1
)
mapply(FUN = identical, firstList, secondList)
##     A     B     C 
##  TRUE FALSE FALSE
#aggregate
#check the diamonds data
#library() stops with an error when ggplot2 is missing; require() would only
#return FALSE and let the script fail later at data()/head()
library(ggplot2)
data("diamonds")
head(diamonds)
## # A tibble: 6 x 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
#first argument: formula (value ~ grouping variable) describing how the data
#are organized; second: the data; third: the function applied per group
aggregate(price ~ cut, data = diamonds, FUN = mean)
##         cut    price
## 1      Fair 4358.758
## 2      Good 3928.864
## 3 Very Good 3981.760
## 4   Premium 4584.258
## 5     Ideal 3457.542
#aggregate by two grouping variables: list both on the right of ~, joined by +
aggregate(price ~ cut + color, data = diamonds, FUN = mean)
##          cut color    price
## 1       Fair     D 4291.061
## 2       Good     D 3405.382
## 3  Very Good     D 3470.467
## 4    Premium     D 3631.293
## 5      Ideal     D 2629.095
## 6       Fair     E 3682.312
## 7       Good     E 3423.644
## 8  Very Good     E 3214.652
## 9    Premium     E 3538.914
## 10     Ideal     E 2597.550
## 11      Fair     F 3827.003
## 12      Good     F 3495.750
## 13 Very Good     F 3778.820
## 14   Premium     F 4324.890
## 15     Ideal     F 3374.939
## 16      Fair     G 4239.255
## 17      Good     G 4123.482
## 18 Very Good     G 3872.754
## 19   Premium     G 4500.742
## 20     Ideal     G 3720.706
## 21      Fair     H 5135.683
## 22      Good     H 4276.255
## 23 Very Good     H 4535.390
## 24   Premium     H 5216.707
## 25     Ideal     H 3889.335
## 26      Fair     I 4685.446
## 27      Good     I 5078.533
## 28 Very Good     I 5255.880
## 29   Premium     I 5946.181
## 30     Ideal     I 4451.970
## 31      Fair     J 4975.655
## 32      Good     J 4574.173
## 33 Very Good     J 5103.513
## 34   Premium     J 6294.592
## 35     Ideal     J 4918.186
#tidyverse
#column names of diamonds
names(diamonds)
##  [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"   "x"       "y"      
## [10] "z"
#group_by() only marks the rows with their cut group; no summary is computed
#here, so the printed tibble keeps every row of diamonds
pricestats <- group_by(diamonds, cut)
pricestats
## # A tibble: 53,940 x 10
## # Groups:   cut [5]
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # … with 53,930 more rows
#same as above, but grouped by every cut/color combination
pricestats2 <- group_by(diamonds, cut, color)
pricestats2
## # A tibble: 53,940 x 10
## # Groups:   cut, color [35]
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # … with 53,930 more rows
#aggregate two value variables at once: bind them with cbind() on the left of ~
aggregate(cbind(price, carat) ~ cut, data = diamonds, FUN = mean)
##         cut    price     carat
## 1      Fair 4358.758 1.0461366
## 2      Good 3928.864 0.8491847
## 3 Very Good 3981.760 0.8063814
## 4   Premium 4584.258 0.8919549
## 5     Ideal 3457.542 0.7028370
#two value variables combined with two grouping variables
aggregate(cbind(price, carat) ~ cut + color, data = diamonds, FUN = mean)
##          cut color    price     carat
## 1       Fair     D 4291.061 0.9201227
## 2       Good     D 3405.382 0.7445166
## 3  Very Good     D 3470.467 0.6964243
## 4    Premium     D 3631.293 0.7215471
## 5      Ideal     D 2629.095 0.5657657
## 6       Fair     E 3682.312 0.8566071
## 7       Good     E 3423.644 0.7451340
## 8  Very Good     E 3214.652 0.6763167
## 9    Premium     E 3538.914 0.7177450
## 10     Ideal     E 2597.550 0.5784012
## 11      Fair     F 3827.003 0.9047115
## 12      Good     F 3495.750 0.7759296
## 13 Very Good     F 3778.820 0.7409612
## 14   Premium     F 4324.890 0.8270356
## 15     Ideal     F 3374.939 0.6558285
## 16      Fair     G 4239.255 1.0238217
## 17      Good     G 4123.482 0.8508955
## 18 Very Good     G 3872.754 0.7667986
## 19   Premium     G 4500.742 0.8414877
## 20     Ideal     G 3720.706 0.7007146
## 21      Fair     H 5135.683 1.2191749
## 22      Good     H 4276.255 0.9147293
## 23 Very Good     H 4535.390 0.9159485
## 24   Premium     H 5216.707 1.0164492
## 25     Ideal     H 3889.335 0.7995249
## 26      Fair     I 4685.446 1.1980571
## 27      Good     I 5078.533 1.0572222
## 28 Very Good     I 5255.880 1.0469518
## 29   Premium     I 5946.181 1.1449370
## 30     Ideal     I 4451.970 0.9130291
## 31      Fair     J 4975.655 1.3411765
## 32      Good     J 4574.173 1.0995440
## 33 Very Good     J 5103.513 1.1332153
## 34   Premium     J 6294.592 1.2930941
## 35     Ideal     J 4918.186 1.0635937