# 8 How to tally

As data analysts, we must know how to tally data. For tallying one or two variables, we can use the function table() plus other helper functions, as shown in the previous chapter. Here, we will learn how to do the job for $n$ variables (where $n \ge 1$) using the dplyr package.

Example 1

# Run this example in a fresh R session (in RStudio: Session > Restart R).
# rm(list = ls()) is deliberately not called here: it deletes the reader's
# own objects, yet it neither detaches packages nor resets options, so it
# does not actually provide a clean slate.

library(dplyr)
library(tidyr) # for replace_na()

# Fix the RNG seed so that the simulated data set is reproducible
set.seed(28072017)

# Simulate a small data set: 20 observations of three categorical variables.
# The three sample() calls must stay in this order (size, weight, color)
# because each one consumes random numbers from the same stream.
set_1 <- c("long", "medium", "short")
set_2 <- c("heavy", "light")
set_3 <- c("red", "yellow", "green")
prob_1 <- c(0.2, 0.5, 0.3) # sampling probabilities for set_1
prob_2 <- c(0.3, 0.7) # sampling probabilities for set_2
fk_data <- data.frame(
  size = sample(x = set_1, size = 20, replace = TRUE, prob = prob_1),
  weight = sample(x = set_2, size = 20, replace = TRUE, prob = prob_2),
  color = sample(x = set_3, size = 20, replace = TRUE) # equal probabilities
)

# One variable: number of rows for each value of color
tally_1 <- fk_data %>%
  group_by(color) %>%
  tally()
# print.data.frame() shows the result in plain data-frame layout
print.data.frame(tally_1)
##    color n
## 1  green 8
## 2    red 7
## 3 yellow 5
# Two variables: number of rows for each (size, weight) pair
tally_2 <- fk_data %>%
  group_by(size, weight) %>%
  tally()
print.data.frame(tally_2)
##     size weight n
## 1   long  heavy 1
## 2   long  light 3
## 3 medium  heavy 4
## 4 medium  light 6
## 5  short  heavy 2
## 6  short  light 4
# Three variables: number of rows for each (size, weight, color) triple.
# Only combinations that actually occur in fk_data get a row.
tally_3 <- fk_data %>%
  group_by(size, weight, color) %>%
  tally() %>%
  arrange(size, weight, color)
print.data.frame(tally_3)
##      size weight  color n
## 1    long  heavy yellow 1
## 2    long  light  green 1
## 3    long  light    red 1
## 4    long  light yellow 1
## 5  medium  heavy  green 3
## 6  medium  heavy yellow 1
## 7  medium  light  green 2
## 8  medium  light    red 3
## 9  medium  light yellow 1
## 10  short  heavy  green 1
## 11  short  heavy    red 1
## 12  short  light  green 1
## 13  short  light    red 2
## 14  short  light yellow 1
# Refine tally_3 so that every combination of size, weight and color gets a
# row, including the combinations that never occur in fk_data (n = 0).

# Values a column can take: the declared levels of a factor, or the sorted
# distinct values of a character column. Since R 4.0.0, data.frame() no
# longer converts strings to factors, so levels() alone returns NULL for
# such columns and the grid would not contain the expected combinations.
level_values <- function(x) {
  if (is.factor(x)) levels(x) else sort(unique(x))
}

# Step a: build a data frame that holds all the level combinations
levels_comb_df <- expand.grid(
  size = level_values(fk_data$size),
  weight = level_values(fk_data$weight),
  color = level_values(fk_data$color),
  stringsAsFactors = FALSE
)

# Step b: create tally_4 by joining the observed counts onto the full grid.
# Notice the difference between tally_3 and tally_4: combinations that are
# missing from tally_3 appear in tally_4 with a count of 0.
tally_4 <- levels_comb_df %>%
  left_join(tally_3, by = c("size", "weight", "color")) %>%
  # grid rows without a match in tally_3 have n = NA; turn those into 0
  replace_na(list(n = 0L)) %>%
  arrange(size, weight, color)
print.data.frame(tally_4)
##      size weight  color n
## 1    long  heavy  green 0
## 2    long  heavy    red 0
## 3    long  heavy yellow 1
## 4    long  light  green 1
## 5    long  light    red 1
## 6    long  light yellow 1
## 7  medium  heavy  green 3
## 8  medium  heavy    red 0
## 9  medium  heavy yellow 1
## 10 medium  light  green 2
## 11 medium  light    red 3
## 12 medium  light yellow 1
## 13  short  heavy  green 1
## 14  short  heavy    red 1
## 15  short  heavy yellow 0
## 16  short  light  green 1
## 17  short  light    red 2
## 18  short  light yellow 1

Using the same fake data as in Example 1, Example 2 shows how to find the number of distinct weight-color combinations within each of the “long”, “medium” and “short” subgroups. This kind of problem comes up from time to time at work. The solution comes from https://stackoverflow.com/questions/43690920/count-subgroups-in-group-by-with-dplyr

Example 2

# Run this example in a fresh R session (in RStudio: Session > Restart R).
# rm(list = ls()) is deliberately not called here: it deletes the reader's
# own objects, yet it neither detaches packages nor resets options, so it
# does not actually provide a clean slate.

library(dplyr)
library(tidyr) # for replace_na()

# Same seed as in Example 1, so the simulated data set is the same
set.seed(28072017)

# Simulate 20 observations of three categorical variables.
# Keep the sample() calls in this order (size, weight, color): they draw
# from the same random-number stream.
set_1 <- c("long", "medium", "short")
set_2 <- c("heavy", "light")
set_3 <- c("red", "yellow", "green")
prob_1 <- c(0.2, 0.5, 0.3) # sampling probabilities for set_1
prob_2 <- c(0.3, 0.7) # sampling probabilities for set_2
fk_data <- data.frame(
  size = sample(x = set_1, size = 20, replace = TRUE, prob = prob_1),
  weight = sample(x = set_2, size = 20, replace = TRUE, prob = prob_2),
  color = sample(x = set_3, size = 20, replace = TRUE) # equal probabilities
)

# For each size, count how many distinct (weight, color) pairs occur
tally_5 <- fk_data %>%
  group_by(size) %>%
  summarise(n = n_distinct(weight, color))
print.data.frame(tally_5)
##     size n
## 1   long 4
## 2 medium 5
## 3  short 5