8 How to tally
As data analysts, we must know how to tally data. To tally one or two variables, we can use the function table() together with the helper functions shown in the previous chapter. Here, we will learn how to do the same job for \(n\) variables (where \(n\ge 1\)) using the dplyr package.
Example 1
# Run this example in a fresh R session rather than calling rm(list = ls()):
# rm() only deletes objects from the global environment (it does not detach
# packages or reset options) and it silently wipes the reader's workspace.
# load packages
library(dplyr)
library(tidyr) # for replace_na()
# Fix the RNG seed so the simulated data can be reproduced.
set.seed(28072017)
# create a fake data set: 20 observations of three categorical variables
# prob_1 / prob_2 are the sampling probabilities for set_1 / set_2
prob_1 <- c(0.2, 0.5, 0.3)
prob_2 <- c(0.3, 0.7)
set_1 <- c("long", "medium", "short")
set_2 <- c("heavy", "light")
set_3 <- c("red", "yellow", "green")
# stringsAsFactors = TRUE is spelled out because data.frame() has kept strings
# as character by default since R 4.0.0; this example was written for factor
# columns (the earlier default), whose levels are the sorted observed values.
fk_data <- data.frame(
  size = sample(set_1, 20, replace = TRUE, prob = prob_1),
  weight = sample(set_2, 20, replace = TRUE, prob = prob_2),
  # no prob argument: the three colors are equally likely
  color = sample(set_3, 20, replace = TRUE),
  stringsAsFactors = TRUE
)
# Count the rows for each value of a single variable (color):
# group the data by color, then let tally() count the rows in each group.
tally_1 <- tally(group_by(fk_data, color))
print.data.frame(tally_1)
## color n
## 1 green 8
## 2 red 7
## 3 yellow 5
# tally two variables
# tally() removes only the last grouping variable, so without ungroup() the
# result would still be grouped by size and carry that grouping into any
# later dplyr verb applied to tally_2.
tally_2 <-
  fk_data %>%
  group_by(size, weight) %>%
  tally() %>%
  ungroup()
print.data.frame(tally_2)
## size weight n
## 1 long heavy 1
## 2 long light 3
## 3 medium heavy 4
## 4 medium light 6
## 5 short heavy 2
## 6 short light 4
# tally three variables
# Only combinations that actually occur in fk_data get a row here.
# tally() removes only the last grouping variable (color), so ungroup() is
# needed to drop the remaining size/weight grouping before further use.
tally_3 <-
  fk_data %>%
  group_by(size, weight, color) %>%
  tally() %>%
  ungroup() %>%
  arrange(size, weight, color)
print.data.frame(tally_3)
## size weight color n
## 1 long heavy yellow 1
## 2 long light green 1
## 3 long light red 1
## 4 long light yellow 1
## 5 medium heavy green 3
## 6 medium heavy yellow 1
## 7 medium light green 2
## 8 medium light red 3
## 9 medium light yellow 1
## 10 short heavy green 1
## 11 short heavy red 1
## 12 short light green 1
## 13 short light red 2
## 14 short light yellow 1
# The following refines tally_3 so that every combination of the observed
# size, weight and color values is shown, including combinations that never
# occur in the data (these get a count of 0).
# Step a: build a data frame holding all the combinations.
# sort(unique()) is used instead of levels() because levels() returns NULL for
# character columns, which is what data.frame() creates by default since
# R 4.0.0; sort(unique()) works for both character and factor columns.
# stringsAsFactors = FALSE stops expand.grid() from converting character
# values to factors, so the key columns keep the same type as in tally_3.
levels_comb_df <-
  expand.grid(size = sort(unique(fk_data$size)),
              weight = sort(unique(fk_data$weight)),
              color = sort(unique(fk_data$color)),
              stringsAsFactors = FALSE)
# Step b: create tally_4 by joining the counts onto the full grid.
# Notice the difference between tally_3 and tally_4: combinations that are
# missing from tally_3 come out of the left join with n = NA.
tally_4 <-
  levels_comb_df %>%
  left_join(tally_3, by = c("size", "weight", "color")) %>%
  # replace NA by 0 (0L keeps n an integer column)
  replace_na(list(n = 0L)) %>%
  arrange(size, weight, color)
print.data.frame(tally_4)
## size weight color n
## 1 long heavy green 0
## 2 long heavy red 0
## 3 long heavy yellow 1
## 4 long light green 1
## 5 long light red 1
## 6 long light yellow 1
## 7 medium heavy green 3
## 8 medium heavy red 0
## 9 medium heavy yellow 1
## 10 medium light green 2
## 11 medium light red 3
## 12 medium light yellow 1
## 13 short heavy green 1
## 14 short heavy red 1
## 15 short heavy yellow 0
## 16 short light green 1
## 17 short light red 2
## 18 short light yellow 1
Example 2 uses the same fake data as Example 1 and shows how to find the number of distinct weight-color combinations within each of the “long”, “medium” and “short” subgroups. This kind of problem comes up from time to time at work. The solution comes from https://stackoverflow.com/questions/43690920/count-subgroups-in-group-by-with-dplyr
Example 2
# Run this example in a fresh R session rather than calling rm(list = ls()):
# rm() only deletes objects from the global environment (it does not detach
# packages or reset options) and it silently wipes the reader's workspace.
# The data used by this example is rebuilt from the same seed below.
# load packages
library(dplyr)
library(tidyr) # for replace_na()
# Seed the RNG with the same value as Example 1.
set.seed(28072017)
# create a fake data
# candidate values for each of the three variables
set_1 <- c("long", "medium", "short")
set_2 <- c("heavy", "light")
set_3 <- c("red", "yellow", "green")
# sampling probabilities for set_1 and set_2 (set_3 is sampled uniformly)
prob_1 <- c(0.2, 0.5, 0.3)
prob_2 <- c(0.3, 0.7)
# The three columns are drawn in the order size, weight, color; that order
# determines which random numbers each column consumes.
fk_data <- data.frame(
  size = sample(set_1, 20, replace = TRUE, prob = prob_1),
  weight = sample(set_2, 20, replace = TRUE, prob = prob_2),
  color = sample(set_3, 20, replace = TRUE)
)
# For each size, count how many distinct (weight, color) pairs occur:
# n_distinct() given two columns counts unique combinations of their values.
tally_5 <- summarise(group_by(fk_data, size), n = n_distinct(weight, color))
print.data.frame(tally_5)
## size n
## 1 long 4
## 2 medium 5
## 3 short 5