10 How to plot data
How to plot data? This is a big question, and here I can only give a brief answer, which is the following two-step procedure. Step 1: Get the data ready. Step 2: Use the ggplot2 package (or another package, e.g. the treemap package, for some specific plots). In the following 13 sections, I will use examples to illustrate the two-step procedure.
10.1 Creating basic bar charts
Essentially, a basic bar chart is a plot of a categorical variable on x-axis and a numerical variable on y-axis.
Example 1: a basic bar chart.
rm(list = ls())
# load packages
library(ggplot2)
# prepare a dataframe for plotting
## fix the seed so that the sampled basket (and hence the chart) is reproducible
set.seed(2017)
fruits <- c("apple", "orange", "banana")
the_fruits <- sample(fruits, 100, replace = TRUE)
## one row per fruit; the counts are stored in the column "Freq"
plotting_df <- as.data.frame.table(table(the_fruits))
# plotting
p <- ggplot(plotting_df, aes(x = the_fruits, weight = Freq)) +
  # NB: use "weight = Freq" instead of "y = Freq"
  geom_bar(width = 0.5, fill = "blue") +
  # NB: use "width" and "fill" to change the default bar width and color
  labs(x = "", y = "Frequency",
       title = "A basic bar chart for a basket of fruits") +
  theme(plot.title = element_text(hjust = 0.5)) +
  # NB: use theme to center the title
  geom_text(aes(x = the_fruits, y = Freq + 1, label = Freq))
  # NB: use "geom_text" to put the numbers just above the bars
  # (y = Freq + 1) to indicate the heights of the bars
print(p)
Example 2: still a basic bar chart but making the bars horizontal and based on percentage
rm(list = ls())
# load packages
library(ggplot2)
library(dplyr)
# prepare a dataframe for plotting
## fix the seed so that the sampled basket (and hence the chart) is reproducible
set.seed(2017)
fruits <- c("apple", "orange", "banana", "pear", "plum",
            "kiwi fruit", "peach", "mango", "lemon")
the_fruits <- sample(fruits, 1000, replace = TRUE)
a_table <- table(the_fruits)
plotting_df <-
  as.data.frame.table(a_table) %>%
  mutate(proportion = Freq / sum(Freq))
## Create a vector to order the fruits in terms of proportion
for_sorting <- plotting_df %>%
  arrange(proportion)
## NB: the column "the_fruits" is a factor here; convert it to character
## because scale_x_discrete() expects "limits" to be a character vector
fruits_order <- as.character(for_sorting$the_fruits)
# plotting
p <- ggplot(plotting_df, aes(x = the_fruits, weight = proportion)) +
  # NB: use "weight = proportion" instead of "y = proportion"
  geom_bar(width = 0.5, fill = "blue") +
  # NB: use "width" and "fill" to change the default bar width and color
  labs(x = "", y = "",
       title = "A 'horizontal' bar chart for a basket of fruits") +
  coord_flip() +
  # NB: use "coord_flip" to flip coordinates
  scale_x_discrete(limits = fruits_order) +
  # NB: use the above to set the order of bars
  scale_y_continuous(limits = c(0, max(plotting_df$proportion) + 0.015)) +
  # NB: use the above to make the plot slightly bigger than the default one
  # so that the percentage labels are not cut off
  geom_text(aes(x = the_fruits, y = proportion + 0.006,
                label = scales::percent(proportion))) +
  # NB: use the above to put the percentage numbers next to the bars
  # to indicate the lengths of the bars
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_blank(),
        axis.ticks = element_blank())
  # NB: use theme to center the title, to remove axis text and ticks
print(p)
10.2 Creating side-by-side and stacked bar charts
Example 3
rm(list = ls())
# load packages
library(ggplot2)
library(dplyr)
#-------------------
# Aim: To plot numbers of all kinds of fruits in "local" and "imported" groups
#-------------------
# prepare a dataframe for plotting
## fix the seed so that the sampled basket (and hence the charts) is reproducible
set.seed(2017)
fruits <- c("apple", "orange", "banana", "pear", "plum",
            "kiwi fruit", "peach", "mango", "lemon")
origin <- c("local", "imported")
a_df <- data.frame(the_fruits = sample(fruits, 1000, replace = TRUE),
                   the_origin = sample(origin, 1000, replace = TRUE))
## count the fruits per (origin, fruit) pair
## NB: ungroup at the end so that later steps do not inherit the grouping
plotting_df <-
  a_df %>%
  group_by(the_origin, the_fruits) %>%
  summarise(Freq = n()) %>%
  ungroup()
## theme settings shared by the two charts below
the_theme <-
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        plot.title = element_text(size = 14),
        legend.text = element_text(size = 9),
        panel.background = element_rect(fill = "grey90"))
# ------------ side-by-side bar chart --------------
## set the order of bars according to local fruits
temp_df_1 <-
  plotting_df %>%
  filter(the_origin == "local") %>%
  arrange(Freq)
## NB: "limits" of a discrete scale should be a character vector
the_order <- as.character(temp_df_1$the_fruits)
## plot a side-by-side bar chart
p1 <-
  plotting_df %>%
  ggplot(aes(x = the_fruits, weight = Freq, fill = the_origin)) +
  geom_bar(position = "dodge", width = 0.75) +
  # NB: use position = "dodge" to put the bars side by side
  coord_flip() +
  scale_x_discrete(limits = the_order) +
  # NB: use the above to plot bars in the certain order
  labs(x = "", y = "Number of fruits in the 'basket'") +
  scale_fill_brewer(breaks = c("local", "imported"), palette = "Set1") +
  # NB: use the above to change the default order and color of legend
  the_theme
print(p1)
# ------------ stacked bar chart --------------
## set the order according to totals
temp_df_2 <-
  a_df %>%
  group_by(the_fruits) %>%
  summarise(the_count = n()) %>%
  arrange(the_count)
the_order_2 <- as.character(temp_df_2$the_fruits)
## plot a stacked bar chart
p2 <-
  plotting_df %>%
  ggplot(aes(x = the_fruits, y = Freq, group = the_origin, fill = the_origin)) +
  # NB: use "y = Freq" instead of "weight = Freq"
  geom_bar(stat = "identity", position = "stack", width = 0.75) +
  coord_flip() +
  scale_x_discrete(limits = the_order_2) +
  # NB: use the above to plot the bars in order
  labs(x = "", y = "Number of fruits in the 'basket'") +
  scale_fill_brewer(breaks = c("local", "imported"), palette = "Set1") +
  # NB: use the above to change the default order and color of legend
  the_theme
print(p2)
10.3 Creating back-to-back bar charts
Example 4
rm(list = ls())
# load packages
library(dplyr)
library(ggplot2)
# create a fake data set
## some preparation: random, strictly positive sampling weights for 10 job types
set.seed(123)
ten_positive_rand_numbers <- abs(rnorm(10)) + 0.1
the_prob <- ten_positive_rand_numbers / sum(ten_positive_rand_numbers)
fk_data <- data.frame(job_type = sample(LETTERS[1:10], 1000,
                                        replace = TRUE, prob = the_prob),
                      gender = sample(c("Male", "Female"), 1000,
                                      replace = TRUE))
# prepare data for plotting
plotting_df <-
  fk_data %>%
  group_by(job_type, gender) %>%
  summarise(Freq = n()) %>%
  # NB: drop the grouping so that later steps do not inherit it
  ungroup() %>%
  # a trick! make the counts for males negative so that
  # their bars are drawn on the opposite side of zero
  mutate(Freq = if_else(gender == "Male", -Freq, Freq))
## find the order (by the counts for females)
temp_df <-
  plotting_df %>%
  filter(gender == "Female") %>%
  arrange(Freq)
## NB: "limits" of a discrete scale should be a character vector
the_order <- as.character(temp_df$job_type)
## tick positions of the count axis; defined once so that
## the breaks and their labels cannot get out of step
the_breaks <- seq(-150, 150, 50)
# plot
p <-
  plotting_df %>%
  ggplot(aes(x = job_type, y = Freq, group = gender, fill = gender)) +
  geom_bar(stat = "identity", width = 0.75) +
  coord_flip() +
  scale_x_discrete(limits = the_order) +
  # another trick! label the (negative) breaks with their absolute values
  scale_y_continuous(breaks = the_breaks,
                     labels = abs(the_breaks)) +
  labs(x = "Job type", y = "Count", title = "Back-to-back bar chart") +
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5),
        panel.background = element_rect(fill = "grey90")) +
  # reverse the order of items in legend
  # guides(fill = guide_legend(reverse = TRUE)) +
  # change the default colors of bars
  scale_fill_manual(values = c("red", "blue"),
                    name = "",
                    breaks = c("Male", "Female"),
                    labels = c("Male", "Female"))
print(p)
Remark: We can use
scale_x_discrete(limits = rev(the_order))
to replace
scale_x_discrete(limits = the_order)
and the resulting chart is also called a pyramid chart. If you google “population pyramid”, you can find more examples of pyramid charts.
10.4 Creating Pareto charts
A Pareto chart basically is a bar chart (with the bars ordered) plus a frequency polygon (i.e. a line chart). It is useful for revealing something like the 80-20 rule—e.g. 80% of the accidents are due to 20% of the possible reasons. See https://en.wikipedia.org/wiki/Pareto_chart for more details. The following example shows how to make a Pareto chart. Please pay attention to how the layers are built up.
Example 5
rm(list = ls())
# load packages
library(dplyr)
library(ggplot2)
# create a fake data set
## fix the seed so that the simulated accidents (and hence the chart) are reproducible
set.seed(2017)
reasons <- c("Reason A", "Reason B", "reason C",
             "Reason D", "Reason E", "reason F")
set_prob <- c(0.1, 0.2, 0.6, 0.05, 0.02, 0.03)
fk_data <-
  data.frame(accident_NO = 1:1000,
             reason = sample(reasons, 1000, replace = TRUE, prob = set_prob))
# prepare the data for plotting
## sort the reasons from the most to the least frequent one, and then
## accumulate the relative frequencies in that order
plotting_df <-
  fk_data %>%
  group_by(reason) %>%
  summarise(freq = n()) %>%
  arrange(desc(freq)) %>%
  mutate(relative_freq = freq / sum(freq),
         cumulative_freq = cumsum(relative_freq))
## get the order of bars
## NB: "limits" of a discrete scale should be a character vector
the_order <- as.character(plotting_df$reason)
# plot
p <-
  plotting_df %>%
  ggplot(aes(x = reason, weight = relative_freq)) +
  # layer 1: the bars
  geom_bar(width = 0.5, fill = "blue") +
  scale_x_discrete(limits = the_order) +
  # NB: the argument is "labels" (spelled out in full)
  scale_y_continuous(labels = scales::percent) +
  # layer 2: the points of the cumulative relative frequencies
  geom_point(aes(x = reason, y = cumulative_freq)) +
  # layer 3: the line connecting the points
  geom_line(aes(x = reason, y = cumulative_freq, group = 1)) +
  # NB: Must use "group = 1"
  labs(x = "", y = "Relative frequency",
       title = "A Pareto diagram for reasons of 1000 accidents") +
  theme(plot.title = element_text(hjust = 0.5))
  # NB: Use theme to center the title
print(p)
10.5 Creating lollipop charts
Notice that a lollipop = a segment + a point; thus it is natural to use geom_segment() and geom_point() to create lollipop charts.
Example 6
rm(list = ls())
library(ggplot2)
library(dplyr)
# simulate 10000 letters, drawn with unequal (randomly generated) probabilities
set.seed(9072017)
rand_numbers <- abs(rnorm(26))
the_prob <- rand_numbers / sum(rand_numbers)
fk_data <- data.frame(
  x = sample(LETTERS, 10000, replace = TRUE, prob = the_prob)
)
# count each letter, turn the counts into proportions,
# and sort the letters from the smallest to the largest proportion
plotting_df <- arrange(
  mutate(
    summarise(group_by(fk_data, x), Freq = n()),
    proportion = Freq / sum(Freq)
  ),
  proportion
)
the_order <- plotting_df$x
# build the chart layer by layer
p <-
  ggplot(plotting_df, aes(x = x, y = proportion)) +
  # the "stick": a segment from 0 up to the proportion
  geom_segment(aes(x = x, xend = x, y = 0, yend = proportion)) +
  # the "candy": a point at the proportion
  geom_point() +
  # show the categories in the sorted order found above
  scale_x_discrete(limits = the_order) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Category", y = "Proportion", title = "A lollipop chart") +
  # center the title
  theme(plot.title = element_text(hjust = 0.5))
print(p)
10.6 Creating treemaps
A treemap can show three variables by using labels, sizes of rectangles, and colors. Below is a treemap of the top 15 of NZ’s most populous cities based on the 2016 data. The original data comes from: https://en.wikipedia.org/wiki/List_of_cities_in_New_Zealand
Example 7
rm(list = ls())
# load packages
library(treemap)
library(readr) # provides read_csv()
# download the data; the first line of the file is skipped
the_url <- "https://raw.githubusercontent.com/LarryZhang2016/Data/master/NZ_cities.csv"
NZ_cities <- read_csv(file = the_url, skip = 1)
# draw the treemap: one rectangle per city (index), sized by the
# population (vSize) and colored by the population density (vColor)
treemap(
  dtf = NZ_cities,
  index = "City_name",
  vSize = "Population",
  vColor = "Population_density",
  type = "value",
  palette = "Spectral",
  algorithm = "pivotSize",
  border.col = c("grey70", "grey90"),
  title = "Treemap of the top 15 NZ's most populous cities",
  title.legend = "Population density (people/km^2)",
  fontsize.title = 18
)
10.7 Creating scatter plots
A scatter plot is very useful for exploring the relationship between two continuous variables. With the following example, we show how to create a scatter plot. We want to emphasize the details, that is,
- label properly
- mark the outliers
- add in the regression line
- refit data and add in the new regression line
Example 8
rm(list = ls())
# load packages
library(readr) # for read_csv
library(ggplot2)
# read data in
the_url <- "https://raw.githubusercontent.com/LarryZhang2016/Data/master/NZ_cities.csv"
NZ_cities <- read_csv(the_url, skip = 1)
# Step 1: the plain scatter plot, properly labelled
p1 <-
  ggplot(NZ_cities, aes(x = Area_in_km2, y = Population)) +
  geom_point() +
  scale_y_continuous(labels = scales::comma) +
  # NB: use the above to mark large numbers
  labs(x = "Area (in km^2)",
       title = paste0("Population vs. area for the \n",
                      "top 15 NZ's most populous cities")) +
  # NB: the "\n" breaks the title into two lines; paste0 lets us
  # write the long title string on two lines of code
  theme(plot.title = element_text(hjust = 0.5))
print(p1)
# Next, we want to label the points for
# Auckland, Wellington, Christchurch, and Hamilton with their names and red
# Also, add the regression line in
# load packages
library(dplyr)
library(ggrepel) # for geom_text_repel()
# add two new columns to NZ_cities
biggest_cities <- c("Auckland", "Wellington", "Christchurch", "Hamilton")
NZ_cities_1 <-
  NZ_cities %>%
  mutate(the_label = if_else(City_name %in% biggest_cities, City_name, ""),
         the_color = if_else(City_name %in% biggest_cities, "red", "black"))
p2 <-
  p1 +
  geom_text_repel(data = NZ_cities_1, aes(label = the_label)) +
  # NB: the colors are given as a plain vector (one color per row),
  # not as an aesthetic mapping
  geom_point(color = NZ_cities_1$the_color) +
  # add in the regression line
  geom_smooth(method = "lm", se = FALSE)
print(p2)
# Finally, refit the data after removing "Auckland", "Wellington", "Christchurch"
# add in the new regression line
NZ_cities_2 <-
  NZ_cities %>%
  filter(!(City_name %in% biggest_cities[1:3])) %>%
  select(City_name, Population, Area_in_km2)
## find the regression equations
## a helper: write the fitted line of a simple linear model as a text label;
## coef() returns the intercept first and the slope second
make_line_eq <- function(fit, line_name) {
  the_coef <- round(coef(fit), 2)
  paste0(line_name, ": ", "Population = ",
         the_coef[1], " + ",
         the_coef[2], " * Area")
}
line_1 <- lm(Population ~ Area_in_km2, NZ_cities)
line_2 <- lm(Population ~ Area_in_km2, NZ_cities_2)
line_1_eq <- make_line_eq(line_1, "Line 1")
line_2_eq <- make_line_eq(line_2, "Line 2")
p3 <-
  p1 +
  # NB: pass the data to the layer and refer to the column by its name,
  # rather than using "$" inside aes()
  geom_text_repel(data = NZ_cities_1, aes(label = the_label)) +
  geom_point(color = NZ_cities_1$the_color) +
  # add in the regression line for all the cities
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  # add in the regression line for the refitted data
  geom_smooth(data = NZ_cities_2, method = "lm", se = FALSE, color = "purple") +
  # show the two equations in the colors of their lines
  annotate(geom = "text", x = 400, y = 1400000,
           label = line_1_eq, color = "blue") +
  annotate(geom = "text", x = 400, y = 1250000,
           label = line_2_eq, color = "purple")
print(p3)
10.8 Creating side-by-side box plots
Roughly speaking, a box plot shows the five-number summary—i.e. minimum, first quartile, second quartile, third quartile, and maximum—of data. Plotting several box plots together, we have the so-called side-by-side box plot, which is useful for comparison of data among groups.
In the following example, we will create a side-by-side box plot for random numbers drawn from the standard normal distribution, the t distribution with five degrees of freedom, the uniform distribution on $(-3, 3)$, and the double exponential distribution with the probability density $f(y)=\frac{1}{2}\lambda e^{-\lambda|y|}$ for $-\infty<y<+\infty$, where $\lambda=\sqrt{2/\pi}$.
For our purpose, we need this
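Technical note: We can show that if $X\sim \mathrm{Exp}(\lambda)$, $U\sim \mathrm{Uniform}(0,1)$, and $X$ and $U$ are independent, then $$Y=\begin{cases}-X, & \text{if } U\le 0.5,\\ X, & \text{if } U>0.5,\end{cases}$$ has a double exponential distribution; that is, the probability density function of $Y$ is $f(y)=\frac{\lambda}{2}e^{-\lambda|y|}$ for $-\infty<y<+\infty$.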
Example 9
rm(list = ls())
# load packages
library(dplyr)
library(tidyr) # for gather()
library(ggplot2)
# simulate 1000 random numbers from each of the four distributions
set.seed(1234567)
fk_data <-
  data.frame(Normal = rnorm(1000),
             t_df_5 = rt(1000, df = 5),
             Unif = runif(1000, -3, 3),
             Exp = rexp(1000, rate = sqrt(2/pi)),
             Unif_temp = runif(1000, 0, 1)) %>%
  # give each exponential number a random sign (-1 when the uniform
  # number is at most 0.5, +1 otherwise) to get the double exponential one
  mutate(the_indi = if_else(Unif_temp <= 0.5, -1, 1),
         Double_exp = Exp * the_indi) %>%
  # keep only the four columns to be plotted
  select(Normal, t_df_5, Unif, Double_exp)
# reshape to a long table: one row per (distribution, random number)
plotting_df <- gather(fk_data, key = distribution, value = rand_number,
                      Normal:Double_exp)
# readable tick labels, keyed by the column names
tick_labels <- c(Double_exp = "Double Exponential",
                 Normal = "Standard Normal",
                 t_df_5 = "t with df=5",
                 Unif = "Uniform on (-3, 3)")
# draw one (horizontal) box per distribution
p <-
  ggplot(plotting_df,
         aes(x = distribution, y = rand_number, group = distribution)) +
  geom_boxplot() +
  coord_flip() +
  # replace the column names on the tick marks by the readable labels
  scale_x_discrete(breaks = names(tick_labels),
                   labels = unname(tick_labels)) +
  labs(x = "Distribution", y = "", title = "Side-by-side box plot") +
  theme(plot.title = element_text(hjust = 0.5))
print(p)
10.9 Creating grid plots
Grid plots allow us to show several (e.g. four) variables in one plot, and certainly they are useful. The key here is to use facet_grid().
Example 10
rm(list = ls())
# load packages
library(dplyr)
library(tidyr)
library(ggplot2)
# create a fake data set
## fix the seed so that the simulated data are reproducible
set.seed(21072017)
## a helper function
# Simulate the records of n students for one year.
# Arguments:
#   year: the year to be stored in every row
#   n:    the number of students (rows)
# Returns a data frame with the columns year, gender, stats_grade
# and math_grade.
create_year_data <- function(year = 2015, n = 20) {
  # NB: the random draws are made in the order gender, stats, maths
  gender <- sample(c("male", "female"), n, replace = TRUE)
  stats_grade <- rnorm(n, mean = 55, sd = 10)
  math_grade <- rnorm(n, mean = 60, sd = 10)
  data.frame(year = rep(year, n),
             gender = gender,
             stats_grade = stats_grade,
             math_grade = math_grade)
}
# simulate three years of data and stack them
data_2016 <- create_year_data(year = 2016, n = 20)
data_2015 <- create_year_data(year = 2015, n = 20)
data_2014 <- create_year_data(year = 2014, n = 20)
fk_data <- bind_rows(data_2016, data_2015, data_2014)
# prepare data for plotting
## average grade per (year, gender) for each of the two subjects
plotting_df <-
  fk_data %>%
  group_by(year, gender) %>%
  summarise(Stats = mean(stats_grade),
            Maths = mean(math_grade)) %>%
  ungroup() %>%
  # make a long table: one row per (year, gender, subject)
  gather(key = subject, value = grade, -year, -gender) %>%
  arrange(year)
# plot
the_title <- paste0("Average maths and stats grades for\n",
                    "female and male students in 2014-2016")
p <-
  plotting_df %>%
  # NB: x is mapped once here and inherited by both layers
  ggplot(aes(x = gender, y = grade, color = gender)) +
  # a lollipop per gender: a segment from 0 to the grade plus a point
  geom_segment(aes(xend = gender, y = 0, yend = grade)) +
  geom_point() +
  coord_flip() +
  scale_x_discrete(limits = c("male", "female")) +
  # one panel per (year, subject): years in rows, subjects in columns
  facet_grid(year ~ subject) +
  labs(x = "", y = "Average Grade",
       title = the_title) +
  theme(plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        panel.background = element_rect(fill = "grey90"))
print(p)
10.10 Creating a simple PCA plot
When we have an n-variate (n≥3) data set, where each column contains continuous type data, we often want to look at the cluster relationship among the m observations (or rows). For this purpose, we can make a PCA (Principal Component Analysis) plot. The fundamental idea here is that we map the n-dimensional data to 2-dimensional (PC1 and PC2) data and then make a scatter plot of the 2-dimensional data.
Example 11
rm(list = ls())
# load packages
library(dplyr, quietly = TRUE)
library(ggplot2, quietly = TRUE)
# NB: We will use iris, which is a data set from R
(head(iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# do the mapping
## step 1: find the covariance (of the four numeric columns)
the_cov <- cov(iris[, 1:4])
## step 2: find the eigen values and vectors
the_eigen <- eigen(the_cov)
## Remark: steps 1 and 2 together are equivalent to
## PC <- prcomp(iris[, 1:4])
## step 3: mapping, i.e. project the data onto the first two eigenvectors
## NB: %*% and %>% have the same precedence and are evaluated from left
## to right; the parentheses just make that order explicit
map_2_PC1_PC2 <-
  (as.matrix(iris[, 1:4]) %*% the_eigen$vectors[, 1:2]) %>%
  as.data.frame()
# check how much variance is expressed by PC1 and PC2
(the_proportions <- the_eigen$values[1:2] / sum(the_eigen$values))
## [1] 0.92461872 0.05306648
# prepare dataframe for plotting
temp_df <- data.frame(Species = iris[, 5])
plotting_df <-
  bind_cols(map_2_PC1_PC2, temp_df) %>%
  # the columns of an unnamed matrix are called V1 and V2 by as.data.frame()
  rename(PC1 = V1, PC2 = V2)
# plotting
ggplot(plotting_df, aes(x = PC1, y = PC2, color = Species)) +
  geom_point()
10.11 Creating time series plots
It is easy to create a time series plot. Here we pay attention to some “small” things.
- Make sure the time is labelled correctly on x-axis.
- If the values are for money and large, then we should show the $ sign and use “,” in the numbers for labeling y-axis.
- We often want to get the points connected to show that the points are related.
Example 12
rm(list = ls())
# load packages
library(ggplot2)
library(lubridate) # for dealing with data related to time
# create a fake data set
for_year <- 1997:2016
## every observation is taken on 31 December of its year
col_1 <- paste0(for_year, "-12-31")
## simulated data from a random walk
set.seed(20170805)
epsilon <- rnorm(length(for_year), mean = 0, sd = 10000)
x0 <- 5*1e5
## the value at the end of year i is x0 + epsilon[1] + ... + epsilon[i];
## cumsum() computes all the partial sums at once, no loop is needed
col_2 <- x0 + cumsum(epsilon)
fk_data <-
  data.frame(EndOfYear = ymd(col_1),
             Value_in_dollar = round(col_2, 0))
# plot data
p <-
  ggplot(fk_data, aes(x = year(EndOfYear), y = Value_in_dollar)) +
  # Note the use of function year()
  geom_point() +
  # connect the points to show that they are related
  geom_line() +
  # show the $ sign and use "," in the large numbers
  scale_y_continuous(labels = scales::dollar) +
  labs(x = "Year", y = "Market value", title = "Plot of a time series") +
  theme(plot.title = element_text(hjust = 0.5))
print(p)
10.12 Showing pop-up’s
For exploratory data analysis, we may want our plot to have the following feature: when we hover the mouse over the plot, some information pops up. In the following example, I will show how to do it with plotly::ggplotly() (thanks to Chris Hansen for pointing this function out to me). Of course, there are other useful R packages available for showing pop-ups, such as googleVis and highcharter; interested readers can explore them.
Example 13
rm(list = ls())
# load packages
library(readr) # for read_csv
library(ggplot2)
library(plotly)
# read data in (the first line of the file is skipped)
the_url <- "https://raw.githubusercontent.com/LarryZhang2016/Data/master/NZ_cities.csv"
NZ_cities <- read_csv(file = the_url, skip = 1)
# first build an ordinary (static) ggplot
# NOTE(review): City_name is presumably mapped to "fill" so that the city
# name is shown in the pop-up; confirm against the rendered widget
p1 <-
  ggplot(data = NZ_cities,
         mapping = aes(x = Area_in_km2, y = Population, fill = City_name)) +
  geom_point() +
  # mark large numbers with commas
  scale_y_continuous(labels = scales::comma) +
  # paste0 lets us write the long title string on two lines of code
  labs(x = "Area (in km^2)", y = "PopSize",
       title = paste0("Population vs. area for the ",
                      "top 15 NZ's most populous cities")) +
  # no legend; centered title
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5))
# then turn it into an interactive plot, without the mode bar
config(ggplotly(p1), displayModeBar = FALSE)
10.13 Putting plots in one panel
We create a few plots and want to put them together. It is handy to do so with gridExtra::grid.arrange(). (I thank Peter Ellis for pointing me to this function.)
Example 14
rm(list = ls())
# load packages
library(gridExtra)
library(ggplot2)
# a function for plotting probability density functions
# Plot a probability density function as a red curve.
# Arguments:
#   func_name:   the density function to be drawn (e.g. dnorm)
#   para:        a list of arguments passed on to func_name
#   domain:      a data frame whose column x holds the two ends of
#                the interval over which the curve is drawn
#   title_lable: the title of the plot
# Returns the ggplot object.
plot_density <- function(func_name = dnorm, para = list(mean = 0, sd = 1),
                         domain = data.frame(x = c(-3, 3)),
                         title_lable = "PDF of N(0, 1)") {
  ggplot(domain, aes(x)) +
    # draw the curve of the density function
    stat_function(fun = func_name, args = para, color = "red") +
    labs(x = "x", y = "f(x)", title = title_lable) +
    # center the title
    theme(plot.title = element_text(hjust = 0.5))
}
# create the four density plots
## standard normal: all the defaults of plot_density()
p1 <- plot_density()
## t distribution with 30 degrees of freedom, on the default domain
p2 <- plot_density(
  func_name = dt,
  para = list(df = 30),
  title_lable = "PDF of t distribution with df=30"
)
## exponential distribution with rate 1, drawn over (0, 10)
p3 <- plot_density(
  func_name = dexp,
  para = list(rate = 1),
  domain = data.frame(x = c(0, 10)),
  title_lable = "PDF of Exp(1) distribution"
)
## chi-squared distribution with 5 degrees of freedom, drawn over (0, 10)
p4 <- plot_density(
  func_name = dchisq,
  para = list(df = 5),
  domain = data.frame(x = c(0, 10)),
  title_lable = "PDF of Chisq distribution with df=5"
)
# arrange the four plots in a 2 x 2 layout, filled row by row
grid.arrange(p1, p2, p3, p4,
             layout_matrix = matrix(1:4, nrow = 2, ncol = 2, byrow = TRUE),
             newpage = TRUE)