Part 10 Week 5

Data Visualization in R with ggplot2 > Week 2

10.1 Bar Plots Part 1, 2

library(tidyverse)

# Read the remote CSV into `cel`; url() opens the address as a connection
# that read_csv() then consumes.
cel <-
  "https://www.dropbox.com/s/4ebgnkdhhxo5rac/cel_volden_wiseman%20_coursera.csv?raw=1" %>%
  url() %>%
  read_csv()
#> 
#> ── Column specification ─────────────────────────────────────────────
#> cols(
#>   .default = col_double(),
#>   thomas_name = col_character(),
#>   st_name = col_character()
#> )
#> ℹ Use `spec()` for the full column specifications.
# Bar plot of the `dem` variable in the 115th Congress
# (0 = Republican, 1 = Democrat).
ggplot(
  data = filter(cel, congress == 115),
  mapping = aes(x = dem)
) +
  geom_bar()

# Prove to yourself the bar plot is right by comparing it with a frequency
# table of `dem` for the same Congress.
cel %>%
  filter(congress == 115) %>%
  {
    table(.$dem)
  }
#> 
#>   0   1 
#> 245 203
# Use `st_name` instead, to show counts of how many members of Congress
# come from each state.
ggplot(
  data = filter(cel, congress == 115),
  mapping = aes(x = st_name)
) +
  geom_bar()

# Flip the figure by setting the y aesthetic rather than the x.
ggplot(
  data = filter(cel, congress == 115),
  mapping = aes(y = st_name)
) +
  geom_bar()

# Go back and recode the 0/1 `dem` variable into a categorical party label.
party <- recode(cel$dem, `1` = "Democrat", `0` = "Republican")

# Assign the column directly rather than with add_column(): add_column()
# refuses to add a column that already exists, so re-running this step
# would stop with an error. Direct assignment is safe to repeat.
cel$party <- party

# Bar plot of the recoded party variable for the 115th Congress.
cel %>%
  filter(congress == 115) %>%
  ggplot(aes(x = party)) +
  geom_bar()

# Now add some visual touches.

# Add axis labels.
ggplot(
  data = filter(cel, congress == 115),
  mapping = aes(x = party)
) +
  geom_bar() +
  labs(x = "Party", y = "Number of Members")

# Add colors for the two different bars by mapping fill to party.
ggplot(
  data = filter(cel, congress == 115),
  mapping = aes(x = party, fill = party)
) +
  geom_bar() +
  labs(x = "Party", y = "Number of Members")

# Manually change the colors of the bars.
cel %>%
  filter(congress == 115) %>%
  ggplot(aes(x = party, fill = party)) +
  geom_bar() +
  labs(x = "Party", y = "Number of Members") +
  # Name each color so it is matched to its party by label instead of by
  # the order in which the levels happen to sort.
  scale_fill_manual(values = c(Democrat = "blue", Republican = "red"))

# Drop the legend with the "guides" command.
cel %>%
  filter(congress == 115) %>%
  ggplot(aes(x = party, fill = party)) +
  geom_bar() +
  labs(x = "Party", y = "Number of Members") +
  # Name each color so it is matched to its party by label instead of by
  # the order in which the levels happen to sort.
  scale_fill_manual(values = c(Democrat = "blue", Republican = "red")) +
  # "none" is the supported way to suppress a legend; passing FALSE is
  # deprecated in current ggplot2 and raises a warning.
  guides(fill = "none")

#####Making more barplots and manipulating more data in R

####Making a barplot of proportions

# A toy demonstration: a bowl of fruit.
apple <- rep("apple", times = 6)
orange <- rep("orange", times = 3)
banana <- rep("banana", times = 1)

# Put the fruits together in a data frame with a single column, `fruits`.
fruit_bowl <- tibble(fruits = c(apple, orange, banana))

# Let's calculate proportions instead.

# Create a table that counts each fruit in a second column, `count`.
fruit_bowl_summary <- summarize(group_by(fruit_bowl, fruits), count = n())

fruit_bowl_summary
#> # A tibble: 3 x 2
#>   fruits count
#>   <chr>  <int>
#> 1 apple      6
#> 2 banana     1
#> 3 orange     3
# Calculate proportions: each count divided by the sum of all counts.
fruit_bowl_summary[["proportion"]] <-
  prop.table(fruit_bowl_summary[["count"]])

fruit_bowl_summary
#> # A tibble: 3 x 3
#>   fruits count proportion
#>   <chr>  <int>      <dbl>
#> 1 apple      6        0.6
#> 2 banana     1        0.1
#> 3 orange     3        0.3
# Add geom_bar(), using stat = "identity" so the bars show the exact
# `proportion` values instead of row counts.
ggplot(
  data = fruit_bowl_summary,
  mapping = aes(x = fruits, y = proportion)
) +
  geom_bar(stat = "identity")

# Same proportions plot with one color per fruit, no legend, and axis labels.
ggplot(fruit_bowl_summary, aes(x = fruits, y = proportion, fill = fruits)) +
  geom_bar(stat = "identity") +
  # Name each color so it is matched to its fruit by label instead of by
  # the order in which the levels happen to sort.
  scale_fill_manual(
    values = c(apple = "red", banana = "yellow", orange = "orange")
  ) +
  # "none" is the supported way to suppress a legend; passing FALSE is
  # deprecated in current ggplot2 and raises a warning.
  guides(fill = "none") +
  labs(x = "Fruits", y = "Proportion of Fruits")

# More practice with barplots!

# Read the remote CSV sample into `cces`; url() opens the address as a
# connection that read_csv() then consumes.
cces <-
  "https://www.dropbox.com/s/ahmt12y39unicd2/cces_sample_coursera.csv?raw=1" %>%
  url() %>%
  read_csv()
#> 
#> ── Column specification ─────────────────────────────────────────────
#> cols(
#>   .default = col_double()
#> )
#> ℹ Use `spec()` for the full column specifications.
# Create counts of Ds, Rs, and Is by region.

# Collapse the `pid7` codes 1-7 into three party labels:
# 1-3 Democrat, 4 Independent, 5-7 Republican.
dem_rep <- cces$pid7 %>%
  recode(
    `1` = "Democrat", `2` = "Democrat", `3` = "Democrat",
    `4` = "Independent",
    `5` = "Republican", `6` = "Republican", `7` = "Republican"
  )

table(dem_rep)
#> dem_rep
#>    Democrat Independent  Republican 
#>         516         119         365
# Attach the recoded labels as a `dem_rep` column. Direct assignment is used
# rather than add_column() because add_column() refuses to add a column that
# already exists, so re-running this step would stop with an error.
cces$dem_rep <- dem_rep

# Stacked bars: one bar per region, filled by `dem_rep`.
ggplot(data = cces, mapping = aes(x = region, fill = dem_rep)) +
  geom_bar()

# Grouped bars: position = "dodge" puts the fills side by side.
ggplot(data = cces, mapping = aes(x = region, fill = dem_rep)) +
  geom_bar(position = "dodge")

# Visual touches like relabeling the axes.
ggplot(data = cces, mapping = aes(x = region, fill = dem_rep)) +
  geom_bar(position = "dodge") +
  labs(x = "Region", y = "Count")

10.2 Line Plots Part 1

library(tidyverse)

# Create a sequence of years, 2001 through 2020.
years <- seq(2001, 2020, by = 1)

# Create "fake" data for price, one random draw per year
# (note, your values will be different).
price <- rnorm(length(years), mean = 15, sd = 5)

# Put years and price together in one tibble.
fig_data <- tibble("year" = years, "stock_price" = price)

# Map the columns of `fig_data` (`year`, `stock_price`) rather than the
# free-standing `years` / `price` vectors, so the plot is driven by the data
# frame it is given and the axes carry the column names.
ggplot(fig_data, aes(x = year, y = stock_price)) +
  geom_line()

# Make data for the first of two stocks: tag every row with its stock id.
# Uses `<-` for assignment and sizes the label vector from the data instead
# of a hard-coded row count.
fig_data$stock_id <- rep("Stock_1", nrow(fig_data))

stock_1_time_series <- fig_data

# Create data for the second company, using the same approach as for the
# first one.
stock_id <- rep("Stock_2", times = 20)
years <- seq(2001, 2020, by = 1)
price <- rnorm(n = 20, mean = 10, sd = 3)

stock_2_time_series <- tibble(
  stock_id = stock_id,
  year = years,
  stock_price = price
)

# Combine both stocks with bind_rows().
all_stocks_time_series <- stock_1_time_series %>%
  bind_rows(stock_2_time_series)

# View(all_stocks_time_series)

# Make the plot, setting group to stock_id.
ggplot(
  all_stocks_time_series,
  aes(x = year, y = stock_price, group = stock_id)
) +
  geom_line()

# Modify group, linetype, and color, and add facet_wrap() to split the
# figure by stock_id.
ggplot(
  all_stocks_time_series,
  aes(
    x = year, y = stock_price,
    group = stock_id, linetype = stock_id, color = stock_id
  )
) +
  geom_line() +
  facet_wrap(~stock_id)

# Practice with another data set.

# Read the remote CSV into `cel`; url() opens the address as a connection
# that read_csv() then consumes.
cel <-
  "https://www.dropbox.com/s/4ebgnkdhhxo5rac/cel_volden_wiseman%20_coursera.csv?raw=1" %>%
  url() %>%
  read_csv()
#> 
#> ── Column specification ─────────────────────────────────────────────
#> cols(
#>   .default = col_double(),
#>   thomas_name = col_character(),
#>   st_name = col_character()
#> )
#> ℹ Use `spec()` for the full column specifications.
# Label each row's party from the 0/1 `dem` variable.
cel$Party <- recode(cel$dem, `1` = "Democrat", `0` = "Republican")

# Mean `dwnom1` for every Party-by-year combination, skipping missing
# values. TRUE is spelled out because `T` is an ordinary variable that can
# be reassigned.
fig_data <- cel %>%
  group_by(Party, year) %>%
  summarize("Ideology" = mean(dwnom1, na.rm = TRUE))
#> `summarise()` has grouped output by 'Party'. You can override using the `.groups` argument.
# View(fig_data)

# One line per party showing the `Ideology` value over the years.
ggplot(
  fig_data,
  aes(x = year, y = Ideology, group = Party, color = Party)
) +
  geom_line() +
  # Name each color so it is matched to its party by label instead of by
  # the order in which the levels happen to sort.
  scale_color_manual(values = c(Democrat = "blue", Republican = "red"))

10.3 Learning New Figures Part 1

# Library
library(tidyverse)

# Dummy data: 20 letter labels and 20 "var" labels.
x <- LETTERS[seq_len(20)]
y <- paste0("var", seq_len(20))

# ? expand.grid
# Every X/Y combination becomes one row.
dat <- expand.grid(X = x, Y = y)

# ? runif
# One uniform draw between 0 and 5 per row.
dat$Z <- runif(nrow(dat), min = 0, max = 5)

# Heatmap: one tile per X/Y pair, filled by Z.
ggplot(data = dat, mapping = aes(x = X, y = Y, fill = Z)) +
  geom_tile()

# Practice again using a more substantive example.
players <- c("Michael", "LeBron", "Kobe")
points <- c(35, 40, 45)
assists <- c(10, 12, 5)
rebounds <- c(15, 12, 5)

# One row per player, one column per stat.
basketball <- tibble(
  players = players,
  points = points,
  assists = assists,
  rebounds = rebounds
)

# Standardize the values: divide each stat by the largest value in its
# column, so the top player in each stat scores 1.
basketball$stanardize_points <- with(basketball, points / max(points))
basketball$stanardize_assists <- with(basketball, assists / max(assists))
basketball$stanardize_rebounds <- with(basketball, rebounds / max(rebounds))

# Keep the player names plus the three standardized columns.
basketball_stanardize <- basketball %>%
  select(
    players,
    stanardize_points,
    stanardize_assists,
    stanardize_rebounds
  )

basketball_stanardize
#> # A tibble: 3 x 4
#>   players stanardize_points stanardize_assists stanardize_rebounds
#>   <chr>               <dbl>              <dbl>               <dbl>
#> 1 Michael             0.778              0.833               1    
#> 2 LeBron              0.889              1                   0.8  
#> 3 Kobe                1                  0.417               0.333
# Reshape to long form: one row per player and stat, with the column name
# stored in `stat` and the number in `value`.
long_basketball_scaled <- basketball_stanardize %>%
  pivot_longer(
    cols = c(stanardize_points, stanardize_assists, stanardize_rebounds),
    names_to = "stat",
    values_to = "value"
  )

long_basketball_scaled
#> # A tibble: 9 x 3
#>   players stat                value
#>   <chr>   <chr>               <dbl>
#> 1 Michael stanardize_points   0.778
#> 2 Michael stanardize_assists  0.833
#> 3 Michael stanardize_rebounds 1    
#> 4 LeBron  stanardize_points   0.889
#> 5 LeBron  stanardize_assists  1    
#> 6 LeBron  stanardize_rebounds 0.8  
#> # … with 3 more rows
# Heatmap of the standardized stats: players across, stats down, fill by
# value.
ggplot(
  data = long_basketball_scaled,
  mapping = aes(x = players, y = stat, fill = value)
) +
  geom_tile()