Chapter 9 Appendix

9.1 Graphs

9.1.1 Lahman

Batting %>% 
  group_by(yearID) %>% 
  summarize(totalHR = sum(HR)) %>% 
  filter(yearID >= 1990) %>% 
  ggplot(aes(x = yearID, y = totalHR, fill = factor(yearID))) +
  geom_col() +
  labs(title = "Total Home Runs from 1990-2021", x = "Year", y = "Total HRs") +
  theme_bw() +
  theme(text = element_text(family = "serif")) +
  theme(legend.position = "none")

Pitching %>% 
  group_by(yearID) %>% 
  filter(yearID >= 1921, !is.na(ERA)) %>% 
  summarize(avgERA = mean(ERA)) %>% 
  ggplot(aes(x = yearID, y = avgERA)) + 
  geom_line(size = 1, color = "dodgerblue") +
  labs(title = "Average ERA in the Last 100 Years",
       x = "Year", y = "Average ERA") +
  theme_bw() +
  theme(text = element_text(family = "serif")) 

Fielding %>% 
  filter(yearID >= 2012) %>% 
  ggplot(aes(x = factor(POS), y = DP)) +
  geom_jitter(alpha = 0.4, aes(color = factor(yearID)), 
              position = position_jitter(0.2)) +
  labs(title = "Number of Double Plays by Position", 
       x = "Position", y = "Double Plays", color = "Year") +
  theme_bw() +
  theme(text = element_text(family = "serif")) 

Salaries %>% 
  group_by(yearID) %>% 
  summarize(Low = min(salary), Median = median(salary), 
                     Mean = mean(salary), High = max(salary)) %>% 
  ggplot(aes(x = yearID)) +
  geom_ribbon(aes(ymin = Low, ymax = High), 
                        fill = "lightblue", color = "black", alpha = 0.5) +  
  geom_line(aes(y = Median), color = "black", linetype = "dashed") + 
  geom_line(aes(y = Mean), color = "black", linetype = "dotted") + 
  scale_y_continuous(labels = scales::label_dollar()) + 
  labs(title = "MLB Salary Range Over Time", 
          subtitle = "Median = Dashed Line, Mean = Dotted Line",       
          y = "Salary", 
          x = "Season") +
  theme_bw() +
  theme(text = element_text(family = "serif")) 

9.1.2 FanGraphs

fg_update %>% 
  ggplot(aes(x = BB_rate, y = K_rate, label = Team, fill = WAR)) + 
  geom_label(alpha = 0.6, size = 3, family = "serif") +
  labs(title = "K% and BB% For Each MLB Team", 
       x = "Walk Rate(%)", y = "Strikeout Rate(%)") +
  scale_fill_viridis_c() +
  theme_bw() +
  theme(text = element_text(family = "serif"))

judge_filtered %>% 
  ggplot(aes(x = EV, LA, color = factor(RBI))) +
  geom_point(alpha = 0.6) +
  labs(title = "Aaron Judge Batting", 
       x = "Exit Velocity", y = "Launch Angle", color = "RBI") +
  scale_color_viridis_d() +
  theme_bw() +
  theme(text = element_text(family = "serif"))

# pl_filtered %>% 
#   ggplot(aes(x = factor(Season), y = ERA)) +
#   geom_jitter(alpha = 0.6, aes(color = Age),
#               position=position_jitter(0.2)) +
#   labs(title = "Pitchers", x = "Season") +
#   scale_color_viridis_b() +
#   theme_bw() +
#   theme(text = element_text(family = "serif"))

9.1.3 Statcast

sc_download %>% 
  ggplot(aes(x = xwoba)) +
  geom_density(fill = "cornflowerblue", alpha = .5) +
  theme_bw() +
  theme(text = element_text(family = "serif"))

judge_sc %>% 
  group_by(description, pitch_type) %>% 
  summarize(count = n()) %>% 
  ggplot(aes(x = description, y = pitch_type, fill = count)) +
  geom_tile() + 
  labs(x = "Description", y = "Pitch Type", color = "Release Speed") +
  scale_fill_viridis_b() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(text = element_text(family = "serif"))

sc_lead_evb %>% 
  group_by(player_id) %>% 
  ggplot(aes(x = max_distance, y = max_hit_speed, color = avg_distance)) +
  geom_point(alpha = .5, aes(size = avg_hit_speed)) +
  geom_label(data = sc_lead_evb %>% 
               filter(max_distance %in% range(max_distance) | 
                        max_hit_speed %in% range(max_hit_speed)),
             aes(label = `last_name, first_name`), size = 3,
             show.legend = FALSE, family = "serif") +
  labs(title = "Hitting Leaders", x = "Max Distance", y = "Max Hit Speed",
       color = "Average Distance", size = "Average Hit Speed") +
  scale_color_viridis_b() +
  theme_bw() +
  theme(text = element_text(family = "serif"))

sc_lead_exp %>% 
  ggplot(aes(x = ba, y = est_ba, color = pa)) + 
  geom_point(alpha = .5) +
  geom_abline() +
  labs(x = "Batting Average", y = "Expected Batting Average",
       color = "Plate Appearances") +
  scale_color_viridis_b() +
  theme_bw() +
  theme(text = element_text(family = "serif"))

9.1.4 Restructuring Data

join_all_keys %>% 
  ggplot(aes(x = IPouts, y = salary, color = IPouts)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(title = "Pitcher Salary by Outs Pitched", 
       x = "Outs Pitched", y = "Salary") +
  scale_color_viridis_c() +
  scale_y_continuous(labels = scales::label_dollar()) +
  theme_bw() +
  theme(legend.position = "none") +
  theme(text = element_text(family = "serif"))

9.1.5 Modeling

team_pred %>% 
  ggplot(aes(x = predict(model2), y = R)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, color = "cornflowerblue", size = 1) +
  labs(x = "Predicted Runs", y = "Actual Runs") +
  theme_bw() +
  theme(text = element_text(family = "serif")) 

ggplot(all_draft, aes(x = OvPck, y = prediction)) +
  geom_line(linewidth = 1) +
  theme_bw() +
  lims(y = c(0, 1)) +
  labs(title = "Chances a First Round Draft Pick Achieves Positive WAR",
       caption = "Data collected from Baseball Reference (2004-2013)",
       x = "Overall Pick Number",
       y = "Predicted Probability")

ggplot(all_draft, aes(x = OvPck, y = prediction2, color = Type)) +
  geom_line(linewidth = 1) +
  theme_bw() +
  lims(y = c(0, 1)) +
  labs(title = "Chances a First Round Draft Pick Achieves Positive WAR by Draft Type",
       caption = "Data collected from Baseball Reference (2004-2013)",
       x = "Overall Pick Number",
       y = "Predicted Probability")

9.2 Functions

9.2.1 Data Scraping and Functions

scrape_draft <- function(year, round) {
  
  require(rvest)
  
  url <- paste0("https://www.baseball-reference.com/draft/?year_ID=",
                year,
                "&draft_round=",
                round,
                "&draft_type=junreg&query_type=year_round&from_type_jc=0&from_type_hs=0&from_type_4y=0&from_type_unk=0")
  
  data <- url %>%
    read_html()
  
  draft_data <- data %>%
    html_element("table") %>%
    html_table()
  draft_data
}

9.2.2 Visualizations

geom_zone <- function(top = 3.75, bottom = 1.5, linecolor = "gray60"){
  geom_rect(xmin = -.7083, xmax = .7083, ymin = bottom, ymax = top,
            alpha = 0, color = linecolor, linewidth = 1.5)
}
geom_plate <- function(){
  df <- data.frame(x = c(-.7083, .7083, .7083 ,0, -.7083), y = c(0, 0, -.25, -.5, -.25))
  g <- geom_polygon(data = df, aes(x = x, y = y), fill = "white", color = "gray60", linewidth = 1.25)
  g
}
geom_spray <- function(...) {
  ggplot(...) +
    geom_curve(x = 33, xend = 223, y = -100, yend = -100, curvature = -.65) +
    geom_segment(x = 128, xend = 33, y = -208, yend = -100) +
    geom_segment(x = 128, xend = 223, y = -208, yend = -100) +
    geom_curve(x = 83, xend = 173, y = -155, yend = -156, curvature = -.65, linetype = "dotted") +
    coord_fixed() +
    scale_x_continuous(NULL, limits = c(25, 225)) +
    scale_y_continuous(NULL, limits = c(-225, -25))
}