Moneyball in R

A Quick and Incomplete Introduction

Author
Published

March 30, 2023


This post is currently under construction. In its current form, it serves to give STAT 385 students access to some code that appears in a lecture video. It will be expanded over time.

Welcome to Opening Day 2023! This post will serve as a (very quick and incomplete) introduction to “Moneyball” using R. For students in STAT 385, it will help reinforce various concepts from the tidyverse.

To set the stage, we recommend the following video:

Setup

library("tidyverse")
library("ggdensity")
library("ggridges")
library("Lahman")
library("DT")
# install.packages("remotes")
remotes::install_github("daviddalpiaz/bbd")

All-Time Homerun Leaders

Batting |>
  select(playerID, yearID, teamID, HR) |>
  summarise(HR = sum(HR), .by = playerID) |>
  filter(HR >= 100) |>
  arrange(desc(HR)) |>
  left_join(People, by = "playerID") |>
  select(playerID, nameFirst, nameLast, HR) |>
  unite(Name, nameFirst:nameLast, sep = " ") |>
  left_join(
    summarise(HallOfFame, inHall = "Y" %in% inducted, .by = playerID),
    by = "playerID"
  ) |> 
  datatable()

Pythagorean Theorem of Baseball

Screen capture from Moneyball (2011) showing Jonah Hill’s character Peter Brand pointing to an example of the Pythagorean Theorem of Baseball applied to the 2022 Oakland Athletics on a dry erase board.
Teams |>
  select(yearID, teamID, W, L, R, RA) |>
  arrange(desc(yearID)) |>
  filter(yearID > 1990) |>
  mutate(pythWPerc = (R ^ 2) / (R ^ 2 + RA ^ 2)) |>
  mutate(pythW = round(pythWPerc * 162)) |>
  mutate(weirdSeason = case_when(
    yearID == 2020 ~ "2020, COVID",
    yearID == 1994 ~ "1994, Strike",
    .default = "Normal")
  ) |>
  ggplot() +
  aes(x = W, y = pythW, color = weirdSeason) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_color_brewer(palette = "Set1") + 
  theme_bw()
`geom_smooth()` using formula = 'y ~ x'

  • TODO: do only 2021
  • TODO: multiply by games instead of 162

Pitching Metrics

bbd::people_search("Dylan Cease")
# A tibble: 6 × 5
  name            birth_year key_mlbam key_bbref   key_fangraphs
  <chr>                <int>     <int> <chr>               <int>
1 Dylan Chavez          1991    605178 ""                     NA
2 Dylan Castaneda       2001    688341 ""                     NA
3 Dylan Neuse           1998    681540 ""                     NA
4 Dylan Cease           1995    656302 "ceasedy01"         18525
5 Dylan Beavers         2001    687637 ""                     NA
6 Dylan Beasley         1997    670188 ""                     NA
sc2022 = bbd::statcast_bbd(start = "2022-01-01", end = "2022-12-13", verbose = TRUE)
cease2022 = filter(sc2022, pitcher == 656302)
saveRDS(cease2022, file = "data/cease2022.rds")
cease2022 = readRDS("data/cease2022.rds")
geom_strikezone = function(color = "black", linewidth = 0.25, sz_top = 3.8, sz_bot = 1.1) {
  sz_left = -0.85
  sz_right = 0.85
  strikezone = data.frame(
    x = c(sz_left, sz_left, sz_right, sz_right, sz_left),
    y = c(sz_bot, sz_top, sz_top, sz_bot, sz_bot)
  )
  geom_path(aes(.data$x, .data$y), data = strikezone, linewidth = linewidth, col = color)
}
cease2022 |> 
  filter(stand == "R") |>
  filter(!is.na(pitch_type)) |> 
  ggplot() + 
  aes(x = plate_x, y = plate_z) + 
  geom_hdr(
    method = method_kde(), 
    probs = c(0.95, 0.75, 0.50, 0.25, 0.05)
  ) + 
  geom_strikezone() + 
  geom_point(
    aes(color = pitch_type), 
    alpha = 0.4
  ) +
  theme_bw() + 
  theme(
    aspect.ratio = 1, 
    legend.position = c(1, 0),
    legend.justification = c(1, 0), 
    legend.box = "horizontal"
  ) + 
  scale_color_brewer(palette = "Set1") + 
  facet_wrap(vars(pitch_type))

cease2022 |> 
  filter(!is.na(pitch_type)) |> 
  ggplot() + 
  aes(x = release_speed, y = pitch_type, fill = pitch_type) + 
  geom_density_ridges() + 
  scale_fill_brewer(palette = "Set1") + 
  theme_bw()
Picking joint bandwidth of 0.472

cease2022 |> 
  filter(!is.na(pitch_type)) |> 
  ggplot() + 
  aes(x = release_speed, y = release_spin_rate) + 
  geom_point(aes(color = pitch_type), alpha = 0.4, size = 2) +
  theme_bw() + 
  scale_color_brewer(palette = "Set1")

cease2022 |> 
  filter(!is.na(pitch_type)) |> 
  summarise(
    n = n(), 
    velo = round(mean(release_speed), digits = 1),
    spin = round(mean(release_spin_rate), digits = 1),
    h_move = round(mean(pfx_x), digits = 2),
    v_move = round(mean(pfx_z), digits = 2),
    .by = c(pitch_type, pitch_name)) |> 
  arrange(desc(n)) |> 
  datatable(
    rownames = FALSE, 
    options = list(dom = 't'),
    colnames = c(
      "Pitch Type" = "pitch_type",
      "Pitch Name" = "pitch_name",
      "Number Thrown" = "n",
      "Average Velocity" = "velo",
      "Average Spin" = "spin",
      "Average Horizontal Movement" = "h_move",
      "Average Vertical Movement" = "v_move"
    )
  )

Baseball Analytics at Illinois