Accessing Baseball Data for Analysis

Sources and Methods

Author
Published

October 4, 2022


Warning

This post is currently a work-in-progress. Use this resource outside of STAT 430 Baseball Analytics at Illinois with extreme caution.

Background

Sources

Methods

  • Flat Files
    • Plain text: txt, csv, tsv
    • Compressed: zip, gzip, zst
    • Binary: rds, fst
  • API (Application Programming Interface)
  • Database
  • Web Scraping

R Pacakges

baseballr

bbd

Structure

  • What is the smallest possible unit of analysis?
  • What is a row of the data?

Scraping Example

library("dplyr")
library("rvest")
url = "https://www.baseball-reference.com/players/m/marisro01.shtml"
maris = html_table(x = read_html(x = url), )[[1]]
maris_mlb = maris |> 
  filter(Lg %in% c("AL", "NL")) |> 
  mutate(Year = as.integer(Year),
         Age = as.integer(Age))
maris_mlb
# A tibble: 14 × 30
    Year   Age Tm    Lg        G    PA    AB     R     H  `2B`  `3B`    HR   RBI
   <int> <int> <chr> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
 1  1957    22 CLE   AL      116   424   358    61    84     9     5    14    51
 2  1958    23 TOT   AL      150   637   583    87   140    19     4    28    80
 3  1958    23 CLE   AL       51   202   182    26    41     5     1     9    27
 4  1958    23 KCA   AL       99   435   401    61    99    14     3    19    53
 5  1959    24 KCA   AL      122   499   433    69   118    21     7    16    72
 6  1960    25 NYY   AL      136   578   499    98   141    18     7    39   112
 7  1961    26 NYY   AL      161   698   590   132   159    16     4    61   141
 8  1962    27 NYY   AL      157   687   590    92   151    34     1    33   100
 9  1963    28 NYY   AL       90   351   312    53    84    14     1    23    53
10  1964    29 NYY   AL      141   584   513    86   144    12     2    26    71
11  1965    30 NYY   AL       46   186   155    22    37     7     0     8    27
12  1966    31 NYY   AL      119   391   348    37    81     9     2    13    43
13  1967    32 STL   NL      125   472   410    64   107    18     7     9    55
14  1968    33 STL   NL      100   340   310    25    79    18     2     5    45
# … with 17 more variables: SB <int>, CS <int>, BB <int>, SO <int>, BA <dbl>,
#   OBP <dbl>, SLG <dbl>, OPS <dbl>, `OPS+` <int>, TB <int>, GDP <int>,
#   HBP <int>, SH <int>, SF <int>, IBB <int>, Pos <chr>, Awards <chr>

Package Example

library("dplyr")
war_pitcher = bbd::bref_war_daily_batter()
war_batter  = bbd::bref_war_daily_pitcher()
war_pitcher_2022_nl = war_pitcher |>
  filter(year_ID == 2022) |> 
  filter(lg_ID == "NL") |> 
  group_by(player_ID, name_common) |> 
  summarise(war = sum(WAR), .groups = "drop")

war_batter_2022_nl = war_batter |>
  filter(year_ID == 2022) |> 
  filter(lg_ID == "NL") |> 
  group_by(player_ID, name_common) |> 
  summarise(war = sum(WAR), .groups = "drop")
war_pitcher_2022_al = war_pitcher |>
  filter(year_ID == 2022) |> 
  filter(lg_ID == "AL") |> 
  group_by(player_ID, name_common) |> 
  summarise(war = sum(WAR), .groups = "drop")

war_batter_2022_al = war_batter |>
  filter(year_ID == 2022) |> 
  filter(lg_ID == "AL") |> 
  group_by(player_ID, name_common) |> 
  summarise(war = sum(WAR), .groups = "drop")
war_2022_nl = rbind(war_pitcher_2022_nl, war_batter_2022_nl)
war_2022_al = rbind(war_pitcher_2022_al, war_batter_2022_al)
war_2022_nl = war_2022_nl |> arrange(desc(war))
war_2022_al = war_2022_al |> arrange(desc(war))
war_2022_al
# A tibble: 857 × 3
   player_ID name_common        war
   <chr>     <chr>            <dbl>
 1 judgeaa01 Aaron Judge      10.7 
 2 gimenan01 Andres Gimenez    7.19
 3 alvaryo01 Yordan Alvarez    6.69
 4 ceasedy01 Dylan Cease       6.36
 5 troutmi01 Mike Trout        6.16
 6 ohtansh01 Shohei Ohtani     5.95
 7 verlaju01 Justin Verlander  5.94
 8 manoaal01 Alek Manoah       5.91
 9 rodriju01 Julio Rodriguez   5.87
10 semiema01 Marcus Semien     5.65
# … with 847 more rows
  • TODO: Spot the problem with this data!

Database Example

Download example database here!

library("dplyr")
library("DBI")
library("reactable")
  • TODO: Add examples of listing all tables in the database.
con = DBI::dbConnect(duckdb::duckdb(), dbdir = "data/statcast.db")
statcast = tbl(src = con, "sc")
statcast |>
  filter(batter_name %in% c("Aaron Judge", "Albert Pujols")) |>
  filter(events == "home_run") |>
  select(
    game_date,
    batter_name,
    stand,
    pitcher_name,
    p_throws,
    pitch_name,
    release_speed,
    release_spin_rate,
    hit_distance_sc,
    launch_speed,
    launch_angle
  ) |>
  collect() |>
  reactable(
    pagination = FALSE,
    columns = list(
      game_date = colDef(name = "Date"),
      batter_name = colDef(name = "Hitter"),
      stand = colDef(name = "Stands"),
      pitcher_name = colDef(name = "Pitcher"),
      p_throws = colDef(name = "Throws"),
      pitch_name = colDef(name = "Pitch Type"),
      release_speed = colDef(name = "Pitch Velocity"),
      release_spin_rate = colDef(name = "Pitch Spin"),
      hit_distance_sc = colDef(name = "Hit Distance"),
      launch_speed = colDef(name = "Exit Velocity"),
      launch_angle = colDef(name = "Launch Angle")
    )
  )
DBI::dbDisconnect(conn = con, shutdown = TRUE)
  • TODO: Demonstrate dplyr::show_query().