library("dplyr")
library("rvest")
Accessing Baseball Data for Analysis with R
Sources and Methods
Background
The sport of baseball has a long history of data collection dating back to the first “box scores” recorded by Henry Chadwick.
Sources
Today, baseball provides a rich source of data for analysis.
Methods
Flat Files
- Flat File
- Plain text:
txt
,csv
,tsv
- Compressed:
zip
,gzip
,zst
- Plain text:
- https://en.m.wikipedia.org/wiki/Plain_text
- https://en.m.wikipedia.org/wiki/Data_compression
Serialization
- Serialized Object
- Binary:
rds
,fst
- Binary:
Application Programming Interfaces
Databases
Web Scraping
R Pacakges
baseballr
bbd
Structure
- What is the smallest possible unit of analysis?
- What is a row of the data?
Scraping Example
= "https://www.baseball-reference.com/players/m/marisro01.shtml"
url = html_table(x = read_html(x = url), )[[1]]
maris = {
maris_mlb |>
maris filter(Lg %in% c("AL", "NL")) |>
mutate(Year = as.integer(Year),
Age = as.integer(Age))
}
::datatable(maris_mlb) DT
Package Example
library("dplyr")
= bbd::bref_war_daily_batter()
war_pitcher = bbd::bref_war_daily_pitcher() war_batter
= {
war_pitcher_2022_nl |>
war_pitcher filter(year_ID == 2022, lg_ID == "NL") |>
summarise(war = sum(WAR), .by = c(player_ID, name_common))
}
= {
war_batter_2022_nl |>
war_batter filter(year_ID == 2022, lg_ID == "NL") |>
summarise(war = sum(WAR), .by = c(player_ID, name_common))
}
= {
war_pitcher_2022_al |>
war_pitcher filter(year_ID == 2022, lg_ID == "AL") |>
summarise(war = sum(WAR), .by = c(player_ID, name_common))
}
= {
war_batter_2022_al |>
war_batter filter(year_ID == 2022, lg_ID == "AL") |>
summarise(war = sum(WAR), .by = c(player_ID, name_common))
}
= rbind(war_pitcher_2022_nl, war_batter_2022_nl)
war_2022_nl = rbind(war_pitcher_2022_al, war_batter_2022_al) war_2022_al
= arrange(war_2022_nl, desc(war))
war_2022_nl = arrange(war_2022_al, desc(war)) war_2022_al
::datatable(war_2022_al) DT
Do you notice anything odd about this data?
Database Example
2022 Home Run Races
library("dplyr")
library("DBI")
- TODO: Add examples of listing all tables in the database.
= DBI::dbConnect(duckdb::duckdb(), dbdir = "data/statcast.db")
con = tbl(src = con, "sc") statcast
= {
hr_races_2022 |>
statcast filter(batter_name %in% c("Aaron Judge", "Albert Pujols")) |>
filter(events == "home_run") |>
select(
game_date,
batter_name,
stand,
pitcher_name,
p_throws,
pitch_name,
release_speed,
release_spin_rate,
hit_distance_sc,
launch_speed,
launch_angle
) }
show_query(hr_races_2022)
<SQL>
SELECT
game_date,
batter_name,
stand,
pitcher_name,
p_throws,
pitch_name,
release_speed,
release_spin_rate,
hit_distance_sc,
launch_speed,
launch_angle
FROM sc
WHERE
(batter_name IN ('Aaron Judge', 'Albert Pujols')) AND
(events = 'home_run')
= collect(hr_races_2022) hr_races_2022
::datatable(
DT
hr_races_2022,colnames = c(
"Date",
"Hitter",
"Stands",
"Pitcher",
"Throws",
"Pitch Type",
"Pitch Velocity",
"Pitch Spin",
"Hit Distance",
"Exit Velocity",
"Launch Angle"
) )
::dbDisconnect(conn = con, shutdown = TRUE) DBI
Lahman
library("dbplyr")
library("dplyr")
library("DBI")
= dbplyr::lahman_sqlite()
lahman_db ::dbListTables(lahman_db) DBI
[1] "AllstarFull" "Appearances" "AwardsManagers"
[4] "AwardsPlayers" "AwardsShareManagers" "AwardsSharePlayers"
[7] "Batting" "BattingPost" "CollegePlaying"
[10] "Fielding" "FieldingOF" "FieldingOFsplit"
[13] "FieldingPost" "HallOfFame" "HomeGames"
[16] "LahmanData" "Managers" "ManagersHalf"
[19] "Parks" "People" "Pitching"
[22] "PitchingPost" "Salaries" "Schools"
[25] "SeriesPost" "Teams" "TeamsFranchises"
[28] "TeamsHalf" "sqlite_stat1" "sqlite_stat4"