Rstudio code for MWP

You can read about this project at
https://comfortablynumb-ers.blogspot.com/2020/02/win-probability-analysis-part-1-mean.html
follow me at @kevgk2 on Twitter
check out more rstudio guides and nflscrapR links at the bottom

#legend:
#pbp: play by play
#WP: win probability
#MWP: average win probability
#WPE: win probability expected wins
#pyth: pythagorean wins

#install packages
install.packages("nflscrapr")
install.packages("tidyverse")
install.packages("lubridate")
install.packages("na.tools")
install.packages("ggimage")

#load libraries
library(nflscrapR)
library(tidyverse)
library(lubridate)
library(na.tools)
library(ggimage)

#determining the mean WP for a particular game
#scrape games
week_14_games <- scrape_game_ids(2019, weeks = 14)

#pull play by play for specific game
Gb_Was_PbP <- week_14_games %>%
  filter(home_team == "GB") %>%
  pull(game_id) %>%
  scrape_json_play_by_play()

#omit na and select WPs
Gb_Was_WP<-Gb_Was_PbP %>%
  dplyr::select(home_wp, away_wp) %>%
  na.omit()

#determing mean and sum, put in displayable table
table(mean(Gb_Was_WP$home_wp),sum(Gb_Was_WP$home_wp))

#Creating WP analysis project
#scrape season play by play data
pbp <- read_csv(url("https://github.com/ryurko/nflscrapR-data/raw/master/play_by_play_data/regular_season/reg_pbp_2019.csv"))

#omit NA values of selection of specific columns. if you omit NA of every column, you lose A LOT of relevant data, since most of the NA data is in classifier variable columns like "if punt"
subpbp<-na.omit(dplyr::select(pbp, game_id, home_team, away_team, game_date, home_wp, away_wp))

#create schedule for later use. this finds every unique game
schedule<-unique(dplyr::select(subpbp, game_id, home_team, away_team, game_date))

#create WP stats per team per game ID, result is an away team MWP and home team MWP
wpreport<- subpbp %>%
           group_by(game_id) %>%
           summarize(mean_away = mean(away_wp), mean_home = mean(home_wp))

#merge the WP stats to the schedule
wpreport_total<-merge(schedule, wpreport)

#create expected season values for away teams, grouping by team, arranging by descending values
expectation_away<-wpreport_total %>%
group_by(away_team) %>%
summarize(exp = sum(mean_away)) %>%
arrange(desc(exp))

# repeat for home teams
expectation_home<-wpreport_total %>%
group_by(home_team) %>%
summarize(exp = sum(mean_home)) %>%
arrange(desc(exp))

#rename columns
colnames(expectation_home) <- c("team","exp1")
colnames(expectation_away) <- c("team","exp2")

#create total expected wins by adding away and home WPE for each team
totalexp<-transform(merge(expectation_home, expectation_away, by = "team"), total_expected_wins = exp1 + exp2)

#load up the data from footballoutsiders.com, mine was saved in a csv file
nfl2019results <- read.csv("~/wp/nfl2019results.csv", header=FALSE)

colnames(nfl2019results)<-c("team","wins","dvoa","pyth")

#merge other expectation values
totalexp<-merge(totalexp, nfl2019results, by = c("team","wins"))
View(totalexp)

#create regression model between actual wins and WPE
wp2019 <- lm(wins ~ wp.expected, data = expwins)

#view regression model
summary(wp2019)

Call:
lm(formula = wins ~ wp.expected, data = expwins)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.3807 -0.9011 -0.1003  0.7087  3.1848 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  -7.3637     1.4688  -5.013 2.24e-05
wp.expected   1.9092     0.1806  10.570 1.24e-11

---
Signif. codes:  
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.5 on 30 degrees of freedom
Multiple R-squared:  0.7883, Adjusted R-squared:  0.7813 
F-statistic: 111.7 on 1 and 30 DF,  p-value: 1.24e-11

#regression for wins and dvoa wins
dvoa2019 <- lm(wins ~ expwins$`dvoa wins`, data = expwins)
summary(dvoa2019)

Call:
lm(formula = wins ~ dvoa.wins, data = expwins)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.1793 -0.3988 -0.0447  0.7547  3.1517 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.28768    0.83078  -0.346    0.732    
dvoa.wins    1.02384    0.09825  10.420 1.74e-11 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.517 on 30 degrees of freedom
Multiple R-squared:  0.7835, Adjusted R-squared:  0.7763 
F-statistic: 108.6 on 1 and 30 DF,  p-value: 1.738e-11

#regression for pythagorean wins
pyth2019 <- lm(wins ~ pyth, data = expwins)
summary(pyth2019)

Call:
lm(formula = wins ~ pyth, data = expwins)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.9218 -0.9419  0.2260  0.8704  3.2110 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.3037     0.9335  -0.325    0.747    
pyth          1.0299     0.1112   9.264 2.63e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.659 on 30 degrees of freedom
Multiple R-squared:  0.741, Adjusted R-squared:  0.7323 
F-statistic: 85.82 on 1 and 30 DF,  p-value: 2.631e-10

#export dataframe into a csv file
write.csv(expwins, "expwins")

#this section is about plotting
#plot from expwins data, aes() is just necessary commonplace. 
#x variables is expected wins from WP, y is actual wins
#geom_text provides team names
#geom_smooth provides a line, the typical least squares regression line
ggplot(expwins, aes(x=wp.expected, y=wins)) +
  geom_text(aes(label=team)) +
  geom_smooth(method = "lm")

#create the residuals between actual wins and expected
expwins<-transform(expwins, wpresidual = wins - wp.expected)

#this is for using the NFL logos in plotting
nfl_logos_df <- read_csv("https://raw.githubusercontent.com/statsbylopez/BlogPosts/master/nfl_teamlogos.csv")

#create a new dataframe merging expected wins data and the logos
#I just called it chart b/c I ran out of names
chart <- expwins %>% left_join(nfl_logos_df, by = c("team" = "team_code"))

#create plot of WPE versus actual
#geom image adds the logos
#geom smooth is the least squared line
#geom abline adds a particular line
ggplot(chart, aes(x = wp.expected, y = wins)) +
  geom_image(aes(image = url), size = 0.05) +
  labs(x = "meanWP Expected Wins",
          y = "Actual Wins",
          caption = "graph by nflscrapr and @kevgk2",
          title = "Wins vs Expectation (meanWP)",
          subtitle = "2019") +
  theme_bw() +
  theme(axis.title = element_text(size = 12),
             axis.text = element_text(size = 10),
             plot.title = element_text(size = 16),
             plot.subtitle = element_text(size = 14),
             plot.caption = element_text(size = 12)) +
             geom_smooth(method = "lm") +
             geom_abline(slope = 1, intercept = 0)

#creating a histogram of ranking residuals
#this ranks the residual values of WPE
expwins<-arrange(expwins, wpresid)

#this sets the residual values of WPE so when we plot, it does not plot them alphabetically but a set order
expwins$team <- factor(expwins$team,levels=expwins$team)

#graphing the histogram. this uses the color dataframe from the @LeeSharpeNFL guide
ggplot(expwins,aes(x=team,y=wpresid)) +
  scale_color_manual(values=expwins$color) +
  scale_fill_manual(values=expwins$color) +
  geom_bar(stat="identity",aes(color=team,fill=team),show.legend=FALSE) +
  xlab("") +
  ylab("Wins minus expected") +
  labs(title="Wins over/under expectation per meanWP",
          caption="graph by nflscrapr and @kevgk2")

#some kind of importing error left a column named "."
expwins<-transform(expwins, dvoaresid = .)

#density plots of residuals for the three win expectation metrics
#xlim provides a specific range of x values
#geom_text makes specific text labels
ggplot(expwins, aes(x=.))+
  xlim(-5,5)+
  geom_density(color = "green")+
  geom_text(label = "dvoa", color = "green", x= 1, y= 0.4)+
  theme_light()+
  geom_density(aes(x=wpresid), color = "blue") +
  geom_text(label = "WP", color = "blue", x= -2, y= 0.15)+
  geom_density(aes(x=pythresid), color = "red")+
  geom_text(label = "pythag", color = "red", x= 1.25, y= 0.21)+
    labs(
           title="Smoothed density of residuals for predicted wins vs actual",
           caption="by nflscrapr and @kevgk2")

#calculating R^2 from wins = a + b(expected wins) where a = 0, and b = 1
#this lets us interpret the error better than regression
1 - (sum((check$wp.expected - check$wins)^2)/(sum((check$wins - mean(check$wins))^2)))
[1] 0.608686
1 - (sum((check$dvoa.wins - check$wins)^2)/(sum((check$wins - mean(check$wins))^2)))
[1] 0.782159
1 - (sum((check$pyth - check$wins)^2)/(sum((check$wins - mean(check$wins))^2)))
[1] 0.7399274

#looking at predictability for first 8 weeks predicting following 9
#same method as above for WPE but filter first and do it twice
#filtering by game date to collect first 8 weeks of games
wpreport_total_week8<-wpreport_total %>%
 select(game_id, home_team, away_team, game_date, mean_away, mean_home) %>%
 filter(game_date < as.Date("2019-11-03"))

#collecting final 9 weeks
wpreport_total_week17<-wpreport_total %>%
 select(game_id, home_team, away_team, game_date, mean_away, mean_home) %>%
 filter(game_date >= as.Date("2019-11-03"))

#create expected season values for away teams, grouping by team, arranging by descending values
expectation_away_week8<-wpreport_total_week8 %>%
  group_by(away_team) %>%
  summarize(exp = sum(mean_away)) %>%
  arrange(desc(exp))

expectation_home_week8<-wpreport_total_week8 %>%
  group_by(home_team) %>%
  summarize(exp = sum(mean_home)) %>%
  arrange(desc(exp))

colnames(expectation_home_week8) <- c("team","exp1")
colnames(expectation_away_week8) <- c("team","exp2")

#create total expected wins by adding away and home for each team
week_exp<-transform(merge(expectation_home_week8, expectation_away_week8, by = "team"), total_expected_wins_week8 = exp1 + exp2)

expectation_away_week17<-wpreport_total_week17 %>%
  group_by(away_team) %>%
  summarize(exp = sum(mean_away)) %>%
  arrange(desc(exp))

expectation_home_week17<-wpreport_total_week17 %>%
  group_by(home_team) %>%
  summarize(exp = sum(mean_home)) %>%
  arrange(desc(exp))

colnames(expectation_home_week17) <- c("team","exp1")
colnames(expectation_away_week17) <- c("team","exp2")

week_exp17<-transform(merge(expectation_home_week17, expectation_away_week17, by = "team"), total_expected_wins_week17 = exp1 + exp2)

#merge the expected wins for first 8 and last 9 wins
week_exp<-merge(week_exp, select(week_exp17, team, total_expected_wins_week17), by = "team")

#find correlation between expectations
cor(week_exp$total_expected_wins_week8,week_exp$total_expected_wins_week17)
[1] 0.1365074

#this section is about tweaking the expected wins per first 8 weeks model, but instead finds the total MWP for first 8 weeks and following 9 to fix inconsistent bye week issues
awaywp_week8<- subpbp %>%
  filter(game_date < as.Date("2019-11-03")) %>%
  group_by(away_team) %>%
  summarize(sumaway = sum(away_wp), countaway=length(away_wp))

homewp_week8<- subpbp %>%
  filter(game_date < as.Date("2019-11-03")) %>%
  group_by(home_team) %>%
  summarize(sumhome = sum(home_wp), counthome=length(home_wp))

awaywp_week17<- subpbp %>%
  filter(game_date >= as.Date("2019-11-03")) %>%
  group_by(away_team) %>%
  summarize(sumaway = sum(away_wp), countaway=length(away_wp))

homewp_week17<- subpbp %>%
  filter(game_date >= as.Date("2019-11-03")) %>%
  group_by(home_team) %>%
  summarize(sumhome = sum(home_wp), counthome=length(home_wp))

awp_week8<-merge(awaywp_week8, homewp_week8, by.x = "away_team", by.y = "home_team")

awp_week8<-transform(awp_week8, total= (sumaway+sumhome)/(countaway+counthome))

awp_week17<-merge(awaywp_week17, homewp_week17, by.x = "away_team", by.y = "home_team")

awp_week17<-transform(awp_week17, total= (sumaway+sumhome)/(countaway+counthome))

awp_week8<-select(awp_week8, away_team, total)

awp_week17<-select(awp_week17, away_team, total)

colnames(awp_week8)<-c("team","awp_week8")
colnames(awp_week17)<-c("team","awp_week17")

awp_perweek<-merge(awp_week8,awp_week17, by = "team")

cor(awp_perweek$awp_week8, awp_perweek$awp_week17)
[1] 0.2302562

#merge logos for plotting
awp_perweek<-merge(awp_perweek, nfl_logos_df, by.x = "team", by.y = "team_code")

#create a plot of first 8 weeks and last 9
ggplot(awp_perweek, aes(x = awp_week8, y = awp_week17)) +
  geom_image(aes(image = url), size = 0.05) +
  labs(x = "meanWP through first 8 weeks",
       y = "meanWP through first 17 weeks",
       caption = "graph by nflscrapr and @kevgk2",
       title = "Mean WP for first 8 weeks and following 9",
       subtitle = "2019") +
  theme_bw() +
  theme(axis.title = element_text(size = 12),
        axis.text = element_text(size = 10),
        plot.title = element_text(size = 16),
        plot.subtitle = element_text(size = 14),
        plot.caption = element_text(size = 12))+
        geom_abline(slope = 1, intercept = 0)

https://gist.github.com/guga31bb/5634562c5a2a7b1e9961ac9b6c568701
https://github.com/leesharpe/nfldata/blob/master/RSTUDIO-INTRO.md
https://github.com/leesharpe/nfldata/blob/master/WPCHARTS.md
https://github.com/leesharpe/nfldata/blob/master/COLORS_IN_R.md
https://github.com/maksimhorowitz/nflscrapR/blob/master/R/ep_wp_calculator.R
https://github.com/maksimhorowitz/nflscrapR
https://www.dropbox.com/s/5k2wbnroyn8i1ux/Getting%20Started%20With%20R%20for%20NFL.pdf?dl=0

Comments

Popular posts from this blog

Profiling 2019 NFL Offenses with nflscrapR Data and Clustering

Using the Excel Nonlinear Solver to Optimize Skill Trees with Borderlands 3 Example

Jordan Love Was The Right Pick In Theory