# importing libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## Warning: package 'stringr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(rcompanion)
#importing dataset
df=read_csv("C:/Users/dgray/OneDrive/Stats/stat 410 regression/project/NFL combine data.csv")
## Rows: 3477 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Player, School, Drafted..tm.rnd.yr., Player_Type, Position_Type, P...
## dbl (11): Year, Age, Height, Weight, Sprint_40yd, Vertical_Jump, Bench_Press...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df)
## Rows: 3,477
## Columns: 18
## $ Year <dbl> 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 20…
## $ Player <chr> "Beanie Wells\\WellCh00", "Will Davis\\DaviWi99", …
## $ Age <dbl> 20, 22, 24, 23, 22, 23, 24, 21, 23, 22, 23, 21, 22…
## $ School <chr> "Ohio St.", "Illinois", "LSU", "Alabama", "Connect…
## $ Height <dbl> 1.8542, 1.8796, 2.0066, 1.8034, 1.8796, 1.9304, 1.…
## $ Weight <dbl> 106.59421, 118.38761, 165.10762, 92.07925, 110.676…
## $ Sprint_40yd <dbl> 4.38, 4.84, 5.50, 4.49, 4.76, 5.28, 4.98, 5.32, 4.…
## $ Vertical_Jump <dbl> 85.09, 83.82, NA, 93.98, 92.71, NA, NA, 55.88, 88.…
## $ Bench_Press_Reps <dbl> 25, 27, 21, 15, 26, 29, NA, 19, 28, 14, 16, 29, 21…
## $ Broad_Jump <dbl> 325.12, 292.10, NA, 304.80, 304.80, NA, NA, 238.76…
## $ Agility_3cone <dbl> NA, 7.38, NA, 7.09, 7.10, NA, NA, 7.87, 7.46, 6.93…
## $ Shuttle <dbl> NA, 4.45, NA, 4.23, 4.40, NA, NA, 4.88, 4.43, 4.16…
## $ Drafted..tm.rnd.yr. <chr> "Arizona Cardinals / 1st / 31st pick / 2009", "Ari…
## $ BMI <dbl> 31.00419, 33.51007, 41.00582, 28.31246, 31.32742, …
## $ Player_Type <chr> "offense", "defense", "offense", "defense", "defen…
## $ Position_Type <chr> "backs_receivers", "defensive_lineman", "offensive…
## $ Position <chr> "RB", "DE", "OG", "FS", "OLB", "OG", "DT", "OT", "…
## $ Drafted <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "…
# showing all position values
unique(df$Position)
## [1] "RB" "DE" "OG" "FS" "OLB" "DT" "OT" "CB" "SS" "TE" "ILB" "C"
## [13] "FB" "WR" "P" "K" "QB" "LS" "S" "DB"
# subsetting to include only Rb, WR, CB, and SS position groups
df1 = df %>%
filter(Position=="WR" | Position=="CB" | Position=="SS" | Position=="RB")
glimpse(df1)
## Rows: 1,327
## Columns: 18
## $ Year <dbl> 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 20…
## $ Player <chr> "Beanie Wells\\WellCh00", "Chris Owens\\OwenCh99",…
## $ Age <dbl> 20, 22, 23, 23, 22, 22, 21, 21, 20, 24, 22, 21, 21…
## $ School <chr> "Ohio St.", "San Jose St.", "Missouri", "Nicholls …
## $ Height <dbl> 1.8542, 1.7780, 1.8288, 1.7780, 1.7526, 1.7780, 1.…
## $ Weight <dbl> 106.59421, 82.10022, 100.24391, 81.19303, 97.97595…
## $ Sprint_40yd <dbl> 4.38, 4.44, 4.49, 4.35, 4.34, 4.68, 4.66, 4.43, 4.…
## $ Vertical_Jump <dbl> 85.09, 87.63, 93.98, 92.71, 101.60, NA, NA, 100.33…
## $ Bench_Press_Reps <dbl> 25, 14, 16, 15, 27, NA, 18, 14, 19, 12, NA, 17, NA…
## $ Broad_Jump <dbl> 325.12, 279.40, 312.42, 307.34, 297.18, NA, NA, 29…
## $ Agility_3cone <dbl> NA, 6.93, 6.81, 6.77, 6.99, NA, NA, 6.89, 7.15, 6.…
## $ Shuttle <dbl> NA, 4.16, 4.26, 4.10, 4.29, NA, NA, 4.22, 4.20, 3.…
## $ Drafted..tm.rnd.yr. <chr> "Arizona Cardinals / 1st / 31st pick / 2009", "Atl…
## $ BMI <dbl> 31.00419, 25.97053, 29.97268, 25.68356, 31.89730, …
## $ Player_Type <chr> "offense", "defense", "defense", "defense", "offen…
## $ Position_Type <chr> "backs_receivers", "defensive_back", "defensive_ba…
## $ Position <chr> "RB", "CB", "SS", "CB", "RB", "CB", "CB", "RB", "C…
## $ Drafted <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "…
# converting from metric units to american units
df1 = df1 %>%
mutate(Weight= 2.20462*(Weight),
Height= 39.3701*(Height),
Vertical_Jump = 0.393701*(Vertical_Jump),
Broad_Jump= 0.393701*(Broad_Jump))
df1
## # A tibble: 1,327 × 18
## Year Player Age School Height Weight Sprin…¹ Verti…² Bench…³ Broad…⁴
## <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2009 "Beanie Wel… 20 Ohio … 73.0 235. 4.38 33.5 25 128.
## 2 2009 "Chris Owen… 22 San J… 70.0 181. 4.44 34.5 14 110.
## 3 2009 "William Mo… 23 Misso… 72.0 221. 4.49 37.0 16 123.
## 4 2009 "Lardarius … 23 Nicho… 70.0 179. 4.35 36.5 15 121.
## 5 2009 "Cedric Pee… 22 Virgi… 69.0 216. 4.34 40.0 27 117.
## 6 2009 "Jairus Byr… 22 Oregon 70.0 207. 4.68 NA NA NA
## 7 2009 "Cary Harri… 21 USC 71.0 187. 4.66 NA 18 NA
## 8 2009 "Mike Goods… 21 Texas… 72.0 208. 4.43 39.5 14 118.
## 9 2009 "Captain Mu… 20 South… 68.0 182. 4.41 37.5 19 121.
## 10 2009 "Sherrod Ma… 24 Troy 73.0 198. 4.43 36.0 12 123.
## # … with 1,317 more rows, 8 more variables: Agility_3cone <dbl>, Shuttle <dbl>,
## # Drafted..tm.rnd.yr. <chr>, BMI <dbl>, Player_Type <chr>,
## # Position_Type <chr>, Position <chr>, Drafted <chr>, and abbreviated
## # variable names ¹Sprint_40yd, ²Vertical_Jump, ³Bench_Press_Reps, ⁴Broad_Jump
# droppin null values
df1=drop_na(df1)
# histogram of response variable
plotNormalHistogram(df1$Sprint_40yd)
The above histogram resembles a bell-shaped curve, which signifies that it is normally distributed
# testing normality of response via shapiro test of 40 yd dash times
shapiro.test(df1$Sprint_40yd)
##
## Shapiro-Wilk normality test
##
## data: df1$Sprint_40yd
## W = 0.99616, p-value = 0.3671
# testing normality of response via histogram of 40 yd dash times
ggplot(df1, aes(x=Sprint_40yd)) +
geom_histogram(bins=18, color="black", fill="orange")
shapiro test gives us a p-value of 0.3671, which is not in our rejection range. So we can conclude that the response variable is normally distributed.
# setting reference levels
Position_ = relevel(as.factor(df1$Position), ref="WR")
# fitting general linear regreession
summary(fitted.model<-glm(Sprint_40yd ~ Position_ + Height + Weight + Vertical_Jump + Broad_Jump, data=df1, family=gaussian(link="identity")))
##
## Call:
## glm(formula = Sprint_40yd ~ Position_ + Height + Weight + Vertical_Jump +
## Broad_Jump, family = gaussian(link = "identity"), data = df1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.196903 -0.053317 -0.000342 0.051424 0.243624
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.5310097 0.1689841 26.813 < 2e-16 ***
## Position_CB 0.0183800 0.0097976 1.876 0.06133 .
## Position_RB 0.0325911 0.0148413 2.196 0.02862 *
## Position_SS 0.0861302 0.0161275 5.341 1.5e-07 ***
## Height 0.0027648 0.0030436 0.908 0.36418
## Weight 0.0010143 0.0004793 2.116 0.03490 *
## Vertical_Jump -0.0045575 0.0017334 -2.629 0.00886 **
## Broad_Jump -0.0024807 0.0009061 -2.738 0.00644 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.006660566)
##
## Null deviance: 3.5812 on 439 degrees of freedom
## Residual deviance: 2.8774 on 432 degrees of freedom
## AIC: -946.49
##
## Number of Fisher Scoring iterations: 2
# find standard deviation of responce(40 yard sprint)
sigma(fitted.model)
## [1] 0.08161229
E_hat(40ydSprint)= 4.531 + 0.0184(CB) + 0.0326(RB) + 0.0861(SS) + 0.1088(Height) + 0.00224(Weight) - 0.00179(Vertical jump) - 0.000977(Broad Jump)
# define null model
null.model = glm(Sprint_40yd ~ 1, data=df1, family=gaussian(link=identity))
#find deviance
print(deviance<- -2*(logLik(null.model)-logLik(fitted.model)))
## 'log Lik.' 96.28655 (df=2)
# find p value of deviance test
p.value<- pchisq(deviance, df=7, lower.tail=FALSE)
p.value
## 'log Lik.' 6.295986e-18 (df=2)
we conclude that the model is a good fit, given by the p-value of the deviance test
No need to run AIC, AICC, and BIC since we are not comparing the model to other models
# 1) lets predict a real players 40 time
# Tyreek Hill, Miami Dolphins
# 40 yard dash time for a WR, weighing 185, 5'10" height, with a vertical jump of 40.5 inches, and broad jump of 129 inches
# actual 40 time: 4.29 seconds
tyreek_pred = predict(fitted.model,data.frame(Position_="WR", Weight=185, Height=70, Vertical_Jump=40.5, Broad_Jump=129))
tyreek_pred
## 1
## 4.40761
# 2) lets predict 40 yard dash time for a RB, weighing 200, 5'11" height, with a vertical jump of 36 inches, and broad jump of 125 inches
pred_1 = predict(fitted.model,data.frame(Position_="WR",
Weight=180, Height=60,
Vertical_Jump=48, Broad_Jump=147))
pred_1
## 1
## 4.296057
# 3)
# Breece Hall. N.Y. Jets
# RB, 217 lbs, 5'11", vertical 40", broad 126"
# actual 40 time: 4.39
Hall_pred = predict(fitted.model,data.frame(Position_="RB", Weight=217, Height=71, Vertical_Jump=40, Broad_Jump=126))
Hall_pred
## 1
## 4.485146
# 4)
# Justin Jefferson. Minnesota Vikings
# WR, 202 lbs, 6'1", vertical 37.5", broad 126"
# actual time: 4.43
Jefferson_pred = predict(fitted.model,data.frame(Position_="WR", Weight=217, Height=71, Vertical_Jump=40, Broad_Jump=126))
Jefferson_pred
## 1
## 4.452555
# 5)
# Patrick Surtain II, Denver Broncos
# CB, 202 lbs, 6'2", vertical 39", broad 131"
# actual time: 4.42 seconds
surtain_pred = predict(fitted.model,data.frame(Position_="CB", Weight=202, Height=72, Vertical_Jump=39, Broad_Jump=131))
surtain_pred
## 1
## 4.450638
# 6)
# Brian Dawkins, Philadelphia Eagles, retired
# SS, height 71.4", weight 189 lbs, vertical 35", broad 120"
# 40 time: 4.63
Dawkins_pred = predict(fitted.model,data.frame(Position_="SS", Weight=189, Height=71.4, Vertical_Jump=35, Broad_Jump=120))
Dawkins_pred
## 1
## 4.549061
ggplot(data=df1, mapping=aes(x=Weight,y=Sprint_40yd, color=Position)) +
geom_point() +
geom_smooth(method=lm, formula=y~x, se=FALSE) +
labs(title="Scatterplot of 40- yard sprint vs weight by position")