Importing data

# importing libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## Warning: package 'stringr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(rcompanion)

#importing dataset
df=read_csv("C:/Users/dgray/OneDrive/Stats/stat 410 regression/project/NFL combine data.csv")
## Rows: 3477 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): Player, School, Drafted..tm.rnd.yr., Player_Type, Position_Type, P...
## dbl (11): Year, Age, Height, Weight, Sprint_40yd, Vertical_Jump, Bench_Press...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df)
## Rows: 3,477
## Columns: 18
## $ Year                <dbl> 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 20…
## $ Player              <chr> "Beanie Wells\\WellCh00", "Will Davis\\DaviWi99", …
## $ Age                 <dbl> 20, 22, 24, 23, 22, 23, 24, 21, 23, 22, 23, 21, 22…
## $ School              <chr> "Ohio St.", "Illinois", "LSU", "Alabama", "Connect…
## $ Height              <dbl> 1.8542, 1.8796, 2.0066, 1.8034, 1.8796, 1.9304, 1.…
## $ Weight              <dbl> 106.59421, 118.38761, 165.10762, 92.07925, 110.676…
## $ Sprint_40yd         <dbl> 4.38, 4.84, 5.50, 4.49, 4.76, 5.28, 4.98, 5.32, 4.…
## $ Vertical_Jump       <dbl> 85.09, 83.82, NA, 93.98, 92.71, NA, NA, 55.88, 88.…
## $ Bench_Press_Reps    <dbl> 25, 27, 21, 15, 26, 29, NA, 19, 28, 14, 16, 29, 21…
## $ Broad_Jump          <dbl> 325.12, 292.10, NA, 304.80, 304.80, NA, NA, 238.76…
## $ Agility_3cone       <dbl> NA, 7.38, NA, 7.09, 7.10, NA, NA, 7.87, 7.46, 6.93…
## $ Shuttle             <dbl> NA, 4.45, NA, 4.23, 4.40, NA, NA, 4.88, 4.43, 4.16…
## $ Drafted..tm.rnd.yr. <chr> "Arizona Cardinals / 1st / 31st pick / 2009", "Ari…
## $ BMI                 <dbl> 31.00419, 33.51007, 41.00582, 28.31246, 31.32742, …
## $ Player_Type         <chr> "offense", "defense", "offense", "defense", "defen…
## $ Position_Type       <chr> "backs_receivers", "defensive_lineman", "offensive…
## $ Position            <chr> "RB", "DE", "OG", "FS", "OLB", "OG", "DT", "OT", "…
## $ Drafted             <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "…
# showing all position values
unique(df$Position)
##  [1] "RB"  "DE"  "OG"  "FS"  "OLB" "DT"  "OT"  "CB"  "SS"  "TE"  "ILB" "C"  
## [13] "FB"  "WR"  "P"   "K"   "QB"  "LS"  "S"   "DB"
# subsetting to include only Rb, WR, CB, and SS position groups
df1 = df %>% 
  filter(Position=="WR" | Position=="CB" | Position=="SS" | Position=="RB")
glimpse(df1)
## Rows: 1,327
## Columns: 18
## $ Year                <dbl> 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 20…
## $ Player              <chr> "Beanie Wells\\WellCh00", "Chris Owens\\OwenCh99",…
## $ Age                 <dbl> 20, 22, 23, 23, 22, 22, 21, 21, 20, 24, 22, 21, 21…
## $ School              <chr> "Ohio St.", "San Jose St.", "Missouri", "Nicholls …
## $ Height              <dbl> 1.8542, 1.7780, 1.8288, 1.7780, 1.7526, 1.7780, 1.…
## $ Weight              <dbl> 106.59421, 82.10022, 100.24391, 81.19303, 97.97595…
## $ Sprint_40yd         <dbl> 4.38, 4.44, 4.49, 4.35, 4.34, 4.68, 4.66, 4.43, 4.…
## $ Vertical_Jump       <dbl> 85.09, 87.63, 93.98, 92.71, 101.60, NA, NA, 100.33…
## $ Bench_Press_Reps    <dbl> 25, 14, 16, 15, 27, NA, 18, 14, 19, 12, NA, 17, NA…
## $ Broad_Jump          <dbl> 325.12, 279.40, 312.42, 307.34, 297.18, NA, NA, 29…
## $ Agility_3cone       <dbl> NA, 6.93, 6.81, 6.77, 6.99, NA, NA, 6.89, 7.15, 6.…
## $ Shuttle             <dbl> NA, 4.16, 4.26, 4.10, 4.29, NA, NA, 4.22, 4.20, 3.…
## $ Drafted..tm.rnd.yr. <chr> "Arizona Cardinals / 1st / 31st pick / 2009", "Atl…
## $ BMI                 <dbl> 31.00419, 25.97053, 29.97268, 25.68356, 31.89730, …
## $ Player_Type         <chr> "offense", "defense", "defense", "defense", "offen…
## $ Position_Type       <chr> "backs_receivers", "defensive_back", "defensive_ba…
## $ Position            <chr> "RB", "CB", "SS", "CB", "RB", "CB", "CB", "RB", "C…
## $ Drafted             <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "…
# converting from metric units to american units
df1 = df1 %>% 
  mutate(Weight= 2.20462*(Weight),
         Height= 39.3701*(Height), 
         Vertical_Jump = 0.393701*(Vertical_Jump),
         Broad_Jump= 0.393701*(Broad_Jump))
df1
## # A tibble: 1,327 × 18
##     Year Player         Age School Height Weight Sprin…¹ Verti…² Bench…³ Broad…⁴
##    <dbl> <chr>        <dbl> <chr>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1  2009 "Beanie Wel…    20 Ohio …   73.0   235.    4.38    33.5      25    128.
##  2  2009 "Chris Owen…    22 San J…   70.0   181.    4.44    34.5      14    110.
##  3  2009 "William Mo…    23 Misso…   72.0   221.    4.49    37.0      16    123.
##  4  2009 "Lardarius …    23 Nicho…   70.0   179.    4.35    36.5      15    121.
##  5  2009 "Cedric Pee…    22 Virgi…   69.0   216.    4.34    40.0      27    117.
##  6  2009 "Jairus Byr…    22 Oregon   70.0   207.    4.68    NA        NA     NA 
##  7  2009 "Cary Harri…    21 USC      71.0   187.    4.66    NA        18     NA 
##  8  2009 "Mike Goods…    21 Texas…   72.0   208.    4.43    39.5      14    118.
##  9  2009 "Captain Mu…    20 South…   68.0   182.    4.41    37.5      19    121.
## 10  2009 "Sherrod Ma…    24 Troy     73.0   198.    4.43    36.0      12    123.
## # … with 1,317 more rows, 8 more variables: Agility_3cone <dbl>, Shuttle <dbl>,
## #   Drafted..tm.rnd.yr. <chr>, BMI <dbl>, Player_Type <chr>,
## #   Position_Type <chr>, Position <chr>, Drafted <chr>, and abbreviated
## #   variable names ¹​Sprint_40yd, ²​Vertical_Jump, ³​Bench_Press_Reps, ⁴​Broad_Jump
# droppin null values 
df1=drop_na(df1)

Showing the response variable is normally distributed

# histogram of response variable
plotNormalHistogram(df1$Sprint_40yd)

The above histogram resembles a bell-shaped curve, which signifies that it is normally distributed

# testing normality of response via shapiro test of 40 yd dash times
shapiro.test(df1$Sprint_40yd)
## 
##  Shapiro-Wilk normality test
## 
## data:  df1$Sprint_40yd
## W = 0.99616, p-value = 0.3671
# testing normality of response via histogram of 40 yd dash times
ggplot(df1, aes(x=Sprint_40yd)) +
  geom_histogram(bins=18, color="black", fill="orange") 

shapiro test gives us a p-value of 0.3671, which is not in our rejection range. So we can conclude that the response variable is normally distributed.

# setting reference levels 
Position_ = relevel(as.factor(df1$Position), ref="WR")

Fitting General Regression model

# fitting general linear regreession 
summary(fitted.model<-glm(Sprint_40yd ~ Position_ + Height + Weight + Vertical_Jump + Broad_Jump, data=df1, family=gaussian(link="identity")))
## 
## Call:
## glm(formula = Sprint_40yd ~ Position_ + Height + Weight + Vertical_Jump + 
##     Broad_Jump, family = gaussian(link = "identity"), data = df1)
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## -0.196903  -0.053317  -0.000342   0.051424   0.243624  
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.5310097  0.1689841  26.813  < 2e-16 ***
## Position_CB    0.0183800  0.0097976   1.876  0.06133 .  
## Position_RB    0.0325911  0.0148413   2.196  0.02862 *  
## Position_SS    0.0861302  0.0161275   5.341  1.5e-07 ***
## Height         0.0027648  0.0030436   0.908  0.36418    
## Weight         0.0010143  0.0004793   2.116  0.03490 *  
## Vertical_Jump -0.0045575  0.0017334  -2.629  0.00886 ** 
## Broad_Jump    -0.0024807  0.0009061  -2.738  0.00644 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.006660566)
## 
##     Null deviance: 3.5812  on 439  degrees of freedom
## Residual deviance: 2.8774  on 432  degrees of freedom
## AIC: -946.49
## 
## Number of Fisher Scoring iterations: 2
# find standard deviation of responce(40 yard sprint)
sigma(fitted.model)
## [1] 0.08161229

Fitted model

E_hat(40ydSprint)= 4.531 + 0.0184(CB) + 0.0326(RB) + 0.0861(SS) + 0.1088(Height) + 0.00224(Weight) - 0.00179(Vertical jump) - 0.000977(Broad Jump)

Testing Model Fit

# define null model
null.model = glm(Sprint_40yd ~ 1, data=df1, family=gaussian(link=identity))

#find deviance
print(deviance<- -2*(logLik(null.model)-logLik(fitted.model)))
## 'log Lik.' 96.28655 (df=2)
# find p value of deviance test
p.value<- pchisq(deviance, df=7, lower.tail=FALSE)
p.value
## 'log Lik.' 6.295986e-18 (df=2)

we conclude that the model is a good fit, given by the p-value of the deviance test

No need to run AIC, AICC, and BIC since we are not comparing the model to other models

Predictions

# 1) lets predict a real players 40 time
# Tyreek Hill, Miami Dolphins
# 40 yard dash time for a WR, weighing 185, 5'10" height, with a vertical jump of 40.5 inches, and broad jump of 129 inches
# actual 40 time: 4.29 seconds
tyreek_pred = predict(fitted.model,data.frame(Position_="WR", Weight=185, Height=70, Vertical_Jump=40.5, Broad_Jump=129))
tyreek_pred
##       1 
## 4.40761
# 2) lets predict 40 yard dash time for a RB, weighing 200, 5'11" height, with a vertical jump of 36 inches, and broad jump of 125 inches
pred_1 = predict(fitted.model,data.frame(Position_="WR", 
                                         Weight=180, Height=60,
                                         Vertical_Jump=48, Broad_Jump=147))
pred_1
##        1 
## 4.296057
# 3)
# Breece Hall. N.Y. Jets
# RB, 217 lbs, 5'11", vertical 40", broad 126"
# actual 40 time: 4.39
Hall_pred = predict(fitted.model,data.frame(Position_="RB", Weight=217, Height=71, Vertical_Jump=40, Broad_Jump=126))
Hall_pred
##        1 
## 4.485146
# 4)
# Justin Jefferson. Minnesota Vikings
# WR, 202 lbs, 6'1", vertical 37.5", broad 126"
# actual time: 4.43
Jefferson_pred = predict(fitted.model,data.frame(Position_="WR", Weight=217, Height=71, Vertical_Jump=40, Broad_Jump=126))
Jefferson_pred
##        1 
## 4.452555
# 5)
# Patrick Surtain II, Denver Broncos
# CB, 202 lbs, 6'2", vertical 39", broad 131"
# actual time: 4.42 seconds
surtain_pred = predict(fitted.model,data.frame(Position_="CB", Weight=202, Height=72, Vertical_Jump=39, Broad_Jump=131))
surtain_pred
##        1 
## 4.450638
# 6)
# Brian Dawkins, Philadelphia Eagles, retired
# SS, height 71.4", weight 189 lbs, vertical 35", broad 120"
# 40 time: 4.63
Dawkins_pred = predict(fitted.model,data.frame(Position_="SS", Weight=189, Height=71.4, Vertical_Jump=35, Broad_Jump=120))
Dawkins_pred
##        1 
## 4.549061

Extra

ggplot(data=df1, mapping=aes(x=Weight,y=Sprint_40yd, color=Position)) +
  geom_point() +
  geom_smooth(method=lm, formula=y~x, se=FALSE) + 
  labs(title="Scatterplot of 40- yard sprint vs weight by position")