Putting the Fun in Functional Data:
A tidy pipeline to identify routes in NFL tracking data
Dani Chu (@chuurveg) Quantitative Analyst - Statistics, NHL Seattle
1
Collaborators
2
A little about me
3
NFL Big Data Bowl
4
The Data
5
Example - KC vs TEN, 2020 AFC Championship
6
Credit: NFL Next Gen Stats
Route Identification
7
Receivers with most routes per team
8
Route Clustering
9
Cluster 20 - Post
10
Route Development
11
What I hope you learn
12
Our Data
13
The Tracking Data
14
Read in our Data
15
tracking_data
# A tibble: 316,025 x 14
time x y s dis dir event nflId displayName jerseyNumber team frame.id gameId playId
<dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
1 2017-09-08 00:41:59 41.6 16.5 3.91 0.41 78.9 NA 2495340 Anthony Sherman 42 away 1 2017090700 44
2 2017-09-08 00:41:59 42.0 16.6 4.28 0.4 79.2 NA 2495340 Anthony Sherman 42 away 2 2017090700 44
3 2017-09-08 00:41:59 42.4 16.7 4.66 0.47 79.5 NA 2495340 Anthony Sherman 42 away 3 2017090700 44
4 2017-09-08 00:41:59 42.8 16.8 5.04 0.46 79.8 NA 2495340 Anthony Sherman 42 away 4 2017090700 44
5 2017-09-08 00:41:59 43.4 16.9 5.39 0.51 80.1 kickoff 2495340 Anthony Sherman 42 away 5 2017090700 44
6 2017-09-08 00:41:59 43.9 17.0 5.6 0.52 80.6 NA 2495340 Anthony Sherman 42 away 6 2017090700 44
7 2017-09-08 00:41:59 44.4 17.1 5.78 0.580 81.0 NA 2495340 Anthony Sherman 42 away 7 2017090700 44
8 2017-09-08 00:41:59 45.1 17.2 5.97 0.63 81.1 NA 2495340 Anthony Sherman 42 away 8 2017090700 44
9 2017-09-08 00:41:59 45.7 17.3 6.18 0.64 81.2 NA 2495340 Anthony Sherman 42 away 9 2017090700 44
# ... with 316,016 more rows
# Read one tracking file. An empty readr::cols() spec lets readr guess the
# column types; cols() is namespaced like every other call in this deck so the
# line does not depend on readr being attached.
tracking_data <- readr::read_csv(file_name, col_types = readr::cols())
Cut! Cut! Cut!
16
Columns First!
17
> # A tibble: 316,025 x 8
nflId gameId playId x y frame.id team event
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 2495340 2017090700 44 41.6 16.5 1 away NA
2 2495340 2017090700 44 42.0 16.6 2 away NA
3 2495340 2017090700 44 42.4 16.7 3 away NA
4 2495340 2017090700 44 42.8 16.8 4 away NA
5 2495340 2017090700 44 43.4 16.9 5 away kickoff
6 2495340 2017090700 44 43.9 17.0 6 away NA
7 2495340 2017090700 44 44.4 17.1 7 away NA
# ... with 316,018 more rows
# keep only the player/game/play ids, the x-y position, the frame index,
# the team and the event tag (8 of the original 14 columns)
tracking_data %>%
dplyr::select(nflId, gameId, playId, x, y, frame.id, team, event)
And the Rows!
18
# A tibble: 32,282 x 9
nflId gameId playId x y frame.id team event PositionAbbr
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
1 497240 2017090700 68 83.9 34.8 1 home NA TE
2 497240 2017090700 68 83.9 34.8 2 home NA TE
3 497240 2017090700 68 83.9 34.8 3 home NA TE
# ... with 32,271 more rows
... %>%
  # keep only the passing plays
  dplyr::inner_join(pass_playIds, by = c("gameId", "playId")) %>%
  # keep only the tracking data for route runners
  # (both joins must stay in one chain, hence the pipe after the first join)
  dplyr::inner_join(route_runners_pos_id_key, by = c("nflId"))
Tidy Data
19
Tidy Data
20
Credit: R for Data Science
21
No!
22
Curves are the Observations!
23
Tidy with tidyr
24
# A tibble: 394 x 6
nflId gameId playId team PositionAbbr tracking_data
<dbl> <dbl> <dbl> <chr> <chr> <list<df[,4]>>
1 497240 2017090700 68 home TE [87 x 4]
2 2530515 2017090700 68 home WR [87 x 4]
3 2533046 2017090700 68 home TE [87 x 4]
4 2539265 2017090700 68 home RB [87 x 4]
5 2543498 2017090700 68 home WR [87 x 4]
6 2649 2017090700 94 home WR [89 x 4]
7 497240 2017090700 94 home TE [89 x 4]
# ... with 387 more rows
... %>%
# nest the per-frame x, y, frame.id and event columns into a list-column
# so that each route (curve) is a single tidy row
tidyr::nest(tracking_data = c(x, y, frame.id, event))
Now do it for all the files!
25
Map to All Files
26
# apply read_routes_from_csv to every file in tracking_files and row-bind the
# per-file results into a single data frame of routes
routes_data <- purrr::map_dfr(tracking_files, read_routes_from_csv)
Routes Data Frame
27
> routes_data
# A tibble: 34,698 x 9
nflId gameId playId team quarter PositionAbbr tracking_data direction_left line_of_scrimmage
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <list> <chr> <dbl>
1 497240 2017090700 68 home 1 TE <tibble [87 x 5]> home 83.7
2 2530515 2017090700 68 home 1 WR <tibble [87 x 5]> home 83.7
3 2533046 2017090700 68 home 1 TE <tibble [87 x 5]> home 83.7
4 2539265 2017090700 68 home 1 RB <tibble [87 x 5]> home 83.7
5 2543498 2017090700 68 home 1 WR <tibble [87 x 5]> home 83.7
6 2649 2017090700 94 home 1 WR <tibble [89 x 5]> home 83.4
7 497240 2017090700 94 home 1 TE <tibble [89 x 5]> home 83.4
8 2530515 2017090700 94 home 1 WR <tibble [89 x 5]> home 83.4
9 2539265 2017090700 94 home 1 RB <tibble [89 x 5]> home 83.4
# ... with 34,689 more rows
purrr::map()
28
Function Development then Map!
29
# Apply the three per-route helpers, in order, to every nested tracking_data
# element. Each step overwrites tracking_data, so the order of the arguments
# inside mutate() matters.
# NOTE(review): cut_plays, flip_length and flip_width are defined elsewhere;
# presumably they trim each play and standardise its orientation - confirm.
routes_data <-
  routes_data %>%
  dplyr::mutate(
    tracking_data = purrr::map(tracking_data, cut_plays),
    # pmap() needs a function or a formula: the `~` turns this into a lambda
    # in which ..1 to ..4 are the parallel elements of the list. Without it
    # flip_length() would be evaluated immediately and fail.
    tracking_data = purrr::pmap(
      list(tracking_data, team, direction_left, line_of_scrimmage),
      ~ flip_length(..1, ..2, ..3, ..4)
    ),
    tracking_data = purrr::map(tracking_data, flip_width)
  )
Finally Cluster!
30
# pass the tracking_data list-column (one element per route) to the clustering step
cluster_trajectory_data(routes_data$tracking_data)
Tips for development
31
Work with One “Observation”
32
# A tibble: 87 x 12
nflId gameId playId team quarter PositionAbbr x y frame.id event direction_left line_of_scrimmage
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
1 497240 2017090700 68 home 1 TE 83.9 34.8 1 NA home 83.7
2 497240 2017090700 68 home 1 TE 83.9 34.8 2 NA home 83.7
3 497240 2017090700 68 home 1 TE 83.9 34.8 3 NA home 83.7
4 497240 2017090700 68 home 1 TE 83.9 34.8 4 NA home 83.7
5 497240 2017090700 68 home 1 TE 83.9 34.8 5 NA home 83.7
6 497240 2017090700 68 home 1 TE 83.9 34.8 6 NA home 83.7
# ... with 81 more rows
# develop against a single observation: take the first route and expand its
# nested tracking_data back into ordinary rows and columns
routes_data %>%
dplyr::slice(1) %>%
tidyr::unnest(cols = tracking_data)
Use purrr::safely()
33
# without safely(): an error raised for any one route aborts the whole
# map() call, and no results are returned (see the error message below)
# NOTE(review): cut_plays_rand_err is defined elsewhere; judging by the error
# message it sometimes calls a misspelled cutt_plays() - confirm.
routes_data %>%
dplyr::sample_n(5) %>%
dplyr::mutate(tracking_data = purrr::map(tracking_data, cut_plays_rand_err))
Error in cutt_plays(data) : could not find function "cutt_plays"
Use purrr::safely()
34
# safely() wraps the function so that every call returns a list with a
# `result` and an `error` element instead of stopping the map() on failure;
# `otherwise = NA` is the result stored when the call errors
routes_data %>%
  dplyr::sample_n(5) %>%
  dplyr::mutate(tracking_data = purrr::map(tracking_data,
                                           purrr::safely(cut_plays_rand_err,
                                                         otherwise = NA)))
Use purrr::safely()
35
# Run the fallible function under safely() and split its output into two
# columns. The order of the mutate() arguments matters: the first one stores
# the safely() output (a list with `result` and `error`) in tracking_data, and
# the next two pluck from that stored output before tracking_data is finally
# overwritten with just the result.
routes_data %>%
dplyr::sample_n(5) %>%
dplyr::mutate(tracking_data = purrr::map(tracking_data,
purrr::safely(cut_plays_rand_err,
otherwise = NA)),
# the captured error object, or NULL when the call succeeded
error = purrr::map(tracking_data, ~ purrr::pluck(., "error")),
# the function's return value, or NA (the `otherwise` value) on failure
tracking_data = purrr::map(tracking_data, ~ purrr::pluck(., "result")))
Use purrr::safely()
36
# A tibble: 5 x 10
nflId gameId playId team quarter PositionAbbr tracking_data direction_left line_of_scrimmage error
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <list> <chr> <dbl> <list>
1 2558069 2017091000 2314 away 3 WR <lgl [1]> home 48.1 <smplErrr>
2 2543488 2017100100 1405 home 2 WR <tibble [41 x 4]> away 57 <NULL>
3 71265 2017092413 595 away 1 TE <lgl [1]> home 34.7 <smplErrr>
4 2555224 2017091012 3213 home 4 RB <lgl [1]> away 67.1 <smplErrr>
5 2555415 2017100101 3945 home 4 TE <tibble [32 x 4]> away 77.2 <NULL>
Use purrr::safely()
37
# Same safely() pipeline, then keep only the routes whose call failed.
# The order of the mutate() arguments matters: `error` and the final
# tracking_data are both plucked from the safely() output stored by the
# first argument.
routes_data %>%
dplyr::sample_n(5) %>%
dplyr::mutate(tracking_data = purrr::map(tracking_data,
purrr::safely(cut_plays_rand_err,
otherwise = NA)),
error = purrr::map(tracking_data, ~ purrr::pluck(., "error")),
tracking_data = purrr::map(tracking_data, ~ purrr::pluck(., "result")),
# typeof(NULL) is "NULL", so this flags the rows that have no error
type_error = purrr::map_chr(error, typeof)) %>%
# drop the successful rows, leaving only those with a captured error
dplyr::filter(type_error != "NULL")
Use purrr::safely()
38
# A tibble: 3 x 10
nflId gameId playId team quarter PositionAbbr tracking_data direction_left line_of_scrimmage error
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <list> <chr> <dbl> <list>
1 2558069 2017091000 2314 away 3 WR <lgl [1]> home 48.1 <smplErrr>
2 71265 2017092413 595 away 1 TE <lgl [1]> home 34.7 <smplErrr>
3 2555224 2017091012 3213 home 4 RB <lgl [1]> away 67.1 <smplErrr>
Use purrr::safely()
39
# Same safely() pipeline, keeping only the failed routes and then extracting
# the error objects themselves for inspection.
# The order of the mutate() arguments matters: `error` and the final
# tracking_data are both plucked from the safely() output stored by the
# first argument.
routes_data %>%
dplyr::sample_n(5) %>%
dplyr::mutate(tracking_data = purrr::map(tracking_data,
purrr::safely(cut_plays_rand_err,
otherwise = NA)),
error = purrr::map(tracking_data, ~ purrr::pluck(., "error")),
tracking_data = purrr::map(tracking_data, ~ purrr::pluck(., "result")),
# typeof(NULL) is "NULL", so this flags the rows that have no error
type_error = purrr::map_chr(error, typeof)) %>%
dplyr::filter(type_error != "NULL") %>%
# return the error list-column as a plain list of condition objects
dplyr::pull(error)
Use purrr::safely()
40
[[1]]
<simpleError in cutt_plays(data): could not find function "cutt_plays">
[[2]]
<simpleError in cutt_plays(data): could not find function "cutt_plays">
[[3]]
<simpleError in cutt_plays(data): could not find function "cutt_plays">
Takeaways!
41
Thank You!
Any Questions?
Paper in Journal of Quantitative Analysis in Sports
Slides on Twitter (@chuurveg) or on my website: danichusfu.github.io
Email:
Twitter:
Email:
Twitter:
42
43