Speed as miles per hour

# Add the average speed of each flight: distance over air time, scaled by 60
# (miles per minute -> miles per hour, going by the column names used in this
# document).
mutate(flights, miles_per_hour = 60 * (distance / air_time))
## # A tibble: 336,776 x 20
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 13 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, miles_per_hour <dbl>
# Same speed column, built in two steps: a helper column in miles per minute
# is converted to miles per hour, and the helper is dropped afterwards.
flights %>%
  mutate(
    miles_per_minute = distance / air_time,
    miles_per_hour = 60 * miles_per_minute
  ) %>%
  select(-miles_per_minute)
## # A tibble: 336,776 x 20
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 13 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, miles_per_hour <dbl>
# Histogram of flight speed with bins 20 units wide. `na.rm = TRUE` drops
# rows whose speed is missing without emitting a warning.
flights %>%
  mutate(miles_per_hour = 60 * (distance / air_time)) %>%
  ggplot(mapping = aes(x = miles_per_hour)) +
  geom_histogram(binwidth = 20, na.rm = TRUE)

# Speed computed directly inside the aesthetic (miles per minute here).
# Bin count and NA handling are deliberately left at their defaults.
ggplot(data = flights) +
  geom_histogram(mapping = aes(x = distance / air_time))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9430 rows containing non-finite values (stat_bin).

On time status

# Flag each flight as on time (arrival delay of zero or less) and add a
# human-readable label; a missing `arr_delay` yields NA in both columns.
mutate(
  flights,
  on_time = arr_delay <= 0,
  on_time_desc = if_else(on_time, true = "On time", false = "Delayed")
)
## # A tibble: 336,776 x 21
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 14 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, on_time <lgl>, on_time_desc <chr>
# Flight counts per carrier, with each bar split by on-time status.
flights %>%
  mutate(
    on_time = arr_delay <= 0,
    on_time_desc = if_else(on_time, true = "On time", false = "Delayed")
  ) %>%
  ggplot() +
  geom_bar(mapping = aes(x = carrier, fill = on_time_desc))

Speed distributions

# Flights enriched with two derived columns, reused by the plots that follow:
#   miles_per_hour - distance / air_time scaled by 60
#   on_time_desc   - "On time" when arr_delay <= 0, otherwise "Delayed"
# The intermediate helper columns are computed in the same pass and then
# removed, so only the two columns above are appended to `flights`.
speed_and_on_time_info <-
  flights %>%
  mutate(
    miles_per_minute = distance / air_time,
    miles_per_hour = 60 * miles_per_minute,
    on_time = arr_delay <= 0,
    on_time_desc = if_else(on_time, true = "On time", false = "Delayed")
  ) %>%
  select(-miles_per_minute, -on_time)

# Speed distribution per on-time status as frequency polygons. Plotting the
# per-group density instead of raw counts keeps groups of different sizes
# comparable.
# NOTE(review): newer ggplot2 releases deprecate the `..density..` spelling in
# favour of `after_stat(density)`; kept as-is for compatibility with the
# version that rendered this document -- confirm before modernising.
ggplot(
  data = speed_and_on_time_info,
  mapping = aes(x = miles_per_hour, y = ..density.., color = on_time_desc)
) +
  geom_freqpoly(binwidth = 20, na.rm = TRUE)

# One speed histogram per on-time status, stacked in a single column.
# Rows without a status are filtered out first so no NA facet is drawn;
# `na.rm = TRUE` silently drops rows whose speed is missing.
speed_and_on_time_info %>%
  filter(!is.na(on_time_desc)) %>%
  ggplot(mapping = aes(x = miles_per_hour)) +
  geom_histogram(binwidth = 20, na.rm = TRUE) +
  facet_wrap(facets = ~on_time_desc, ncol = 1)

Date

# Consistency check: the calendar date derived from `time_hour` should match
# the date assembled from the `year`, `month` and `day` columns. Any rows
# returned here are mismatches; an empty result means the columns agree.
#
# The date is taken in the "America/New_York" zone rather than "EST": "EST" is
# a fixed UTC-5 offset without daylight saving, so it shifts summer timestamps
# back by one hour before truncating to a date.
# NOTE(review): assumes `time_hour` represents New York local time, as the
# nycflights13 documentation describes -- confirm.
flights %>%
  mutate(
    date_hour = as.Date(time_hour, tz = "America/New_York"),
    date_ymd = lubridate::make_date(year, month, day)
  ) %>%
  filter(date_hour != date_ymd)
## # A tibble: 0 x 21
## # ... with 21 variables: year <int>, month <int>, day <int>,
## #   dep_time <int>, sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   date_hour <date>, date_ymd <date>

Deviation from average departure delay

# Violin plot, per origin airport, of how far each flight's departure delay
# deviates from the overall mean departure delay (mean computed ignoring NAs).
flights %>%
  mutate(dev = dep_delay - mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = origin, y = dev)) +
  geom_violin(na.rm = TRUE)

Copyright © 2018 Kirill Müller. Licensed under CC BY-NC 4.0.