Flight with shortest airtime

# Sort the rows in ascending order of air_time and keep only the first one,
# i.e. the row with the smallest air_time.
flights %>% 
  arrange(air_time) %>%
  head(1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1    16     1355           1315        40     1442
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Flight with longest arrival delay

# Sort in ascending order of arr_delay and keep only the last row.
# NOTE: arrange() always places missing values at the end, so the last row is
# one whose arr_delay is missing (note the NA entries in the printed row),
# not the row with the largest delay.
flights %>% 
  arrange(arr_delay) %>%
  tail(1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     9    30       NA            840        NA       NA
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Why doesn’t this give the result we’re looking for? Can we use a filter?

# Drop the rows without an arr_delay before sorting, so that the last row of
# the ascending sort is the one with the largest recorded arr_delay.
flights %>% 
  filter(!is.na(arr_delay)) %>%
  arrange(arr_delay) %>%
  tail(1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     9      641            900      1301     1242
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Or the pattern below?

# Sort on two keys without removing any rows. The first key,
# !is.na(arr_delay), is FALSE for missing values and FALSE sorts before TRUE,
# so rows without an arr_delay come first. The remaining rows follow in
# ascending order of arr_delay, which leaves the largest delay in the last row.
flights %>% 
  arrange(!is.na(arr_delay), arr_delay) %>%
  tail(1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     9      641            900      1301     1242
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Usually it’s easiest to sort in descending order:

# Negating the numeric column reverses the sort order: the largest arr_delay
# now comes first, so head(1) returns it. Missing values still sort last.
flights %>% 
  arrange(-arr_delay) %>%
  head(1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     9      641            900      1301     1242
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
# desc() requests descending order explicitly. It returns the same row as
# sorting by the negated column, and also works for columns that cannot be
# negated (e.g. character columns).
flights %>% 
  arrange(desc(arr_delay)) %>%
  head(1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     9      641            900      1301     1242
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Flight with longest airtime

# Sort in descending order of air_time and keep only the first row,
# i.e. the row with the largest air_time.
flights %>% 
  arrange(desc(air_time)) %>%
  head(1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     3    17     1337           1335         2     1937
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

UA flights with lowest delay

If we filter first, fewer observations need to be sorted.

# Keep only the rows whose carrier is "UA", then sort just those rows in
# ascending order of arr_delay (smallest arrival delay first).
flights %>% 
  filter(carrier == "UA") %>%
  arrange(arr_delay)
## # A tibble: 58,665 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     5     2     1947           1949        -2     2209
##  2  2013     5     2     1926           1929        -3     2157
##  3  2013     5     7     2054           2055        -1     2317
##  4  2013     2    26     1335           1335         0     1819
##  5  2013     2    26     1721           1725        -4     1936
##  6  2013     2    28      702            705        -3      924
##  7  2013     5    13     1624           1629        -5     1831
##  8  2013     5     4     1914           1915        -1     2107
##  9  2013    12    27      853            856        -3     1052
## 10  2013     3     1      629            632        -3      844
## # ... with 58,655 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# The same two steps in the opposite order: every row of flights is sorted by
# arr_delay before the non-UA rows are discarded. The printed result matches
# the filter-first version, but more rows have to be sorted to get there.
flights %>% 
  arrange(arr_delay) %>%
  filter(carrier == "UA")
## # A tibble: 58,665 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     5     2     1947           1949        -2     2209
##  2  2013     5     2     1926           1929        -3     2157
##  3  2013     5     7     2054           2055        -1     2317
##  4  2013     2    26     1335           1335         0     1819
##  5  2013     2    26     1721           1725        -4     1936
##  6  2013     2    28      702            705        -3      924
##  7  2013     5    13     1624           1629        -5     1831
##  8  2013     5     4     1914           1915        -1     2107
##  9  2013    12    27      853            856        -3     1052
## 10  2013     3     1      629            632        -3      844
## # ... with 58,655 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Recovering delay

# arrange() accepts an expression, not just a column name: here the rows are
# sorted in ascending order of dep_delay - arr_delay. The difference is
# positive when arr_delay is smaller than dep_delay (time made up en route)
# and negative when it is larger (time lost), so this ascending sort lists the
# flights that lost the most time first. Wrap the expression in desc() to list
# the flights that recovered the most delay first.
flights %>% 
  arrange(dep_delay - arr_delay)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    11     1      658            700        -2     1329
##  2  2013     4    18      558            600        -2     1149
##  3  2013     8     8     1819           1519       180        5
##  4  2013     7    10     1916           1900        16      137
##  5  2013     6    27     1608           1525        43     2045
##  6  2013     7    22     1606           1615        -9     2056
##  7  2013     7     1      811            800        11     1344
##  8  2013     7    10     2011           1520       291     2357
##  9  2013     7    22     1626           1545        41     2051
## 10  2013     4    18      655            700        -5     1213
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Copyright © 2018 Kirill Müller. Licensed under CC BY-NC 4.0.