All flights on this day x years ago

Be careful with the equality operator ==:

flights %>% 
  filter(month = 6, day = 2)
## Error: `month` (`month = 6`), `day` (`day = 2`) must not be named, do you need `==`?
flights %>% 
  filter(month == 6, day == 2)
## # A tibble: 911 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     6     2       14           2359        15      339
##  2  2013     6     2       20           2155       145      222
##  3  2013     6     2       24           2245        99      133
##  4  2013     6     2       33           2059       214      150
##  5  2013     6     2       35           2130       185      332
##  6  2013     6     2       36           1914       322      223
##  7  2013     6     2       44           2359        45      420
##  8  2013     6     2      128           2159       209      325
##  9  2013     6     2      131           2146       225      229
## 10  2013     6     2      219           2055       324      322
## # ... with 901 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Can we make a dynamic query?

flights %>% 
  filter(
    month == lubridate::month(Sys.Date()),
    day == lubridate::day(Sys.Date())
  )
## # A tibble: 857 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    11    30       11           2359        12      457
##  2  2013    11    30      457            500        -3      636
##  3  2013    11    30      514            515        -1      753
##  4  2013    11    30      544            540         4      828
##  5  2013    11    30      545            550        -5     1026
##  6  2013    11    30      549            545         4      831
##  7  2013    11    30      554            600        -6      814
##  8  2013    11    30      554            600        -6      943
##  9  2013    11    30      555            600        -5      737
## 10  2013    11    30      555            600        -5      840
## # ... with 847 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Can we use arguments?

month_ <- lubridate::month(Sys.Date())
day_ <- lubridate::day(Sys.Date())
flights %>% 
  filter(
    month == month_,
    day == day_
  )
## # A tibble: 857 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    11    30       11           2359        12      457
##  2  2013    11    30      457            500        -3      636
##  3  2013    11    30      514            515        -1      753
##  4  2013    11    30      544            540         4      828
##  5  2013    11    30      545            550        -5     1026
##  6  2013    11    30      549            545         4      831
##  7  2013    11    30      554            600        -6      814
##  8  2013    11    30      554            600        -6      943
##  9  2013    11    30      555            600        -5      737
## 10  2013    11    30      555            600        -5      840
## # ... with 847 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

All flights between 8:00 AM and 10:00 PM

flights %>% 
  filter(dep_time >= 800, dep_time <= 2200)
## # A tibble: 267,608 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      800            800         0     1022
##  2  2013     1     1      800            810       -10      949
##  3  2013     1     1      801            805        -4      900
##  4  2013     1     1      803            810        -7      903
##  5  2013     1     1      803            800         3     1132
##  6  2013     1     1      804            810        -6     1103
##  7  2013     1     1      805            805         0     1015
##  8  2013     1     1      805            800         5     1118
##  9  2013     1     1      805            815       -10     1006
## 10  2013     1     1      807            810        -3     1043
## # ... with 267,598 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
flights %>% 
  filter(between(dep_time, 800, 2200))
## # A tibble: 267,608 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      800            800         0     1022
##  2  2013     1     1      800            810       -10      949
##  3  2013     1     1      801            805        -4      900
##  4  2013     1     1      803            810        -7      903
##  5  2013     1     1      803            800         3     1132
##  6  2013     1     1      804            810        -6     1103
##  7  2013     1     1      805            805         0     1015
##  8  2013     1     1      805            800         5     1118
##  9  2013     1     1      805            815       -10     1006
## 10  2013     1     1      807            810        -3     1043
## # ... with 267,598 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Flights in winter months

flights %>%
  filter(month %in% c(12, 1, 2))
## # A tibble: 80,090 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 80,080 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
winter_months <- c(12, 1, 2)
flights %>%
  filter(month %in% winter_months)
## # A tibble: 80,090 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 80,080 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Departure time later than arrival time

flights %>% 
  filter(dep_time > arr_time)
## # A tibble: 10,633 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1     1929           1920         9        3
##  2  2013     1     1     1939           1840        59       29
##  3  2013     1     1     2058           2100        -2        8
##  4  2013     1     1     2102           2108        -6      146
##  5  2013     1     1     2108           2057        11       25
##  6  2013     1     1     2120           2130       -10       16
##  7  2013     1     1     2121           2040        41        6
##  8  2013     1     1     2128           2135        -7       26
##  9  2013     1     1     2134           2045        49       20
## 10  2013     1     1     2136           2145        -9       25
## # ... with 10,623 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Flight with shortest airtime

flights %>% 
  arrange(air_time) %>%
  head(1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1    16     1355           1315        40     1442
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Flight with heaviest delay

flights %>% 
  arrange(arr_delay) %>%
  tail(1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     9    30       NA            840        NA       NA
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Why doesn’t this give the result we’re looking for? Can we use a filter?

flights %>% 
  filter(!is.na(arr_delay)) %>%
  arrange(arr_delay) %>%
  tail(1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     9      641            900      1301     1242
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Or the pattern below?

flights %>% 
  arrange(!is.na(arr_delay), arr_delay) %>%
  tail(1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     9      641            900      1301     1242
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Usually it’s easiest to sort in descending order:

flights %>% 
  arrange(-arr_delay) %>%
  head(1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     9      641            900      1301     1242
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
flights %>% 
  arrange(desc(arr_delay)) %>%
  head(1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     9      641            900      1301     1242
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Flight with longest airtime

flights %>% 
  arrange(desc(air_time)) %>%
  head(1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     3    17     1337           1335         2     1937
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

UA flights with lowest delay

If we filter first, fewer observations need to be sorted.

flights %>% 
  filter(carrier == "UA") %>%
  arrange(arr_delay)
## # A tibble: 58,665 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     5     2     1947           1949        -2     2209
##  2  2013     5     2     1926           1929        -3     2157
##  3  2013     5     7     2054           2055        -1     2317
##  4  2013     2    26     1335           1335         0     1819
##  5  2013     2    26     1721           1725        -4     1936
##  6  2013     2    28      702            705        -3      924
##  7  2013     5    13     1624           1629        -5     1831
##  8  2013     5     4     1914           1915        -1     2107
##  9  2013    12    27      853            856        -3     1052
## 10  2013     3     1      629            632        -3      844
## # ... with 58,655 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
flights %>% 
  arrange(arr_delay) %>%
  filter(carrier == "UA")
## # A tibble: 58,665 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     5     2     1947           1949        -2     2209
##  2  2013     5     2     1926           1929        -3     2157
##  3  2013     5     7     2054           2055        -1     2317
##  4  2013     2    26     1335           1335         0     1819
##  5  2013     2    26     1721           1725        -4     1936
##  6  2013     2    28      702            705        -3      924
##  7  2013     5    13     1624           1629        -5     1831
##  8  2013     5     4     1914           1915        -1     2107
##  9  2013    12    27      853            856        -3     1052
## 10  2013     3     1      629            632        -3      844
## # ... with 58,655 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Recovering delay

flights %>% 
  arrange(dep_delay - arr_delay)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    11     1      658            700        -2     1329
##  2  2013     4    18      558            600        -2     1149
##  3  2013     8     8     1819           1519       180        5
##  4  2013     7    10     1916           1900        16      137
##  5  2013     6    27     1608           1525        43     2045
##  6  2013     7    22     1606           1615        -9     2056
##  7  2013     7     1      811            800        11     1344
##  8  2013     7    10     2011           1520       291     2357
##  9  2013     7    22     1626           1545        41     2051
## 10  2013     4    18      655            700        -5     1213
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Copyright © 2018 Kirill Müller. Licensed under CC BY-NC 4.0.