Flights on the same date as today

flights %>%
  filter(day == 13, month == 11) %>%
  filter(carrier == "US")
## # A tibble: 66 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    11    13      458            500        -2      632
##  2  2013    11    13      551            600        -9      638
##  3  2013    11    13      553            600        -7      650
##  4  2013    11    13      606            615        -9      741
##  5  2013    11    13      607            615        -8      758
##  6  2013    11    13      622            630        -8      818
##  7  2013    11    13      625            630        -5      824
##  8  2013    11    13      626            630        -4      915
##  9  2013    11    13      631            600        31      739
## 10  2013    11    13      653            700        -7      753
## # ... with 56 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
flights %>%
  filter(day == 13 & month == 11 & carrier == "US")
## # A tibble: 66 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    11    13      458            500        -2      632
##  2  2013    11    13      551            600        -9      638
##  3  2013    11    13      553            600        -7      650
##  4  2013    11    13      606            615        -9      741
##  5  2013    11    13      607            615        -8      758
##  6  2013    11    13      622            630        -8      818
##  7  2013    11    13      625            630        -5      824
##  8  2013    11    13      626            630        -4      915
##  9  2013    11    13      631            600        31      739
## 10  2013    11    13      653            700        -7      753
## # ... with 56 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Red eye flights

flights %>%
  filter(dep_time < 600 | dep_time > 2200)
## # A tibble: 16,753 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 16,743 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Flights not by UA or WN

flights %>%
  filter(carrier != "UA" & carrier != "WN")
## # A tibble: 265,836 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      542            540         2      923
##  2  2013     1     1      544            545        -1     1004
##  3  2013     1     1      554            600        -6      812
##  4  2013     1     1      555            600        -5      913
##  5  2013     1     1      557            600        -3      709
##  6  2013     1     1      557            600        -3      838
##  7  2013     1     1      558            600        -2      753
##  8  2013     1     1      558            600        -2      849
##  9  2013     1     1      558            600        -2      853
## 10  2013     1     1      559            600        -1      941
## # ... with 265,826 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
flights %>%
  filter(!(carrier == "UA" | carrier == "WN"))
## # A tibble: 265,836 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      542            540         2      923
##  2  2013     1     1      544            545        -1     1004
##  3  2013     1     1      554            600        -6      812
##  4  2013     1     1      555            600        -5      913
##  5  2013     1     1      557            600        -3      709
##  6  2013     1     1      557            600        -3      838
##  7  2013     1     1      558            600        -2      753
##  8  2013     1     1      558            600        -2      849
##  9  2013     1     1      558            600        -2      853
## 10  2013     1     1      559            600        -1      941
## # ... with 265,826 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
flights %>%
  filter(!(carrier %in% c("UA", "WN")))
## # A tibble: 265,836 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      542            540         2      923
##  2  2013     1     1      544            545        -1     1004
##  3  2013     1     1      554            600        -6      812
##  4  2013     1     1      555            600        -5      913
##  5  2013     1     1      557            600        -3      709
##  6  2013     1     1      557            600        -3      838
##  7  2013     1     1      558            600        -2      753
##  8  2013     1     1      558            600        -2      849
##  9  2013     1     1      558            600        -2      853
## 10  2013     1     1      559            600        -1      941
## # ... with 265,826 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Missing data

flights %>%
  filter(is.na(arr_time))
## # A tibble: 8,713 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1     2016           1930        46       NA
##  2  2013     1     1       NA           1630        NA       NA
##  3  2013     1     1       NA           1935        NA       NA
##  4  2013     1     1       NA           1500        NA       NA
##  5  2013     1     1       NA            600        NA       NA
##  6  2013     1     2     2041           2045        -4       NA
##  7  2013     1     2     2145           2129        16       NA
##  8  2013     1     2       NA           1540        NA       NA
##  9  2013     1     2       NA           1620        NA       NA
## 10  2013     1     2       NA           1355        NA       NA
## # ... with 8,703 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
flights %>%
  filter(is.na(arr_time) & is.na(dep_time))
## # A tibble: 8,255 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1       NA           1630        NA       NA
##  2  2013     1     1       NA           1935        NA       NA
##  3  2013     1     1       NA           1500        NA       NA
##  4  2013     1     1       NA            600        NA       NA
##  5  2013     1     2       NA           1540        NA       NA
##  6  2013     1     2       NA           1620        NA       NA
##  7  2013     1     2       NA           1355        NA       NA
##  8  2013     1     2       NA           1420        NA       NA
##  9  2013     1     2       NA           1321        NA       NA
## 10  2013     1     2       NA           1545        NA       NA
## # ... with 8,245 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
flights %>%
  filter(is.na(arr_time) & !is.na(dep_time))
## # A tibble: 458 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1     2016           1930        46       NA
##  2  2013     1     2     2041           2045        -4       NA
##  3  2013     1     2     2145           2129        16       NA
##  4  2013     1     9      615            615         0       NA
##  5  2013     1     9     2042           2040         2       NA
##  6  2013     1    11     1344           1350        -6       NA
##  7  2013     1    13     1907           1634       153       NA
##  8  2013     1    13     2239           2159        40       NA
##  9  2013     1    16      837            840        -3       NA
## 10  2013     1    25     1452           1500        -8       NA
## # ... with 448 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Commutativity

Arranging a smaller subset needs less work.

system.time(
  flights %>%
    arrange(arr_delay) %>% 
    filter(carrier == "UA")
)
##    user  system elapsed 
##   0.207   0.017   0.223
system.time(
  flights %>%
    filter(carrier == "UA") %>%
    arrange(arr_delay)
)
##    user  system elapsed 
##   0.039   0.008   0.047

Copyright © 2018 Kirill Müller. Licensed under CC BY-NC 4.0.