Late flights

View all flights that arrived after 10:00 PM. Use an intermediate variable, a nested expression, and the pipe. Which approach appeals most to you?

# Exercise template: replace each ___ with the filter condition.
# Approach 1: intermediate variable
flights_after_10 <- filter(flights, ___)
view(flights_after_10)
# Approach 2: nested expression
view(filter(flights, ___))
# Approach 3: pipe
flights %>%
  filter(___) %>%
  view()

Fly United

Extend the three solutions to view all "UA" flights that arrived after 10:00 PM.

# Exercise template: fill in each ___ (and the ... on the right-hand side).
# Approach 1: intermediate variables, one per filtering step
flights_after_10 <- filter(flights, ___)
ua_flights_after_10 <- ...
view(___)
# Approach 2: nested expression, one filter() wrapped in another
view(filter(filter(flights, ___), ___))
# Approach 3: pipe, one filter() per step
flights %>%
  filter(___) %>%
  filter(___) %>%
  view()

Ad infinitum, 1

Extend the three solutions to view all "UA" flights that departed before 6:00 PM and arrived after 10:00 PM.

Ad infinitum, 2

Extend the three solutions to view all "UA" flights that departed before 6:00 PM and arrived after 10:00 PM and had an arrival delay of more than two hours.

Ad infinitum, 3

Extend the three solutions to view all "UA" flights that departed before 6:00 PM and arrived after 10:00 PM and had an arrival delay of more than two hours, originating in one of New York City’s airports.

Ad infinitum, 4

Extend the three solutions to view all "UA" flights that departed before 6:00 PM and arrived after 10:00 PM and had an arrival delay of more than two hours, originating in one of New York City’s airports but excluding flights to Honolulu International Airport.

Hint: Use dest != "HNL" as the predicate.

Ad infinitum, 5

Sort the result by distance.

► Solution:

Intermediate variables

Naming is hard!

# Intermediate-variable solution: every step filters the previous step's
# result and stores it under a new, progressively longer name.
# Step 1: arr_time of 2200 or later (the exercise's "after 10:00 PM"; note
# that `>=` also keeps rows where arr_time is exactly 2200).
late_flights <-
  filter(flights, arr_time >= 2200)
# Step 2: keep only rows whose carrier is "UA".
late_ua_flights <-
  filter(late_flights, carrier == "UA")
# Step 3: dep_time below 1800 (the exercise's "departed before 6:00 PM").
early_late_ua_flights <-
  filter(late_ua_flights, dep_time < 1800)
# Step 4: arr_delay above 120 (the exercise's "more than two hours").
early_late_late_ua_flights <-
  filter(early_late_ua_flights, arr_delay > 120)
# NOTE(review): no origin filter is applied for the "New York City airports"
# step; presumably every row of `flights` already departs from NYC -- confirm.
# Step 5: drop rows whose destination is "HNL".
early_late_late_ua_flights_not_honolulu <-
  filter(early_late_late_ua_flights, dest != "HNL")
# Step 6: sort the remaining rows by distance.
early_late_late_ua_flights_not_honolulu_sorted <-
  arrange(
    early_late_late_ua_flights_not_honolulu,
    distance
  )
# Show the final, sorted result.
view(early_late_late_ua_flights_not_honolulu_sorted)
## # A tibble: 7 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     4    10     1740           1716        24     2216
## 2  2013     7     7     1756           1710        46     2230
## 3  2013     8     8     1754           1710        44     2213
## 4  2013     8     9     1734           1710        24     2209
## 5  2013     7    28     1747           1505       162     2220
## 6  2013     8     8     1753           1609       104     2235
## 7  2013     7     7     1734           1540       114     2218
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Nested expressions

Difficult to read.

# Nested-expression solution: the same steps as a single call. Evaluation
# runs inside-out, so the innermost filter() is applied first and view() last.
view(
  arrange(
    filter(
      filter(
        filter(
          filter(
            filter(
              # Innermost call: start from the full table.
              flights,
              arr_time >= 2200
            ),
            carrier == "UA"
          ),
          dep_time < 1800
        ),
        arr_delay > 120
      ),
      dest != "HNL"
    ),
    # Sort key for arrange(), applied after all filters.
    distance
  )
)
## # A tibble: 7 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     4    10     1740           1716        24     2216
## 2  2013     7     7     1756           1710        46     2230
## 3  2013     8     8     1754           1710        44     2213
## 4  2013     8     9     1734           1710        24     2209
## 5  2013     7    28     1747           1505       162     2220
## 6  2013     8     8     1753           1609       104     2235
## 7  2013     7     7     1734           1540       114     2218
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Pipe

# Pipe solution: each %>% passes the result on its left as the first argument
# of the call on its right, so the steps read top to bottom in execution order.
flights %>% 
  filter(arr_time >= 2200) %>% 
  filter(carrier == "UA") %>% 
  filter(dep_time < 1800) %>% 
  filter(arr_delay > 120) %>% 
  filter(dest != "HNL") %>%
  arrange(distance) %>%
  # The result is only displayed here; it is not stored anywhere.
  view()
## # A tibble: 7 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     4    10     1740           1716        24     2216
## 2  2013     7     7     1756           1710        46     2230
## 3  2013     8     8     1754           1710        44     2213
## 4  2013     8     9     1734           1710        24     2209
## 5  2013     7    28     1747           1505       162     2220
## 6  2013     8     8     1753           1609       104     2235
## 7  2013     7     7     1734           1540       114     2218
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

The original data is never updated! You still need to assign the result of a pipe to a variable:

# Print `flights` again: none of the code above assigned anything back to it.
flights
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Same filter/arrange pipeline as the pipe solution, but without view(): the
# result is assigned to a new variable so it can be reused later.
late_late_ua_flights_not_honolulu <-
  flights %>% 
  filter(arr_time >= 2200) %>% 
  filter(carrier == "UA") %>% 
  filter(dep_time < 1800) %>% 
  filter(arr_delay > 120) %>% 
  filter(dest != "HNL") %>%
  arrange(distance)

# Print the stored result.
late_late_ua_flights_not_honolulu
## # A tibble: 7 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     4    10     1740           1716        24     2216
## 2  2013     7     7     1756           1710        46     2230
## 3  2013     8     8     1754           1710        44     2213
## 4  2013     8     9     1734           1710        24     2209
## 5  2013     7    28     1747           1505       162     2220
## 6  2013     8     8     1753           1609       104     2235
## 7  2013     7     7     1734           1540       114     2218
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Copyright © 2019 Kirill Müller. Licensed under CC BY-NC 4.0.