Be careful with the equality operator ==
:
flights %>%
filter(month = 6, day = 2)
## Error: `month` (`month = 6`), `day` (`day = 2`) must not be named, do you need `==`?
flights %>%
filter(month == 6, day == 2)
## # A tibble: 911 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 6 2 14 2359 15 339
## 2 2013 6 2 20 2155 145 222
## 3 2013 6 2 24 2245 99 133
## 4 2013 6 2 33 2059 214 150
## 5 2013 6 2 35 2130 185 332
## 6 2013 6 2 36 1914 322 223
## 7 2013 6 2 44 2359 45 420
## 8 2013 6 2 128 2159 209 325
## 9 2013 6 2 131 2146 225 229
## 10 2013 6 2 219 2055 324 322
## # ... with 901 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
Can we make a dynamic query?
flights %>%
filter(
month == lubridate::month(Sys.Date()),
day == lubridate::day(Sys.Date())
)
## # A tibble: 976 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 11 13 1 2359 2 442
## 2 2013 11 13 458 500 -2 632
## 3 2013 11 13 538 545 -7 809
## 4 2013 11 13 544 515 29 817
## 5 2013 11 13 549 600 -11 831
## 6 2013 11 13 551 600 -9 638
## 7 2013 11 13 551 600 -9 729
## 8 2013 11 13 552 600 -8 717
## 9 2013 11 13 552 600 -8 805
## 10 2013 11 13 553 600 -7 806
## # ... with 966 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
Can we use arguments?
month_ <- lubridate::month(Sys.Date())
day_ <- lubridate::day(Sys.Date())
flights %>%
filter(
month == month_,
day == day_
)
## # A tibble: 976 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 11 13 1 2359 2 442
## 2 2013 11 13 458 500 -2 632
## 3 2013 11 13 538 545 -7 809
## 4 2013 11 13 544 515 29 817
## 5 2013 11 13 549 600 -11 831
## 6 2013 11 13 551 600 -9 638
## 7 2013 11 13 551 600 -9 729
## 8 2013 11 13 552 600 -8 717
## 9 2013 11 13 552 600 -8 805
## 10 2013 11 13 553 600 -7 806
## # ... with 966 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
flights %>%
filter(dep_time >= 800, dep_time <= 2200)
## # A tibble: 267,608 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 800 800 0 1022
## 2 2013 1 1 800 810 -10 949
## 3 2013 1 1 801 805 -4 900
## 4 2013 1 1 803 810 -7 903
## 5 2013 1 1 803 800 3 1132
## 6 2013 1 1 804 810 -6 1103
## 7 2013 1 1 805 805 0 1015
## 8 2013 1 1 805 800 5 1118
## 9 2013 1 1 805 815 -10 1006
## 10 2013 1 1 807 810 -3 1043
## # ... with 267,598 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
flights %>%
filter(between(dep_time, 800, 2200))
## # A tibble: 267,608 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 800 800 0 1022
## 2 2013 1 1 800 810 -10 949
## 3 2013 1 1 801 805 -4 900
## 4 2013 1 1 803 810 -7 903
## 5 2013 1 1 803 800 3 1132
## 6 2013 1 1 804 810 -6 1103
## 7 2013 1 1 805 805 0 1015
## 8 2013 1 1 805 800 5 1118
## 9 2013 1 1 805 815 -10 1006
## 10 2013 1 1 807 810 -3 1043
## # ... with 267,598 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
flights %>%
filter(month %in% c(12, 1, 2))
## # A tibble: 80,090 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 80,080 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
winter_months <- c(12, 1, 2)
flights %>%
filter(month %in% winter_months)
## # A tibble: 80,090 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 80,080 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
flights %>%
filter(dep_time > arr_time)
## # A tibble: 10,633 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 1929 1920 9 3
## 2 2013 1 1 1939 1840 59 29
## 3 2013 1 1 2058 2100 -2 8
## 4 2013 1 1 2102 2108 -6 146
## 5 2013 1 1 2108 2057 11 25
## 6 2013 1 1 2120 2130 -10 16
## 7 2013 1 1 2121 2040 41 6
## 8 2013 1 1 2128 2135 -7 26
## 9 2013 1 1 2134 2045 49 20
## 10 2013 1 1 2136 2145 -9 25
## # ... with 10,623 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
flights %>%
arrange(air_time) %>%
head(1)
## # A tibble: 1 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 16 1355 1315 40 1442
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
flights %>%
arrange(arr_delay) %>%
tail(1)
## # A tibble: 1 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 9 30 NA 840 NA NA
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
Why doesn’t this give the result we’re looking for? Can we use a filter?
flights %>%
filter(!is.na(arr_delay)) %>%
arrange(arr_delay) %>%
tail(1)
## # A tibble: 1 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
Or the pattern below?
flights %>%
arrange(!is.na(arr_delay), arr_delay) %>%
tail(1)
## # A tibble: 1 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
Usually it’s easiest to sort in descending order:
flights %>%
arrange(-arr_delay) %>%
head(1)
## # A tibble: 1 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
flights %>%
arrange(desc(arr_delay)) %>%
head(1)
## # A tibble: 1 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
flights %>%
arrange(desc(air_time)) %>%
head(1)
## # A tibble: 1 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 3 17 1337 1335 2 1937
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
If we filter first, fewer observations need to be sorted.
flights %>%
filter(carrier == "UA") %>%
arrange(arr_delay)
## # A tibble: 58,665 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 5 2 1947 1949 -2 2209
## 2 2013 5 2 1926 1929 -3 2157
## 3 2013 5 7 2054 2055 -1 2317
## 4 2013 2 26 1335 1335 0 1819
## 5 2013 2 26 1721 1725 -4 1936
## 6 2013 2 28 702 705 -3 924
## 7 2013 5 13 1624 1629 -5 1831
## 8 2013 5 4 1914 1915 -1 2107
## 9 2013 12 27 853 856 -3 1052
## 10 2013 3 1 629 632 -3 844
## # ... with 58,655 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
flights %>%
arrange(arr_delay) %>%
filter(carrier == "UA")
## # A tibble: 58,665 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 5 2 1947 1949 -2 2209
## 2 2013 5 2 1926 1929 -3 2157
## 3 2013 5 7 2054 2055 -1 2317
## 4 2013 2 26 1335 1335 0 1819
## 5 2013 2 26 1721 1725 -4 1936
## 6 2013 2 28 702 705 -3 924
## 7 2013 5 13 1624 1629 -5 1831
## 8 2013 5 4 1914 1915 -1 2107
## 9 2013 12 27 853 856 -3 1052
## 10 2013 3 1 629 632 -3 844
## # ... with 58,655 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
flights %>%
arrange(dep_delay - arr_delay)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 11 1 658 700 -2 1329
## 2 2013 4 18 558 600 -2 1149
## 3 2013 8 8 1819 1519 180 5
## 4 2013 7 10 1916 1900 16 137
## 5 2013 6 27 1608 1525 43 2045
## 6 2013 7 22 1606 1615 -9 2056
## 7 2013 7 1 811 800 11 1344
## 8 2013 7 10 2011 1520 291 2357
## 9 2013 7 22 1626 1545 41 2051
## 10 2013 4 18 655 700 -5 1213
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
Copyright © 2018 Kirill Müller. Licensed under CC BY-NC 4.0.