# Add the average speed of every flight as a new column.
# The factor 60 turns the per-minute ratio distance / air_time into a
# per-hour figure (presumably air_time is recorded in minutes -- confirm
# against the data set documentation).
mutate(
  flights,
  miles_per_hour = distance / air_time * 60
)
## # A tibble: 336,776 x 20
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 13 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, miles_per_hour <dbl>
# Same speed column, built via an intermediate per-minute rate.
# Within one mutate() call later expressions can refer to columns created
# earlier in the same call, so the helper column is defined and consumed in
# a single step and then dropped again.
flights %>%
  mutate(
    miles_per_minute = distance / air_time,
    miles_per_hour = miles_per_minute * 60
  ) %>%
  select(-miles_per_minute)
## # A tibble: 336,776 x 20
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 13 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, miles_per_hour <dbl>
# Histogram of flight speed in bins of 20 mph.
# na.rm = TRUE silences the warning about rows whose speed is missing.
flights %>%
  mutate(miles_per_hour = distance / air_time * 60) %>%
  ggplot(aes(x = miles_per_hour)) +
  geom_histogram(binwidth = 20, na.rm = TRUE)
# The transformation can also be computed inline inside aes().
# Neither binwidth nor na.rm is given here, so ggplot2 falls back to its
# default of 30 bins and reports the dropped non-finite values.
ggplot(flights, aes(x = distance / air_time)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9430 rows containing non-finite values (stat_bin).
# Flag flights that arrived on time (arrival delay of zero or less) and add
# a human-readable label derived from that flag.
mutate(
  flights,
  on_time = arr_delay <= 0,
  on_time_desc = if_else(on_time, "On time", "Delayed")
)
## # A tibble: 336,776 x 21
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 14 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, on_time <lgl>, on_time_desc <chr>
# Bar chart of flight counts per carrier, stacked by on-time status.
flights %>%
  mutate(
    on_time = arr_delay <= 0,
    on_time_desc = if_else(on_time, "On time", "Delayed")
  ) %>%
  ggplot() +
  geom_bar(aes(x = carrier, fill = on_time_desc))
# Build a reusable table that carries both derived columns:
#   miles_per_hour - average speed of the flight
#   on_time_desc   - "On time" / "Delayed" label based on arr_delay
# The helper columns miles_per_minute and on_time are only needed while
# computing the final columns and are removed at the end.
speed_and_on_time_info <-
  flights %>%
  mutate(
    miles_per_minute = distance / air_time,
    miles_per_hour = miles_per_minute * 60,
    on_time = arr_delay <= 0,
    on_time_desc = if_else(on_time, "On time", "Delayed")
  ) %>%
  select(-miles_per_minute, -on_time)
# Compare the speed distribution of on-time and delayed flights.
# Plotting the density instead of the raw count makes the curves comparable
# even though the groups differ in size.
# after_stat(density) refers to the value computed by the binning stat; it
# replaces the `..density..` notation, which is deprecated in current
# ggplot2 releases and triggers a warning.
speed_and_on_time_info %>%
  ggplot() +
  geom_freqpoly(
    aes(
      x = miles_per_hour,
      y = after_stat(density),
      color = on_time_desc
    ),
    na.rm = TRUE,
    binwidth = 20
  )
# One speed histogram per on-time status, stacked vertically.
# Rows without a status label are removed first so that no separate panel
# is drawn for missing values.
speed_and_on_time_info %>%
  filter(!is.na(on_time_desc)) %>%
  ggplot(aes(x = miles_per_hour)) +
  geom_histogram(binwidth = 20, na.rm = TRUE) +
  facet_wrap(~on_time_desc, ncol = 1)
# Consistency check: the calendar date implied by time_hour must agree with
# the date assembled from the year / month / day columns. Any row returned
# here would indicate a mismatch.
# lubridate::as_date() converts using the time zone stored on the column
# itself. A fixed "EST" offset ignores daylight saving time and would shift
# timestamps in the first hour after local midnight to the previous day
# during summer.
flights %>%
  mutate(
    date_hour = lubridate::as_date(time_hour),
    date_ymd = lubridate::make_date(year, month, day)
  ) %>%
  filter(date_hour != date_ymd)
## # A tibble: 0 x 21
## # ... with 21 variables: year <int>, month <int>, day <int>,
## # dep_time <int>, sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # date_hour <date>, date_ymd <date>
# Distribution of the departure delay per origin airport, expressed as the
# deviation from the overall mean departure delay (missing delays are
# ignored when computing the mean and when drawing).
flights %>%
  mutate(dev = dep_delay - mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = origin, y = dev)) +
  geom_violin(na.rm = TRUE)
Copyright © 2018 Kirill Müller. Licensed under CC BY-NC 4.0.