Select first five variables

Find three ways to select the first five variables from the flights dataset.

flights %>% 
  select(___, ___, ________)

flights %>% 
  select(___:___)

flights %>% 
  select(___:___)

► Solution:

flights %>%
  select(year, month, day, dep_time, sched_dep_time)
## # A tibble: 336,776 x 5
##     year month   day dep_time sched_dep_time
##    <int> <int> <int>    <int>          <int>
##  1  2013     1     1      517            515
##  2  2013     1     1      533            529
##  3  2013     1     1      542            540
##  4  2013     1     1      544            545
##  5  2013     1     1      554            600
##  6  2013     1     1      554            558
##  7  2013     1     1      555            600
##  8  2013     1     1      557            600
##  9  2013     1     1      557            600
## 10  2013     1     1      558            600
## # ... with 336,766 more rows
flights %>%
  select(year:sched_dep_time)
## # A tibble: 336,776 x 5
##     year month   day dep_time sched_dep_time
##    <int> <int> <int>    <int>          <int>
##  1  2013     1     1      517            515
##  2  2013     1     1      533            529
##  3  2013     1     1      542            540
##  4  2013     1     1      544            545
##  5  2013     1     1      554            600
##  6  2013     1     1      554            558
##  7  2013     1     1      555            600
##  8  2013     1     1      557            600
##  9  2013     1     1      557            600
## 10  2013     1     1      558            600
## # ... with 336,766 more rows
## Numeric indexes work, too

flights %>%
  select(1:5)
## # A tibble: 336,776 x 5
##     year month   day dep_time sched_dep_time
##    <int> <int> <int>    <int>          <int>
##  1  2013     1     1      517            515
##  2  2013     1     1      533            529
##  3  2013     1     1      542            540
##  4  2013     1     1      544            545
##  5  2013     1     1      554            600
##  6  2013     1     1      554            558
##  7  2013     1     1      555            600
##  8  2013     1     1      557            600
##  9  2013     1     1      557            600
## 10  2013     1     1      558            600
## # ... with 336,766 more rows

Exclude the date

Find three ways to exclude the date of the flight.

flights %>% 
  select(___, ___, ______________________)

flights %>% 
  select(-___, -___, -___)

flights %>% 
  select(-___:-___)

► Solution:

flights %>%
  select(-year, -month, -day)
## # A tibble: 336,776 x 16
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
##       <int>          <int>     <dbl>    <int>          <int>     <dbl>
##  1      517            515         2      830            819        11
##  2      533            529         4      850            830        20
##  3      542            540         2      923            850        33
##  4      544            545        -1     1004           1022       -18
##  5      554            600        -6      812            837       -25
##  6      554            558        -4      740            728        12
##  7      555            600        -5      913            854        19
##  8      557            600        -3      709            723       -14
##  9      557            600        -3      838            846        -8
## 10      558            600        -2      753            745         8
## # ... with 336,766 more rows, and 10 more variables: carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
  select(-year:-day)
## # A tibble: 336,776 x 16
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
##       <int>          <int>     <dbl>    <int>          <int>     <dbl>
##  1      517            515         2      830            819        11
##  2      533            529         4      850            830        20
##  3      542            540         2      923            850        33
##  4      544            545        -1     1004           1022       -18
##  5      554            600        -6      812            837       -25
##  6      554            558        -4      740            728        12
##  7      555            600        -5      913            854        19
##  8      557            600        -3      709            723       -14
##  9      557            600        -3      838            846        -8
## 10      558            600        -2      753            745         8
## # ... with 336,766 more rows, and 10 more variables: carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
## Numeric indexes work, too

flights %>%
  select(-1:-3)
## # A tibble: 336,776 x 16
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
##       <int>          <int>     <dbl>    <int>          <int>     <dbl>
##  1      517            515         2      830            819        11
##  2      533            529         4      850            830        20
##  3      542            540         2      923            850        33
##  4      544            545        -1     1004           1022       -18
##  5      554            600        -6      812            837       -25
##  6      554            558        -4      740            728        12
##  7      555            600        -5      913            854        19
##  8      557            600        -3      709            723       -14
##  9      557            600        -3      838            846        -8
## 10      558            600        -2      753            745         8
## # ... with 336,766 more rows, and 10 more variables: carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Departure variables

Select all variables related to departure.

flights %>% 
  select(___, ___, _______)

flights %>% 
  select(starts_with("___"))

► Solution:

## contains()

flights %>% 
  select(contains("dep_"))
## # A tibble: 336,776 x 3
##    dep_time sched_dep_time dep_delay
##       <int>          <int>     <dbl>
##  1      517            515         2
##  2      533            529         4
##  3      542            540         2
##  4      544            545        -1
##  5      554            600        -6
##  6      554            558        -4
##  7      555            600        -5
##  8      557            600        -3
##  9      557            600        -3
## 10      558            600        -2
## # ... with 336,766 more rows

Move departure variables to end

Move the variables related to departure to the end of the table.

flights %>% 
  select(-___, -___, _______, everything(), ___, ___)

► Solution:

## everything()

flights %>%
  select(-contains("dep_"), everything(), contains("dep_"))
## # A tibble: 336,776 x 19
##     year month   day arr_time sched_arr_time arr_delay carrier flight
##    <int> <int> <int>    <int>          <int>     <dbl> <chr>    <int>
##  1  2013     1     1      830            819        11 UA        1545
##  2  2013     1     1      850            830        20 UA        1714
##  3  2013     1     1      923            850        33 AA        1141
##  4  2013     1     1     1004           1022       -18 B6         725
##  5  2013     1     1      812            837       -25 DL         461
##  6  2013     1     1      740            728        12 UA        1696
##  7  2013     1     1      913            854        19 B6         507
##  8  2013     1     1      709            723       -14 EV        5708
##  9  2013     1     1      838            846        -8 B6          79
## 10  2013     1     1      753            745         8 AA         301
## # ... with 336,766 more rows, and 11 more variables: tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, dep_time <int>, sched_dep_time <int>,
## #   dep_delay <dbl>

Contour plot

Create a contour plot of departure time versus arrival time. Rename the columns so that the axis labels display nicely in the plot. Restrict the plot to flights that arrive before 5:00 AM.

flights %>%
  select(`___` = ___, `___` = ___, ___) %>%
  filter(___) %>%
  ggplot(aes(x = `___`, y = `___`)) +
  geom_density2d()

► Solution:

## Overriding label names

flights %>% 
  ggplot() +
  geom_density2d(aes(dep_time, arr_time)) +
  scale_x_continuous(name = "Departure time") +
  scale_y_continuous(name = "Arrival time")
## Warning: Removed 8713 rows containing non-finite values (stat_density2d).

## Renaming in the data

flights %>% 
  rename(`Departure time` = dep_time) %>%
  rename(`Arrival time` = arr_time) %>%
  ggplot() +
  geom_density2d(aes(`Departure time`, `Arrival time`))
## Warning: Removed 8713 rows containing non-finite values (stat_density2d).

flights %>% 
  filter(arr_time < 500) %>%
  rename(`Departure time` = dep_time) %>%
  rename(`Arrival time` = arr_time) %>%
  ggplot() +
  geom_density2d(aes(`Departure time`, `Arrival time`))

## Use coord_fixed() to fix the aspect ratio of the axes
flights %>% 
  filter(arr_time < 500) %>%
  rename(`Departure time` = dep_time) %>%
  rename(`Arrival time` = arr_time) %>%
  ggplot() +
  geom_density2d(aes(`Departure time`, `Arrival time`)) +
  coord_fixed()

More exercises

Find more exercises in Section 5.4.1 of R for Data Science (r4ds).

Copyright © 2018 Kirill Müller. Licensed under CC BY-NC 4.0.