2 Function basics

Structuring the code to avoid too much copy-pasting

This chapter discusses functions as building blocks for more expressive and more powerful data analysis code.

2.1 Definition and execution

The following packages are used for this chapter.

library(tidyverse)
library(here)

Create functions for tasks that need to be executed repeatedly, or to hide implementation details.

# Read the weather data for all four cities and combine it into one table.
#
# Takes no arguments. Each city's Excel file is located with here() below
# "data/weather" and read with readxl::read_excel().
# Returns the row-bound data of all cities; the names given to bind_rows()
# end up in the "city_code" column.
read_weather_data <- function() {
  # Read all files
  berlin <- readxl::read_excel(here("data/weather", "berlin.xlsx"))
  toronto <- readxl::read_excel(here("data/weather", "toronto.xlsx"))
  tel_aviv <- readxl::read_excel(here("data/weather", "tel_aviv.xlsx"))
  zurich <- readxl::read_excel(here("data/weather", "zurich.xlsx"))

  # Create ensemble dataset
  weather_data <- bind_rows(
    berlin = berlin,
    toronto = toronto,
    tel_aviv = tel_aviv,
    zurich = zurich,
    .id = "city_code"
  )

  # Return it
  weather_data
}

Display the code of any function by writing its name without the subsequent parentheses:

read_weather_data

## function() {
##   # Read all files
##   berlin <- readxl::read_excel(here("data/weather", "berlin.xlsx"))
##   toronto <- readxl::read_excel(here("data/weather", "toronto.xlsx"))
##   tel_aviv <- readxl::read_excel(here("data/weather", "tel_aviv.xlsx"))
##   zurich <- readxl::read_excel(here("data/weather", "zurich.xlsx"))
## 
##   # Create ensemble dataset
##   weather_data <- bind_rows(
##     berlin = berlin,
##     toronto = toronto,
##     tel_aviv = tel_aviv,
##     zurich = zurich,
##     .id = "city_code"
##   )
## 
##   # Return it
##   weather_data
## }
## <environment: 0x36a96e8>

Call the function by adding the parentheses:

read_weather_data()

## # A tibble: 196 x 19
##   city_code time                summary icon  precipIntensity
##   <chr>     <dttm>              <chr>   <chr>           <dbl>
## 1 berlin    2019-04-28 15:00:00 Mostly… part…               0
## 2 berlin    2019-04-28 16:00:00 Mostly… part…               0
## 3 berlin    2019-04-28 17:00:00 Mostly… part…               0
## # … with 193 more rows, and 14 more variables: precipProbability <dbl>,
## #   temperature <dbl>, apparentTemperature <dbl>, dewPoint <dbl>,
## #   humidity <dbl>, pressure <dbl>, windSpeed <dbl>, windGust <dbl>,
## #   windBearing <dbl>, cloudCover <dbl>, uvIndex <dbl>, visibility <dbl>,
## #   ozone <dbl>, precipType <chr>

Execution of the function does not create new variables in the global environment. The only object in the global environment is the function itself:

ls()

## [1] "read_weather_data"

A function can also be used as input for a pipe:

read_weather_data() %>%
  count(city_code)

## # A tibble: 4 x 2
##   city_code     n
##   <chr>     <int>
## 1 berlin       49
## 2 tel_aviv     49
## 3 toronto      49
## 4 zurich       49

To reuse the value returned by a function, assign it to a variable:

weather_data <- read_weather_data()

2.1.1 Exercises

Create a modified version of the function to return only data for Toronto and Tel Aviv. Call it.

read_weather_data_non_europe <- function() {
  _______
}

_______

## # A tibble: 98 x 19
 ##   city_code time                summary icon  precipIntensity
 ##   <chr>     <dttm>              <chr>   <chr>           <dbl>
 ## 1 toronto   2019-04-28 15:00:00 Partly… part…               0
 ## 2 toronto   2019-04-28 16:00:00 Clear   clea…               0
 ## 3 toronto   2019-04-28 17:00:00 Clear   clea…               0
 ## # … with 95 more rows, and 14 more variables: precipProbability <dbl>,
 ## #   temperature <dbl>, apparentTemperature <dbl>, dewPoint <dbl>,
 ## #   humidity <dbl>, pressure <dbl>, windSpeed <dbl>, windGust <dbl>,
 ## #   windBearing <dbl>, cloudCover <dbl>, uvIndex <dbl>, visibility <dbl>,
 ## #   ozone <dbl>, precipType <chr>

Compute the number of rows for the European cities; count observations to validate the result:
```
nrow(_____) - nrow(_____)
```
```
## [1] 98
```

2.2 Arguments

Click here to show setup code.

library(tidyverse)
library(here)

By adding arguments to your functions, you can turn them into tools for a wide range of applications. But it is advisable to be conservative here: try to minimise the number of arguments to the necessary ones, so the user has a clear and intuitive interface to deal with.

Functions with arguments:

# Build the path to a file in the weather data directory.
#
# filename: name of a file below "data/weather".
# Returns the path produced by here() for that directory and file name.
weather_path <- function(filename) {
  weather_dir <- "data/weather"

  # The last expression is the value of the function
  here(weather_dir, filename)
}

weather_path("milan.xlsx")

## [1] "/home/travis/build/krlmlr/tidyprog/data/weather/milan.xlsx"

Call functions from within functions:

# Read the weather data for all four cities and combine it into one table.
#
# Takes no arguments; file locations are resolved through weather_path().
# Returns the row-bound data with the city identifier in "city_code".
read_weather_data <- function() {
  # Load one table per city
  berlin_data <- readxl::read_excel(weather_path("berlin.xlsx"))
  toronto_data <- readxl::read_excel(weather_path("toronto.xlsx"))
  tel_aviv_data <- readxl::read_excel(weather_path("tel_aviv.xlsx"))
  zurich_data <- readxl::read_excel(weather_path("zurich.xlsx"))

  # Stack the tables and return the result directly; the argument names
  # become the values of the "city_code" column
  bind_rows(
    berlin = berlin_data,
    toronto = toronto_data,
    tel_aviv = tel_aviv_data,
    zurich = zurich_data,
    .id = "city_code"
  )
}

The function still needs to be called for testing it. It is good practice to always test a newly created or updated function immediately by running it:

read_weather_data()

## # A tibble: 196 x 19
##   city_code time                summary icon  precipIntensity
##   <chr>     <dttm>              <chr>   <chr>           <dbl>
## 1 berlin    2019-04-28 15:00:00 Mostly… part…               0
## 2 berlin    2019-04-28 16:00:00 Mostly… part…               0
## 3 berlin    2019-04-28 17:00:00 Mostly… part…               0
## # … with 193 more rows, and 14 more variables: precipProbability <dbl>,
## #   temperature <dbl>, apparentTemperature <dbl>, dewPoint <dbl>,
## #   humidity <dbl>, pressure <dbl>, windSpeed <dbl>, windGust <dbl>,
## #   windBearing <dbl>, cloudCover <dbl>, uvIndex <dbl>, visibility <dbl>,
## #   ozone <dbl>, precipType <chr>

2.2.1 Exercises

How does the behavior of read_weather_data() change if we update the definition of the weather_path() function as follows:
```
# Exercise variant of weather_path(): the directory is handed to here() as
# two separate components ("data", "weather") instead of the single
# "data/weather" string.
weather_path <- function(filename) {
  # Returned value
  here("data", "weather", filename)
}
```
Hint: Define this function with a different name and check its output values, before running read_weather_data() again.

2.3 Use case: Intermediate variables

Click here to show setup code.

library(tidyverse)
library(here)

# Build the path to a file in the weather data directory.
#
# filename: name of a file below "data/weather".
# Returns the path produced by here() for that directory and file name.
weather_path <- function(filename) {
  weather_dir <- "data/weather"

  # The last expression is the value of the function
  here(weather_dir, filename)
}

We start with the function weather_path() from section “Arguments”.

Functions can help to avoid having to use intermediate variables:

# Read one weather Excel file, given only its file name.
#
# filename: file name that weather_path() turns into a path.
# Returns the result of readxl::read_excel() for that path.
read_weather_file <- function(filename) {
  filename %>%
    weather_path() %>%
    readxl::read_excel()
}

# Read the weather data for all four cities and combine it into one table.
#
# Takes no arguments. Returns the row-bound data with the city identifier
# in "city_code".
read_weather_data <- function() {
  # Each file is read right where it is needed, and the combined table is
  # returned as the value of the last expression
  bind_rows(
    berlin = read_weather_file("berlin.xlsx"),
    toronto = read_weather_file("toronto.xlsx"),
    tel_aviv = read_weather_file("tel_aviv.xlsx"),
    zurich = read_weather_file("zurich.xlsx"),
    .id = "city_code"
  )
}

read_weather_data()

## # A tibble: 196 x 19
##   city_code time                summary icon  precipIntensity
##   <chr>     <dttm>              <chr>   <chr>           <dbl>
## 1 berlin    2019-04-28 15:00:00 Mostly… part…               0
## 2 berlin    2019-04-28 16:00:00 Mostly… part…               0
## 3 berlin    2019-04-28 17:00:00 Mostly… part…               0
## # … with 193 more rows, and 14 more variables: precipProbability <dbl>,
## #   temperature <dbl>, apparentTemperature <dbl>, dewPoint <dbl>,
## #   humidity <dbl>, pressure <dbl>, windSpeed <dbl>, windGust <dbl>,
## #   windBearing <dbl>, cloudCover <dbl>, uvIndex <dbl>, visibility <dbl>,
## #   ozone <dbl>, precipType <chr>

2.3.1 Exercises

Implement a helper function get_weather_file_for() that takes a city code as input and returns the file name for the corresponding Excel file. Intended usage: get_weather_file_for("berlin"). Test this function on a few example inputs.
```
get_weather_file_for <- _____ {
  paste0(city_code, ".xlsx")
}
```
```
get_weather_file_for("munich")
```
```
## [1] "munich.xlsx"
```
```
get_weather_file_for("san_diego")
```
```
## [1] "san_diego.xlsx"
```

Implement a helper function get_weather_data_for() that takes a city code as input (as opposed to a file name). Intended usage: get_weather_data_for("berlin"). Update read_weather_data() to use get_weather_data_for().

get_weather_data_for <- _____ {
  read_weather_file(_____)
}

get_weather_data_for("toronto")

## # A tibble: 49 x 18
 ##   time                summary icon  precipIntensity precipProbabili…
 ##   <dttm>              <chr>   <chr>           <dbl>            <dbl>
 ## 1 2019-04-28 15:00:00 Partly… part…               0                0
 ## 2 2019-04-28 16:00:00 Clear   clea…               0                0
 ## 3 2019-04-28 17:00:00 Clear   clea…               0                0
 ## # … with 46 more rows, and 13 more variables: temperature <dbl>,
 ## #   apparentTemperature <dbl>, dewPoint <dbl>, humidity <dbl>,
 ## #   pressure <dbl>, windSpeed <dbl>, windGust <dbl>, windBearing <dbl>,
 ## #   cloudCover <dbl>, uvIndex <dbl>, visibility <dbl>, ozone <dbl>,
 ## #   precipType <chr>

2.4 Default values

Click here to show setup code.

library(tidyverse)
library(here)


# Build the path to a file in the weather data directory.
#
# filename: name of a file below "data/weather".
# Returns the path produced by here() for that directory and file name.
weather_path <- function(filename) {
  weather_dir <- "data/weather"

  # The last expression is the value of the function
  here(weather_dir, filename)
}

# Read one weather Excel file, given only its file name.
#
# filename: file name that weather_path() turns into a path.
# Returns the result of readxl::read_excel() for that path.
read_weather_file <- function(filename) {
  filename %>%
    weather_path() %>%
    readxl::read_excel()
}

# Derive the Excel file name that belongs to a city code.
#
# city_code: city identifier such as "berlin".
# Returns the city code with ".xlsx" appended.
get_weather_file_for <- function(city_code) {
  extension <- ".xlsx"
  paste0(city_code, extension)
}

# Read the weather data for one city, identified by its city code.
#
# city_code: city identifier such as "berlin".
# Returns whatever read_weather_file() returns for the matching file name.
get_weather_data_for <- function(city_code) {
  city_code %>%
    get_weather_file_for() %>%
    read_weather_file()
}

For user-friendliness it is often good practice to provide default values for parameters.

We start with the function get_weather_data_for() from section “Intermediate variables”.

Here is an example of a boolean argument which, when TRUE, leads to dropping the data about Zurich.

# Read the weather data for all four cities, optionally without Zurich.
#
# omit_zurich: if TRUE, rows with city_code "zurich" are dropped
#   (default: FALSE, keep everything).
# Returns the row-bound data with the city identifier in "city_code".
read_weather_data <- function(omit_zurich = FALSE) {
  # All four files are always read; the filtering happens afterwards
  all_cities <- bind_rows(
    berlin = get_weather_data_for("berlin"),
    toronto = get_weather_data_for("toronto"),
    tel_aviv = get_weather_data_for("tel_aviv"),
    zurich = get_weather_data_for("zurich"),
    .id = "city_code"
  )

  # Keep a row unless it belongs to Zurich and omit_zurich is set
  filter(all_cities, city_code != "zurich" | !omit_zurich)
}

Arguments with default values can be set explicitly, with or without using the name, or left out to use the default value:

read_weather_data(TRUE)

## # A tibble: 147 x 19
##   city_code time                summary icon  precipIntensity
##   <chr>     <dttm>              <chr>   <chr>           <dbl>
## 1 berlin    2019-04-28 15:00:00 Mostly… part…               0
## 2 berlin    2019-04-28 16:00:00 Mostly… part…               0
## 3 berlin    2019-04-28 17:00:00 Mostly… part…               0
## # … with 144 more rows, and 14 more variables: precipProbability <dbl>,
## #   temperature <dbl>, apparentTemperature <dbl>, dewPoint <dbl>,
## #   humidity <dbl>, pressure <dbl>, windSpeed <dbl>, windGust <dbl>,
## #   windBearing <dbl>, cloudCover <dbl>, uvIndex <dbl>, visibility <dbl>,
## #   ozone <dbl>, precipType <chr>

read_weather_data(omit_zurich = TRUE)

## # A tibble: 147 x 19
##   city_code time                summary icon  precipIntensity
##   <chr>     <dttm>              <chr>   <chr>           <dbl>
## 1 berlin    2019-04-28 15:00:00 Mostly… part…               0
## 2 berlin    2019-04-28 16:00:00 Mostly… part…               0
## 3 berlin    2019-04-28 17:00:00 Mostly… part…               0
## # … with 144 more rows, and 14 more variables: precipProbability <dbl>,
## #   temperature <dbl>, apparentTemperature <dbl>, dewPoint <dbl>,
## #   humidity <dbl>, pressure <dbl>, windSpeed <dbl>, windGust <dbl>,
## #   windBearing <dbl>, cloudCover <dbl>, uvIndex <dbl>, visibility <dbl>,
## #   ozone <dbl>, precipType <chr>

read_weather_data()

## # A tibble: 196 x 19
##   city_code time                summary icon  precipIntensity
##   <chr>     <dttm>              <chr>   <chr>           <dbl>
## 1 berlin    2019-04-28 15:00:00 Mostly… part…               0
## 2 berlin    2019-04-28 16:00:00 Mostly… part…               0
## 3 berlin    2019-04-28 17:00:00 Mostly… part…               0
## # … with 193 more rows, and 14 more variables: precipProbability <dbl>,
## #   temperature <dbl>, apparentTemperature <dbl>, dewPoint <dbl>,
## #   humidity <dbl>, pressure <dbl>, windSpeed <dbl>, windGust <dbl>,
## #   windBearing <dbl>, cloudCover <dbl>, uvIndex <dbl>, visibility <dbl>,
## #   ozone <dbl>, precipType <chr>

2.4.1 Exercises

Update get_weather_data_for() to return Zurich data if called without arguments. Is this a good idea?

get_weather_data_for <- _____ {
  _____
}

get_weather_data_for() %>% 
  select(temperature)

## # A tibble: 49 x 1
 ##   temperature
 ##         <dbl>
 ## 1        6.96
 ## 2        7.14
 ## 3        7.32
 ## # … with 46 more rows

get_weather_data_for("tel_aviv") %>% 
  select(temperature)

## # A tibble: 49 x 1
 ##   temperature
 ##         <dbl>
 ## 1        23.9
 ## 2        23.1
 ## 3        22.4
 ## # … with 46 more rows

2.5 Multiple arguments

Click here to show setup code.

library(tidyverse)
library(here)


# Build the path to a file in the weather data directory.
#
# filename: name of a file below "data/weather".
# Returns the path produced by here() for that directory and file name.
weather_path <- function(filename) {
  weather_dir <- "data/weather"

  # The last expression is the value of the function
  here(weather_dir, filename)
}

# Read one weather Excel file, given only its file name.
#
# filename: file name that weather_path() turns into a path.
# Returns the result of readxl::read_excel() for that path.
read_weather_file <- function(filename) {
  filename %>%
    weather_path() %>%
    readxl::read_excel()
}

# Derive the Excel file name that belongs to a city code.
#
# city_code: city identifier such as "berlin".
# Returns the city code with ".xlsx" appended.
get_weather_file_for <- function(city_code) {
  extension <- ".xlsx"
  paste0(city_code, extension)
}

# Read the weather data for one city, identified by its city code.
#
# city_code: city identifier such as "berlin".
# Returns whatever read_weather_file() returns for the matching file name.
get_weather_data_for <- function(city_code) {
  city_code %>%
    get_weather_file_for() %>%
    read_weather_file()
}

We start once more with the functions weather_path() from section “Arguments” and get_weather_data_for() from section “Intermediate variables”.

What are the considerations when using multiple function arguments? You can add new parameters in a very straightforward manner like this:

# Read the weather data for all four cities, optionally leaving out Zurich
# and/or Toronto.
#
# omit_zurich: if TRUE, rows with city_code "zurich" are dropped.
# omit_toronto: if TRUE, rows with city_code "toronto" are dropped.
# Both default to FALSE, which keeps all rows.
# Returns the row-bound data with the city identifier in "city_code".
read_weather_data <- function(omit_zurich = FALSE, omit_toronto = FALSE) {
  # All four files are always read; the filtering happens afterwards
  all_cities <- bind_rows(
    berlin = get_weather_data_for("berlin"),
    toronto = get_weather_data_for("toronto"),
    tel_aviv = get_weather_data_for("tel_aviv"),
    zurich = get_weather_data_for("zurich"),
    .id = "city_code"
  )

  # A row is kept only if it passes both conditions: it is not a Zurich row
  # while omit_zurich is set, and not a Toronto row while omit_toronto is set
  filter(
    all_cities,
    city_code != "zurich" | !omit_zurich,
    city_code != "toronto" | !omit_toronto
  )
}

Prefer passing arguments by name rather than only giving the value, especially if the intent of the value is not clear from just reading it.

# Good:
read_weather_data(omit_zurich = TRUE)

## # A tibble: 147 x 19
##   city_code time                summary icon  precipIntensity
##   <chr>     <dttm>              <chr>   <chr>           <dbl>
## 1 berlin    2019-04-28 15:00:00 Mostly… part…               0
## 2 berlin    2019-04-28 16:00:00 Mostly… part…               0
## 3 berlin    2019-04-28 17:00:00 Mostly… part…               0
## # … with 144 more rows, and 14 more variables: precipProbability <dbl>,
## #   temperature <dbl>, apparentTemperature <dbl>, dewPoint <dbl>,
## #   humidity <dbl>, pressure <dbl>, windSpeed <dbl>, windGust <dbl>,
## #   windBearing <dbl>, cloudCover <dbl>, uvIndex <dbl>, visibility <dbl>,
## #   ozone <dbl>, precipType <chr>

read_weather_data(omit_toronto = TRUE)

## # A tibble: 147 x 19
##   city_code time                summary icon  precipIntensity
##   <chr>     <dttm>              <chr>   <chr>           <dbl>
## 1 berlin    2019-04-28 15:00:00 Mostly… part…               0
## 2 berlin    2019-04-28 16:00:00 Mostly… part…               0
## 3 berlin    2019-04-28 17:00:00 Mostly… part…               0
## # … with 144 more rows, and 14 more variables: precipProbability <dbl>,
## #   temperature <dbl>, apparentTemperature <dbl>, dewPoint <dbl>,
## #   humidity <dbl>, pressure <dbl>, windSpeed <dbl>, windGust <dbl>,
## #   windBearing <dbl>, cloudCover <dbl>, uvIndex <dbl>, visibility <dbl>,
## #   ozone <dbl>, precipType <chr>

# Bad:
read_weather_data(TRUE)

## # A tibble: 147 x 19
##   city_code time                summary icon  precipIntensity
##   <chr>     <dttm>              <chr>   <chr>           <dbl>
## 1 berlin    2019-04-28 15:00:00 Mostly… part…               0
## 2 berlin    2019-04-28 16:00:00 Mostly… part…               0
## 3 berlin    2019-04-28 17:00:00 Mostly… part…               0
## # … with 144 more rows, and 14 more variables: precipProbability <dbl>,
## #   temperature <dbl>, apparentTemperature <dbl>, dewPoint <dbl>,
## #   humidity <dbl>, pressure <dbl>, windSpeed <dbl>, windGust <dbl>,
## #   windBearing <dbl>, cloudCover <dbl>, uvIndex <dbl>, visibility <dbl>,
## #   ozone <dbl>, precipType <chr>

Use the so-called ellipsis (...) when you want to allow the user to call your function with an unspecified number of arguments. This can be useful, e.g., for passing arguments downstream:

# Build a path below the weather data directory from any number of
# path components.
#
# ...: passed on to here() after the "data/weather" directory.
# Returns the path produced by here().
weather_path <- function(...) {
  weather_dir <- "data/weather"

  # Everything the caller supplied is forwarded unchanged
  here(weather_dir, ...)
}

weather_path("berlin.xlsx")

## [1] "/home/travis/build/krlmlr/tidyprog/data/weather/berlin.xlsx"

weather_path("some", "subdir", "with", "a", "file.csv")

## [1] "/home/travis/build/krlmlr/tidyprog/data/weather/some/subdir/with/a/file.csv"

Note that despite altering the original function and adding new features to it, the original call still works as before:

read_weather_data()

## # A tibble: 196 x 19
##   city_code time                summary icon  precipIntensity
##   <chr>     <dttm>              <chr>   <chr>           <dbl>
## 1 berlin    2019-04-28 15:00:00 Mostly… part…               0
## 2 berlin    2019-04-28 16:00:00 Mostly… part…               0
## 3 berlin    2019-04-28 17:00:00 Mostly… part…               0
## # … with 193 more rows, and 14 more variables: precipProbability <dbl>,
## #   temperature <dbl>, apparentTemperature <dbl>, dewPoint <dbl>,
## #   humidity <dbl>, pressure <dbl>, windSpeed <dbl>, windGust <dbl>,
## #   windBearing <dbl>, cloudCover <dbl>, uvIndex <dbl>, visibility <dbl>,
## #   ozone <dbl>, precipType <chr>

2.5.1 Exercises

What does the following return? Why?
```
read_weather_data(TRUE, omit_z = FALSE) %>%
  count(city_code)
```
See the next section for ideas on avoiding this behavior.

2.6 Argument matching

Click here to show setup code.

How does R handle function calls with arguments?

Named arguments are assigned first; after that, the remaining slots are filled from left to right.

# Report which values ended up in the arguments `one` and `two`.
#
# one, two: arbitrary values (defaults 1 and 2).
# Returns a list with elements "one" and "two" holding the received values.
use_names <- function(one = 1, two = 2) {
  received <- list(
    one = one,
    two = two
  )
  received
}

use_names(3, 4)

## $one
## [1] 3
## 
## $two
## [1] 4

use_names(one = 3, 4)

## $one
## [1] 3
## 
## $two
## [1] 4

use_names(3, one = 4)

## $one
## [1] 4
## 
## $two
## [1] 3

use_names(one = 3, two = 4)

## $one
## [1] 3
## 
## $two
## [1] 4

use_names(two = 3, one = 4)

## $one
## [1] 4
## 
## $two
## [1] 3

Arguments are matched partially, which can be convenient but is also a source of errors.

use_names(o = 3, 4)

## $one
## [1] 3
## 
## $two
## [1] 4

use_names(3, o = 4)

## $one
## [1] 4
## 
## $two
## [1] 3

use_names(o = 3, t = 4)

## $one
## [1] 3
## 
## $two
## [1] 4

use_names(t = 3, o = 4)

## $one
## [1] 4
## 
## $two
## [1] 3

The ellipsis can be used to force the user to fully name the function parameters when setting them:

# Report which values ended up in the arguments `one` and `two` when both
# are declared after the ellipsis.
#
# ...: accepted but not used by the function body.
# one, two: arbitrary values (defaults 1 and 2).
# Returns a list with elements "one" and "two" holding the received values.
only_names <- function(..., one = 1, two = 2) {
  received <- list(
    one = one,
    two = two
  )
  received
}

only_names(3, 4)

## $one
## [1] 1
## 
## $two
## [1] 2

only_names(one = 3, 4)

## $one
## [1] 3
## 
## $two
## [1] 2

only_names(one = 3, two = 4)

## $one
## [1] 3
## 
## $two
## [1] 4

only_names(o = 3, t = 4)

## $one
## [1] 1
## 
## $two
## [1] 2

Inside a function with an ellipsis as a parameter, you can capture the ellipsis with list():

# Show the names under which arguments arrived in the ellipsis.
#
# ...: any arguments, named or unnamed.
# Returns the names of the list built from the ellipsis.
ellipsis_test <- function(...) {
  # list(...) captures the ellipsis; only its names are of interest here
  names(list(...))
}

ellipsis_test(a = 1, 2, c = 3:5)

## [1] "a" ""  "c"

Arguments in ellipsis can be accessed with ..1, ..2 etc.

# Pick the first two ellipsis arguments by position.
#
# ...: any arguments; the body reads the first and the second one.
# Returns an unnamed list holding those two values.
ellipsis_direct_test <- function(...) {
  first <- ..1
  second <- ..2
  list(first, second)
}

ellipsis_direct_test(a = 1, 2, c = 3:5)

## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2

2.6.1 Exercises

Naming, not naming or partly naming parameters in function calls? What does the following return and why?

# Report which values ended up in the arguments `one` and `two` when the
# ellipsis is declared between them.
#
# one, two: arbitrary values (defaults 1 and 2).
# ...: accepted but not used by the function body.
# Returns a list with elements "one" and "two" holding the received values.
use_some_names <- function(one = 1, ..., two = 2) {
  received <- list(
    one = one,
    two = two
  )
  received
}

use_some_names(3, 4)
use_some_names(one = 3, 4)
use_some_names(3, one = 4)
use_some_names(one = 3, two = 4)
use_some_names(two = 4, 3)

Model a new enforce_names() function after only_names() to check if any unnamed or misnamed arguments have been used. Test this function in various combinations.

enforce_names <- function(..., one = 1, two = 2) {
  extra_args <- _____
  stopifnot(length(_____) == 0)

  list(_____)
}

try(enforce_names(3, 4))

## Error in enforce_names(3, 4) : length(extra_args) == 0 is not TRUE

try(enforce_names(one = 3, 4))

## Error in enforce_names(one = 3, 4) : length(extra_args) == 0 is not TRUE

try(enforce_names(3, one = 4))

## Error in enforce_names(3, one = 4) : length(extra_args) == 0 is not TRUE

try(enforce_names(two = 4, 3))

## Error in enforce_names(two = 4, 3) : length(extra_args) == 0 is not TRUE

try(enforce_names(o = 3, t = 4))

## Error in enforce_names(o = 3, t = 4) : 
##   length(extra_args) == 0 is not TRUE

enforce_names(one = 3, two = 4)

## $one
## [1] 3
## 
## $two
## [1] 4