Apply functions
sapply
- Use function to find mass for one volume
# Estimate mass from volume using the relationship
# mass = 2.65 * volume^0.9.
#
# volume: a numeric value (or numeric vector; the arithmetic is element-wise).
# Returns the estimated mass, the same length as `volume`.
est_mass <- function(volume) {
  2.65 * volume^0.9
}
shrub_vol1 <- 1.6
est_mass(shrub_vol1)
- Easy to find mass for other volumes
shrub_vol2 <- 5.6
est_mass(shrub_vol2)
shrub_vol3 <- 3.1
est_mass(shrub_vol3)
- Typing this to get each shrub’s mass is tedious and error-prone
- Use apply()-type functions instead
shrub_vols <- c(1.6, 5.6, 3.1)
sapply(shrub_vols, est_mass)
- Do same action on many things with single line of code!
- Easily scales up
Do Task 1 of Use and Modify with Apply.
Other apply functions
- Handful of similar functions in the apply() family
- Differ depending on type of input and output data
- lapply() is like sapply() but returns a list
lapply(shrub_vols, est_mass)
- apply() works on multi-dimensional data
- mapply() is for functions with multiple arguments
# Estimate mass from volume, but only for trees.
#
# volume:   a single numeric volume.
# veg_type: a single string naming the vegetation type.
# Returns 2.65 * volume^0.9 when veg_type is "tree", otherwise NA.
# The `if` needs one TRUE/FALSE value, so call this once per plant
# (e.g. through mapply()) rather than with whole vectors.
est_mass_type <- function(volume, veg_type) {
  is_tree <- veg_type == "tree"
  if (is_tree) 2.65 * volume^0.9 else NA
}
est_mass_type(1.6, "tree")
plant_vols <- c(1.6, 3, 8)
plant_types <- c("tree", "grass", "tree")
mapply(est_mass_type, volume = plant_vols, veg_type = plant_types)
- First argument is function, rest are function arguments
Do Task 2 of Use and Modify with Apply.
tidyverse version of apply
- Use the map() function from the purrr package
- Similar to apply
library(purrr)
map(plant_vols, est_mass)
- Use with pipes
library(dplyr)
# Put the volumes in a data frame so they can flow through a dplyr pipeline.
plant_vols_df <- data.frame(vols = plant_vols)
# Keep only rows with a volume above 2, then hand the result to map().
# A data frame is a list of columns, so map() calls est_mass() once per
# column (here: once, on the whole `vols` column) and returns a list.
plant_vols_df %>%
  filter(vols > 2) %>%
  map(est_mass)
For loops
Set up R console:
library(stringr)
library(dplyr)
Basic for loop
- Do same action to each component of a list
waterbirds <- c("cygnus olor", "aix sponsa", "anas acuta")
waterbird <- waterbirds[1]
print(waterbird)
waterbird <- waterbirds[2]
print(waterbird)
waterbird <- waterbirds[3]
print(waterbird)
- This is tedious
- Use for loop to do same action repeatedly
- Easier & fewer errors
for (item in list_of_items) {
do_something(item)
}
- Need print() to display values inside a loop, function, or conditional.
for (waterbird in waterbirds){
print(waterbird)
}
- Do more actions
for (waterbird in waterbirds){
waterbird_cap <- str_to_title(waterbird)
print(waterbird_cap)
}
Do Basic Vector.
Numeric values in for loops
- Do functions or math as actions within for loops
- Variable can be given any name, then refer to with that name in loop
for (num in 100:150){
print(num * 10)
}
- Use paste() to put together strings and variables
for (num in 100:150){
print(paste("My favorite number is", num * 10))
}
Do Basic Index tasks 1-2.
Storing results
- Create an empty object.
output <- c()
- Iteratively add new values to object.
output <- c(1, 2, 3)
output <- c(output, 4)
- Use this method within a for loop to save outputs.
# Start with an empty object that will collect the capitalized names.
waterbirds_cap_list <- c()
for (waterbird in waterbirds){
# Convert the current name to title case.
waterbird_cap <- str_to_title(waterbird)
# Append the new value to the end of the results collected so far.
waterbirds_cap_list <- c(waterbirds_cap_list, waterbird_cap)
# Show the accumulated results on every pass to see the vector grow.
print(waterbirds_cap_list)
}
# Display the final result once the loop has finished.
waterbirds_cap_list
Do Basic Index task 3.
Looping in data frames
- Loops go over columns of dataframes
waterbirds <- data.frame(sci_name = c("cygnus olor",
"aix sponsa",
"anas acuta"),
common_name = c("mute swan",
"wood duck",
"pintail"))
for (waterbird in waterbirds){
print("Start new loop")
print(waterbird)
}
- Can loop over rows of data frames using index
# Loop over row numbers. seq_len(n) gives 1, 2, ..., n and, unlike 1:n,
# gives an empty sequence (zero iterations) when the data frame has no rows.
for (i in seq_len(nrow(waterbirds))) {
  print(i)
}
- Index can be any letter/word, i is convention
# Same loop with a different index name. seq_len() is used instead of
# 1:nrow() so that a data frame with zero rows results in zero iterations.
for (r in seq_len(nrow(waterbirds))) {
  print(r)
}
# Use the row index to pull one value out of a column on each pass.
# seq_len() is used instead of 1:nrow() so that a data frame with zero
# rows results in zero iterations.
for (i in seq_len(nrow(waterbirds))) {
  print(waterbirds$sci_name[i])
}
# The same index can be used with several columns to combine values
# from one row.
for (i in seq_len(nrow(waterbirds))) {
  print(paste(waterbirds$sci_name[i], "is a",
              waterbirds$common_name[i]))
}
- Creating the full-size (empty) data frame before the loop uses less memory
- Adding rows one at a time creates a copy of the data frame each time
# Preallocate the output data frame with one row per row of `waterbirds`,
# so rows are filled in place rather than appended.
waterbirds_2 <- data.frame(capital_name = character(nrow(waterbirds)),
                           name_length = numeric(nrow(waterbirds)),
                           stringsAsFactors = FALSE)
# seq_len() results in zero iterations when `waterbirds` has no rows.
for (i in seq_len(nrow(waterbirds))) {
  common_name_cap <- str_to_title(waterbirds$common_name[i])
  sci_name_length <- str_length(waterbirds$sci_name[i])
  # Assign each column separately. Combining the two values with c() would
  # coerce the length to a string and turn name_length into a character
  # column.
  waterbirds_2$capital_name[i] <- common_name_cap
  waterbirds_2$name_length[i] <- sci_name_length
}
Do stringr.
Looping over files
- Repeat same actions on many similar files
- Get names of satellite collar location files
# Download the archive of collar data and extract it into the working
# directory.
download.file("http://www.datacarpentry.org/semester-biology/data/collar-data-2016-01.zip",
              "collar_data.zip")
unzip("collar_data.zip")
# `pattern` is a regular expression: the dot before "txt" is escaped so it
# matches a literal ".", and "$" anchors the match to the end of the file
# name so only .txt files are returned.
collar_data_files <- list.files(pattern = "collar-data-.*\\.txt$",
                                full.names = TRUE)
- Look at one of the files
- Three ways to do same thing: get number of samples in each file
- With loop
# Count the rows in every collar data file with a plain loop.
numbers_vector_1 <- c()
for (data_file in collar_data_files) {
  file <- read.csv(data_file)
  number <- nrow(file)
  # Append this file's row count to the results collected so far.
  numbers_vector_1 <- c(numbers_vector_1, number)
}
- With function and loop
# Read one CSV file and return its number of rows.
#
# data_file_name: path to a CSV file readable by read.csv().
# Returns the row count of the data frame read from that file.
get_numbers <- function(data_file_name) {
  file <- read.csv(data_file_name)
  nrow(file)
}

# Call the function once per file and collect the results in a vector.
numbers_vector_2 <- c()
for (data_file in collar_data_files) {
  numbers_vector_2 <- c(numbers_vector_2, get_numbers(data_file))
}
- With function and apply
# lapply() calls get_numbers() on each file name and returns a list;
# unlist() flattens that list into a plain vector.
numbers_vector_3 <- unlist(lapply(X = collar_data_files, FUN = get_numbers))
- How to choose when there are many ways to do the same thing?
- Speed
- Matters in few cases
- Hard to identify bottlenecks
- Readability
- Easy to understand
- Personal preference
- Speed
- There is no “right” way to do anything
Do Multiple Files.