- For/while loops to iterate over data
apply
map
,map_<type>
,map_at
,map_if
mutate_at
,mutate_if
summarize_at
,summarize_if
Fall 2020
apply
map
, map_<type>
, map_at
, map_if
mutate_at
, mutate_if
summarize_at
, summarize_if
# Our favourite library library(tidyverse) # For Cars93 data again Cars93 <- MASS::Cars93 # For the clean survey data: survey <- read.csv("http://www.andrew.cmu.edu/user/achoulde/94842/data/survey_data2020.csv", header=TRUE, stringsAsFactors = FALSE)
We’ll now learn about loops and some more efficient/syntactically simple loop alternatives
Loops are a way of iterating over data
for(i in 1:4) { print(i) }
## [1] 1 ## [1] 2 ## [1] 3 ## [1] 4
phrase <- "Good Night," for(word in c("and", "Good", "Luck")) { phrase <- paste(phrase, word) print(phrase) }
## [1] "Good Night, and" ## [1] "Good Night, and Good" ## [1] "Good Night, and Good Luck"
A for loop executes a chunk of code for every value of an index variable in an index set
for(index.variable in index.set) { code to be repeated at every value of index.variable }
index.set <- list(name="Michael", weight=185, is.male=TRUE) # a list for(i in index.set) { print(c(i, typeof(i))) }
## [1] "Michael" "character" ## [1] "185" "double" ## [1] "TRUE" "logical"
fake.data <- matrix(rnorm(500), ncol=5) # create fake 100 x 5 data set head(fake.data,2) # print first two rows
## [,1] [,2] [,3] [,4] [,5] ## [1,] -0.7567250 0.2771538 -0.5701664 0.5888506 -0.8897146 ## [2,] 0.5451021 0.6679214 -0.5304780 -0.6796072 -0.5847928
col.sums <- numeric(ncol(fake.data)) # variable to store running column sums for(i in 1:nrow(fake.data)) { col.sums <- col.sums + fake.data[i,] # add ith observation to the sum } col.sums
## [1] 8.118425 6.569486 20.780103 -2.837246 5.183321
colSums(fake.data) # A better approach (see also colMeans())
## [1] 8.118425 6.569486 20.780103 -2.837246 5.183321
day <- 1 num.days <- 365 while(day <= num.days) { day <- day + 1 }
We won’t really be using while loops in this class
Just be aware that they exist, and that they may become useful to you at some point in your analytics career
Command | Description |
---|---|
apply(X, MARGIN, FUN) |
Obtain a vector/array/list by applying FUN along the specified MARGIN of an array or matrix X |
map(.x, .f, ...) |
Obtain a list by applying .f to every element of a list or atomic vector .x |
map_<type>(.x, .f, ...) |
For <type> given by lgl (logical), int (integer), dbl (double) or chr (character), return a vector of this type obtained by applying .f to each element of .x |
map_at(.x, .at, .f) |
Obtain a list by applying .f to the elements of .x specified by name or index given in .at |
map_if(.x, .p, .f) |
Obtain a list by applying .f to the elements of .x selected by .p (a predicate function, or a logical vector) |
mutate_all/_at/_if |
Mutate all variables, specified (at) variables, or those selected by a predicate (if) |
summarize_all/_at/_if |
Summarize all variables, specified (at) variables, or those selected by a predicate (if) |
These take practice to get used to, but make analysis easier to debug and less prone to error when used effectively
The best way to learn them is by looking at a bunch of examples. The end of each help file contains some examples.
colMeans(fake.data)
## [1] 0.08118425 0.06569486 0.20780103 -0.02837246 0.05183321
apply(fake.data, MARGIN=2, FUN=mean) # MARGIN = 1 for rows, 2 for columns
## [1] 0.08118425 0.06569486 0.20780103 -0.02837246 0.05183321
# Function that calculates the proportion of vector elements that are > 0 propPositive <- function(x) mean(x > 0) apply(fake.data, MARGIN=2, FUN=propPositive)
## [1] 0.50 0.53 0.50 0.52 0.55
map(survey, is.numeric) # Returns a list
## $Program ## [1] FALSE ## ## $PriorExp ## [1] FALSE ## ## $Rexperience ## [1] FALSE ## ## $OperatingSystem ## [1] FALSE ## ## $TVhours ## [1] TRUE ## ## $Editor ## [1] FALSE
map_lgl(survey, is.numeric) # Returns a logical vector with named elements
## Program PriorExp Rexperience OperatingSystem ## FALSE FALSE FALSE FALSE ## TVhours Editor ## TRUE FALSE
apply(cars, 2, FUN=mean) # apply() coerces a data frame to a matrix, so it works here too
## speed dist ## 15.40 42.98
map(cars, mean) # Data frames are also lists
## $speed ## [1] 15.4 ## ## $dist ## [1] 42.98
map_dbl(cars, mean) # map output as a double vector
## speed dist ## 15.40 42.98
Let’s convert all factor variables in Cars93 to lowercase
head(Cars93$Type)
## [1] Small Midsize Compact Midsize Midsize Midsize ## Levels: Compact Large Midsize Small Sporty Van
Cars93.lower <- mutate_if(Cars93, is.factor, tolower) head(Cars93.lower$Type)
## [1] "small" "midsize" "compact" "midsize" "midsize" "midsize"
This produces a copy of the Cars93 data where all of the factor variables have been replaced with versions containing lowercase values.
If you pass the functions in as a list with named elements, those names get appended to the original variable names to create new, modified versions of the variables instead of replacing the existing variables.
Cars93.lower <- mutate_if(Cars93, is.factor, list(lower = tolower)) head(Cars93.lower$Type)
## [1] Small Midsize Compact Midsize Midsize Midsize ## Levels: Compact Large Midsize Small Sporty Van
head(Cars93.lower$Type_lower)
## [1] "small" "midsize" "compact" "midsize" "midsize" "midsize"
Let’s convert from MPG to KMPL (kilometres per litre), but this time using mutate_at
Cars93.metric <- Cars93 %>% mutate_at(c("MPG.city", "MPG.highway"), list(KMPL = ~ 0.425 * .x)) tail(colnames(Cars93.metric))
## [1] "Luggage.room" "Weight" "Origin" ## [4] "Make" "MPG.city_KMPL" "MPG.highway_KMPL"
Here, ~ 0.425 * .x
is an example of specifying a “lambda” (anonymous) function. It is a permitted shorthand for
function(.x){0.425 * .x}
Let’s get the mean of every numeric column in Cars93
Cars93 %>% summarize_if(is.numeric, mean)
## Min.Price Price Max.Price MPG.city MPG.highway EngineSize Horsepower ## 1 17.12581 19.50968 21.89892 22.36559 29.08602 2.667742 143.828 ## RPM Rev.per.mile Fuel.tank.capacity Passengers Length Wheelbase ## 1 5280.645 2332.204 16.66452 5.086022 183.2043 103.9462 ## Width Turn.circle Rear.seat.room Luggage.room Weight ## 1 69.37634 38.95699 NA NA 3072.903
Cars93 %>% summarize_if(is.numeric, list(mean = mean), na.rm=TRUE)
## Min.Price_mean Price_mean Max.Price_mean MPG.city_mean MPG.highway_mean ## 1 17.12581 19.50968 21.89892 22.36559 29.08602 ## EngineSize_mean Horsepower_mean RPM_mean Rev.per.mile_mean ## 1 2.667742 143.828 5280.645 2332.204 ## Fuel.tank.capacity_mean Passengers_mean Length_mean Wheelbase_mean ## 1 16.66452 5.086022 183.2043 103.9462 ## Width_mean Turn.circle_mean Rear.seat.room_mean Luggage.room_mean ## 1 69.37634 38.95699 27.82967 13.89024 ## Weight_mean ## 1 3072.903
Let’s get the average fuel economy of all vehicles, grouped by their Type
Cars93 %>% group_by(Type) %>% summarize_at(c("MPG.city", "MPG.highway"), mean)
## # A tibble: 6 x 3 ## Type MPG.city MPG.highway ## <fct> <dbl> <dbl> ## 1 Compact 22.7 29.9 ## 2 Large 18.4 26.7 ## 3 Midsize 19.5 26.7 ## 4 Small 29.9 35.5 ## 5 Sporty 21.8 28.8 ## 6 Van 17 21.9
We’ll learn about a bunch of select helper functions like contains()
and starts_with()
.
Here’s one way of performing the previous operation with the help of these functions, and appending _mean
to the resulting output.
Cars93 %>% group_by(Type) %>% summarize_at(vars(contains("MPG")), list(mean = mean))
## # A tibble: 6 x 3 ## Type MPG.city_mean MPG.highway_mean ## <fct> <dbl> <dbl> ## 1 Compact 22.7 29.9 ## 2 Large 18.4 26.7 ## 3 Midsize 19.5 26.7 ## 4 Small 29.9 35.5 ## 5 Sporty 21.8 28.8 ## 6 Van 17 21.9
Cars93 %>% group_by(Origin, AirBags) %>% summarize_at(vars(contains("MPG")), list(mean = mean))
## # A tibble: 6 x 4 ## # Groups: Origin [2] ## Origin AirBags MPG.city_mean MPG.highway_mean ## <fct> <fct> <dbl> <dbl> ## 1 USA Driver & Passenger 19 27.2 ## 2 USA Driver only 20.2 27.5 ## 3 USA None 23.1 29.6 ## 4 non-USA Driver & Passenger 20.3 27 ## 5 non-USA Driver only 23.2 29.4 ## 6 non-USA None 25.9 32