Advanced Skills Workshop

Workshop overview:

For loops
Nested for loops
Break, 5-10 mins
Conditional (if/else) statements
Custom functions
Additional practice problem

1 GETTING STARTED

#install.packages('openintro')
#install.packages('dplyr')
#install.packages('gapminder')
#install.packages('ggplot2')
#install.packages('viridis)

library(openintro)

## Loading required package: airports

## Loading required package: cherryblossom

## Loading required package: usdata

library(dplyr)

##
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
##
##     filter, lag

## The following objects are masked from 'package:base':
##
##     intersect, setdiff, setequal, union

library(gapminder)
library(ggplot2)
library(viridis)

## Loading required package: viridisLite

We will be using three datasets in this workshop:

‘countries’ from the gapminder package - Macroeconomic (e.g. GDP per capita) and demographic (e.g. population) data for every country in the world, by year

countries <- gapminder # create a data frame
head(countries) #see the first six rows of the dataframe

## # A tibble: 6 × 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.

str(countries) #look at the structure of the dataframe

## tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
##  $ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num [1:1704] 779 821 853 836 740 ...

names(countries) #variables in the dataset

## [1] "country"   "continent" "year"      "lifeExp"   "pop"       "gdpPercap"

‘storms’ from the dplyr package - Location and weather data for hurricanes in the US

head(storms)

## # A tibble: 6 × 13
##   name   year month   day  hour   lat  long status categ…¹  wind press…² tropi…³
##   <chr> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <chr>  <ord>   <int>   <int>   <int>
## 1 Amy    1975     6    27     0  27.5 -79   tropi… -1         25    1013      NA
## 2 Amy    1975     6    27     6  28.5 -79   tropi… -1         25    1013      NA
## 3 Amy    1975     6    27    12  29.5 -79   tropi… -1         25    1013      NA
## 4 Amy    1975     6    27    18  30.5 -79   tropi… -1         25    1013      NA
## 5 Amy    1975     6    28     0  31.5 -78.8 tropi… -1         25    1012      NA
## 6 Amy    1975     6    28     6  32.4 -78.7 tropi… -1         25    1012      NA
## # … with 1 more variable: hurricane_force_diameter <int>, and abbreviated
## #   variable names ¹category, ²pressure, ³tropicalstorm_force_diameter

‘midterms’ from the openintro package - midterm election results by year, party, and unemployment rate

midterms <- midterms_house
head(midterms)

## # A tibble: 6 × 5
##    year potus               party      unemp house_change
##   <int> <fct>               <fct>      <dbl>        <dbl>
## 1  1899 William McKinley    Republican 11.6         -9.22
## 2  1903 Theodore Roosevelt  Republican  4.3         -4.28
## 3  1907 Theodore Roosevelt  Republican  3.29       -12.3
## 4  1911 William Howard Taft Republican  5.86       -26.6
## 5  1915 Woodrow Wilson      Democrat    6.63       -21.0
## 6  1919 Woodrow Wilson      Democrat    3.38       -10.3

?midterms_house #look at dataset documentation

## starting httpd help server ... done

2 FOR LOOPS

2.1 Basic structure

for (i in vector/list/matrix/dataframe){ some operation with i }

one step in a loop = one ‘iteration’

numeric vector example:

#a simple loop:

numbers <- 1:10 #creating a vector of numbers 1 through 10

for (i in numbers){
  print(i)
}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10

for (i in numbers){
  print(i)
  Sys.sleep(0.5) #this staggers each output by 0.5 seconds
}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10

#in this illustrative example, you need to include print, or the loop will run without producing output.

numeric vector example:

other_numbers <- c(4,7,5,2,2,3,1)

for (i in other_numbers){
  print(i) #i is looped through in the given order
}

## [1] 4
## [1] 7
## [1] 5
## [1] 2
## [1] 2
## [1] 3
## [1] 1

character vector example:

letters #displaying built-in alphabet vector

##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s"
## [20] "t" "u" "v" "w" "x" "y" "z"

for (i in letters){
  print(i)
}

## [1] "a"
## [1] "b"
## [1] "c"
## [1] "d"
## [1] "e"
## [1] "f"
## [1] "g"
## [1] "h"
## [1] "i"
## [1] "j"
## [1] "k"
## [1] "l"
## [1] "m"
## [1] "n"
## [1] "o"
## [1] "p"
## [1] "q"
## [1] "r"
## [1] "s"
## [1] "t"
## [1] "u"
## [1] "v"
## [1] "w"
## [1] "x"
## [1] "y"
## [1] "z"

matrix <- matrix(nrow = 3, ncol = 5, c(1:15))
matrix

##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    4    7   10   13
## [2,]    2    5    8   11   14
## [3,]    3    6    9   12   15

for (i in matrix){
  print(i)
}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15

dataframe example:

###looping through the unique political parties in our midterms dataset
for (i in unique(midterms$party)){
  print(i)
}

## [1] "Republican"
## [1] "Democrat"

#note i is an object and never in quotes. i is commonly-used
#but arbitrary. You can make it more meaningful to you if thats easier, e.g.:

for (Parties in unique(midterms$party)){
  print(Parties)
}

## [1] "Republican"
## [1] "Democrat"

Applying an operation by looping through the parties in the midterms dataset

###finding the maximum value of house_change for each party
#if we wanted to do this for ONE party, e.g. Republican

max(midterms$house_change[midterms$party == 'Republican'])

## [1] 3.61991

#uses square brackets to subset, will be using this throughout the workshop

individual components we need to build our for loop:

unique(midterms$party)
max(midterms$house_change[midterms$party == Party])
for loop struture:

for (i in vector){ operation }

#building this into a for loop, for all parties
for (i in unique(midterms$party)){
  x <- max(midterms$house_change[midterms$party == i])
  print(x)
}

## [1] 3.61991
## [1] 2.875399

#can save as an object or embed max within print , i.e
#print(max(countries$gdpPercap[countries$country == i]))

2.1.1 PRACTICE TOGETHER

Another example, with a different dataset: finding the average latitude for each storm in the dataset

individual components we need to build our for loop:

unique(storms$name)
mean(storms$lat[storms$name == Storm name])
for loop struture:

for (i in vector){ operation }

#storms dataset, from dplyr
head(storms)

## # A tibble: 6 × 13
##   name   year month   day  hour   lat  long status categ…¹  wind press…² tropi…³
##   <chr> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <chr>  <ord>   <int>   <int>   <int>
## 1 Amy    1975     6    27     0  27.5 -79   tropi… -1         25    1013      NA
## 2 Amy    1975     6    27     6  28.5 -79   tropi… -1         25    1013      NA
## 3 Amy    1975     6    27    12  29.5 -79   tropi… -1         25    1013      NA
## 4 Amy    1975     6    27    18  30.5 -79   tropi… -1         25    1013      NA
## 5 Amy    1975     6    28     0  31.5 -78.8 tropi… -1         25    1012      NA
## 6 Amy    1975     6    28     6  32.4 -78.7 tropi… -1         25    1012      NA
## # … with 1 more variable: hurricane_force_diameter <int>, and abbreviated
## #   variable names ¹category, ²pressure, ³tropicalstorm_force_diameter

#finding the average latitude for each storm in the dataset

PRACTICE:

Using the ‘countries’ dataset, use a for loop to find the maximum value of GDP per capita (gdpPercap) for each country

#answer:

2.2 Saving the output of your for loop:

lets create a new dataset called ’country_sum, of summary variables by country, with country as one column and max gdp per capita as another

#creating empty vectors for the values we want to store
country <- 0
maxgdp <- 0

#to store values in a vector, we need to create a count, to add values to a vector as we move through each iteration of the loop
index <- 0

#vector[x] points to position x in the vector

for (i in unique(countries$country)){
  maxgdp[index] <- max(countries$gdpPercap[countries$country == i])
    #create vector of max gdpPercap values by country
  country[index] <- i #create vectors of countries
  index <- index + 1 #add one to the index to avoid replacement
  print(index) #print index at each iteration (not required)
}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
## [1] 45
## [1] 46
## [1] 47
## [1] 48
## [1] 49
## [1] 50
## [1] 51
## [1] 52
## [1] 53
## [1] 54
## [1] 55
## [1] 56
## [1] 57
## [1] 58
## [1] 59
## [1] 60
## [1] 61
## [1] 62
## [1] 63
## [1] 64
## [1] 65
## [1] 66
## [1] 67
## [1] 68
## [1] 69
## [1] 70
## [1] 71
## [1] 72
## [1] 73
## [1] 74
## [1] 75
## [1] 76
## [1] 77
## [1] 78
## [1] 79
## [1] 80
## [1] 81
## [1] 82
## [1] 83
## [1] 84
## [1] 85
## [1] 86
## [1] 87
## [1] 88
## [1] 89
## [1] 90
## [1] 91
## [1] 92
## [1] 93
## [1] 94
## [1] 95
## [1] 96
## [1] 97
## [1] 98
## [1] 99
## [1] 100
## [1] 101
## [1] 102
## [1] 103
## [1] 104
## [1] 105
## [1] 106
## [1] 107
## [1] 108
## [1] 109
## [1] 110
## [1] 111
## [1] 112
## [1] 113
## [1] 114
## [1] 115
## [1] 116
## [1] 117
## [1] 118
## [1] 119
## [1] 120
## [1] 121
## [1] 122
## [1] 123
## [1] 124
## [1] 125
## [1] 126
## [1] 127
## [1] 128
## [1] 129
## [1] 130
## [1] 131
## [1] 132
## [1] 133
## [1] 134
## [1] 135
## [1] 136
## [1] 137
## [1] 138
## [1] 139
## [1] 140
## [1] 141
## [1] 142

#look at our new vectors
head(maxgdp)

## [1]  5937.030  6223.367  5522.776 12779.380 34435.367 36126.493

head(country)

## [1] "Albania"   "Algeria"   "Angola"    "Argentina" "Australia" "Austria"

#combine vectors into a dataframe
country_sum <- data.frame(country, maxgdp)
head(country_sum)

##     country    maxgdp
## 1   Albania  5937.030
## 2   Algeria  6223.367
## 3    Angola  5522.776
## 4 Argentina 12779.380
## 5 Australia 34435.367
## 6   Austria 36126.493

#there are other ways of doing this, by creating a dataframe and using that directly in the for-loop, but I think this format is more beginner-friendly

2.2.1 PRACTICE:

modify the code below to create a new dataframe with two columns: Party, and max midterms house change (x in the example code). Name your columns however you’d like.

#original code:
for (i in unique(midterms$party)){
  x <- max(midterms$house_change[midterms$party == i])
  print(x)
}

## [1] 3.61991
## [1] 2.875399

#answer:

More complicated operations in looping

#add year

year <- 0
country <- 0
maxgdp <- 0

index <- 0

for (i in unique(countries$country)){
  maxgdp[index] <- max(countries$gdpPercap[countries$country == i])
  country[index] <- i #same as previous example up to here
  year[index] <- countries$year[countries$gdpPercap == maxgdp[index]]
  index <- index + 1 #add one to the index to avoid replacement
}

country_sum <- data.frame(country, maxgdp, year)
head(country_sum)

##     country    maxgdp year
## 1   Albania  5937.030 2007
## 2   Algeria  6223.367 2007
## 3    Angola  5522.776 1967
## 4 Argentina 12779.380 2007
## 5 Australia 34435.367 2007
## 6   Austria 36126.493 2007

2.3 NESTED FOR LOOPS

nice analogies for nested looping:

like going through words in a book, by pages
like going through values in rows of a matrix, by columns

#conceptual example:

page1 <- c("I", "wrote", "this", "book")
page2 <- c("I", "think", "its", "pretty", "good")
page3 <- c("The", "end")

mybook <-list(page1, page2, page3)

for (i in 1:length(mybook)){
  print(i)
  for (j in mybook[[i]]){
    print(j)
  }
}

## [1] 1
## [1] "I"
## [1] "wrote"
## [1] "this"
## [1] "book"
## [1] 2
## [1] "I"
## [1] "think"
## [1] "its"
## [1] "pretty"
## [1] "good"
## [1] 3
## [1] "The"
## [1] "end"

example with the storms dataset

head(storms)

## # A tibble: 6 × 13
##   name   year month   day  hour   lat  long status categ…¹  wind press…² tropi…³
##   <chr> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <chr>  <ord>   <int>   <int>   <int>
## 1 Amy    1975     6    27     0  27.5 -79   tropi… -1         25    1013      NA
## 2 Amy    1975     6    27     6  28.5 -79   tropi… -1         25    1013      NA
## 3 Amy    1975     6    27    12  29.5 -79   tropi… -1         25    1013      NA
## 4 Amy    1975     6    27    18  30.5 -79   tropi… -1         25    1013      NA
## 5 Amy    1975     6    28     0  31.5 -78.8 tropi… -1         25    1012      NA
## 6 Amy    1975     6    28     6  32.4 -78.7 tropi… -1         25    1012      NA
## # … with 1 more variable: hurricane_force_diameter <int>, and abbreviated
## #   variable names ¹category, ²pressure, ³tropicalstorm_force_diameter

#Lets create a new dataframe, where we summarize the total number of days of storms, by 'status' (type of storm), for each year of data

#creating empty vectors for our desired variables

days <- 0
year <- 0
status <- 0

index <-0

for (i in unique(storms$year)){ #looping through years
  for (j in unique(storms$status)){ #looping through storm status
    days[index] <- nrow(storms[storms$year == i & storms$status == j,])/4 #counting the number of rows of data in each year, for each status to get to the number of days
#have to divide by four, because there are four entries per day
    year[index] <- i
    status[index] <- j
    index <- index + 1
  }
}

storm_data <- data.frame(year, status, days)
head(storm_data)

##   year              status days
## 1 1975      tropical storm 8.25
## 2 1975           hurricane 5.75
## 3 1976 tropical depression 2.50
## 4 1976      tropical storm 5.00
## 5 1976           hurricane 5.50
## 6 1977 tropical depression 4.00

working with your new dataset:

#can plot your data

ggplot(storm_data, aes(x = year, y = days, col = status)) +
  geom_point() +
  geom_smooth(aes(col=status),method="lm", se = FALSE)+
  #change colours
  scale_colour_viridis(discrete = TRUE) +
  #rotate axis text, modifying theme
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

#or analyze your data

model_list<-list() #creating empty list

#create a list of models
for (i in unique(storm_data$status)){
  model_list[[i]] <- summary(lm(days ~ year, data = storm_data[storm_data$status == i,]))
}

model_list

## $`tropical storm`
##
## Call:
## lm(formula = days ~ year, data = storm_data[storm_data$status ==
##     i, ])
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -33.419  -7.744  -3.264   7.721  35.360
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1708.5688   305.0934  -5.600 1.30e-06 ***
## year            0.8699     0.1527   5.696 9.44e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.75 on 44 degrees of freedom
## Multiple R-squared:  0.4244, Adjusted R-squared:  0.4113
## F-statistic: 32.44 on 1 and 44 DF,  p-value: 9.438e-07
##
##
## $hurricane
##
## Call:
## lm(formula = days ~ year, data = storm_data[storm_data$status ==
##     i, ])
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -22.312  -9.625  -2.445   5.309  43.820
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -744.1014   284.5643  -2.615   0.0122 *
## year           0.3823     0.1425   2.684   0.0102 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.83 on 44 degrees of freedom
## Multiple R-squared:  0.1407, Adjusted R-squared:  0.1212
## F-statistic: 7.204 on 1 and 44 DF,  p-value: 0.01022
##
##
## $`tropical depression`
##
## Call:
## lm(formula = days ~ year, data = storm_data[storm_data$status ==
##     i, ])
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -11.770  -5.905  -2.326   5.128  22.661
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -213.41548  193.18900  -1.105    0.275
## year           0.11479    0.09669   1.187    0.242
##
## Residual standard error: 8.424 on 43 degrees of freedom
## Multiple R-squared:  0.03174,    Adjusted R-squared:  0.00922
## F-statistic: 1.409 on 1 and 43 DF,  p-value: 0.2417

#or create individual objects for each model based on i
for (i in unique(storm_data$status)){
  assign(paste0("model_", i), summary(lm(days ~ year, data = storm_data[storm_data$status == i,])))
}

model_hurricane

##
## Call:
## lm(formula = days ~ year, data = storm_data[storm_data$status ==
##     i, ])
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -22.312  -9.625  -2.445   5.309  43.820
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -744.1014   284.5643  -2.615   0.0122 *
## year           0.3823     0.1425   2.684   0.0102 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.83 on 44 degrees of freedom
## Multiple R-squared:  0.1407, Adjusted R-squared:  0.1212
## F-statistic: 7.204 on 1 and 44 DF,  p-value: 0.01022

BREAK

3 IF/ELSE STATEMENTS

3.1 Basic structure

if (logical statement){ operation true}

can be expanded to an if/else statement:

if (logical statement){ operation if true} else{operation if false}

examples of logical statements:

x > 5 x == 4 x == “cat” x != 0

Simple example:

x = 6 #set x to 6

if (x > 5){
  print("x is greater than 5")
}

## [1] "x is greater than 5"

x = 4 #set x to 4

#add an else statement
if (x > 5){
  print("x is greater than 5")
} else {
  print("x is less than 5")
}

## [1] "x is less than 5"

if/else statement become very useful when embedded within a for loop:

#e.g., create a binary variable for Republican (1) and Democrat (0)

midterms$binary <- NA

for (i in 1:nrow(midterms)){
if(midterms$party[i] == 'Republican'){
  midterms$binary[i] = 1} else {
    midterms$binary[i] = 0
  }
}

head(midterms)

## # A tibble: 6 × 6
##    year potus               party      unemp house_change binary
##   <int> <fct>               <fct>      <dbl>        <dbl>  <dbl>
## 1  1899 William McKinley    Republican 11.6         -9.22      1
## 2  1903 Theodore Roosevelt  Republican  4.3         -4.28      1
## 3  1907 Theodore Roosevelt  Republican  3.29       -12.3       1
## 4  1911 William Howard Taft Republican  5.86       -26.6       1
## 5  1915 Woodrow Wilson      Democrat    6.63       -21.0       0
## 6  1919 Woodrow Wilson      Democrat    3.38       -10.3       0

4 CUSTOM FUNCTIONS

4.1 Basic structure

my_function <- function(argument1, argument2, …) { some operation using arguments return(desired output) }

for example, building a ‘multiplier’ function:

multiplier <- function(n1, n2){
  output <- n1*n2
  return(output)
}

#or

multiplier <- function(n1, n2){
  return(n1*n2)
}

multiplier(2,5)

## [1] 10

4.1.1 PRACTICE (5-10 mins):

build a function (called ‘my_mean’) that calculates the mean of two numbers (n1 and n2), without using the mean function

#answer:

if we wanted to expand this, to work on an entire vector:

vec <- c(1,3,4,7,4,3) #creating new vector

my_mean <- function(vec){
  return((sum(vec))/length(vec))
}

my_mean(vec) #runnning our function on our vector

## [1] 3.666667

mean(vec) #checking it worked

## [1] 3.666667

more complicated example, building a standard deviation function:

standard deviation = sqrt(sum((x - mean)^2)/N)

#to do this OUTSIDE a custom function

numbers<-c(2,4,5,1,3,4)

x <- 0
index <- 1

for (i in numbers){
  x[index] <- (i - mean(numbers))^2
  index <- index + 1
}

stddev <- sqrt(sum(x)/length(numbers))
stddev

## [1] 1.34371

if we wanted to repeat our calculation for many different vectors, we could create a custom function with a for loop

standard.dev <- function(values){ #providing our function argument
x <- 0 #creating and empty vector
index <- 1 #creating an index for storing values
for (i in values){
  x[index] <- (i - mean(values))^2
  index <- index + 1
}
stddev <- sqrt(sum(x)/length(values)) #note output goes outside
return(stddev) #the for loop
}

standard.dev(values = midterms$unemp)

## [1] 3.899379

#or
standard.dev(midterms$unemp)

## [1] 3.899379

4.1.2 EXTRA CHALLENGE

modify your mean function to calculate the mean of each vector in the list provided (note to grab one vector from a list, you would use [[ ]], eg list[[2]] would grab the second vector in a list)

vec1 <- c(1,2)
vec2 <- c(4,5,6)
vec3 <- c(7,8,9,10)

vec_list <- list(vec1, vec2, vec3)

my_mean_list <- function(list1){
  output <- 0
    for (i in 1:length(list1)){
    output[i] <- (sum(list1[[i]])/length(list1[[i]]))}
    return(output)
}

my_mean_list(vec_list)

## [1] 1.5 5.0 8.5