10  Conditional actions and loops

10.1 Conditional actions

Conditional actions in R are written with the usual if / else if / else statements (there is no then keyword; the action goes between braces):

# Two scalars to compare
x <- 1
y <- 2
# Exactly one of the three branches runs
if (x < y) {
    print("x is smaller than y")
} else if (x > y) {
    print("x is larger than y")
} else {
    print("x is equal to y")
}
#> [1] "x is smaller than y"

Sometimes, it’s useful to be able to do this in one line using ifelse(test, yes, no), which is vectorized and evaluates the test for every element:

# ifelse() evaluates the test for every element of x
x <- 3:7
# x > 5 is FALSE for 5 itself, so the "no" label has to cover equality too
ifelse(x > 5, "larger than 5", "5 or lower")
#> [1] "5 or lower"    "5 or lower"    "5 or lower"    "larger than 5"
#> [5] "larger than 5"

10.2 Loops…

Loops in R are provided through the usual for and while keywords:

# For loop
for (i in seq_len(100)) {
    # break loop: leave it for good once 10 is reached
    if (i == 10) break
    # pass to next index directly: 3, 5 and 8 are never printed
    if (i %in% c(3, 5, 8)) next
    print(i)
}
#> [1] 1
#> [1] 2
#> [1] 4
#> [1] 6
#> [1] 7
#> [1] 9
# A for loop can also walk directly over the elements of a vector
phrase <- c("hello", "world")
for (word in phrase) {
    print(word)
}
#> [1] "hello"
#> [1] "world"
# While loop
i <- 1
# The condition is re-checked before every pass; the body must update i
while (i < 8) {
    print(i)
    i <- i + 2
}
#> [1] 1
#> [1] 3
#> [1] 5
#> [1] 7

10.3 … and how to avoid them

However, R is a vectorized language, so explicit loops should be avoided when possible: they are usually much slower than the equivalent vectorized operation:

# Double every element of `x`, one element at a time, with an explicit loop.
# Deliberately element-wise: this is the slow baseline of the benchmark.
# Returns the modified copy of `x`.
forloop <- function(x) {
    for (k in seq_along(x)) {
        x[k] <- x[k] * 2
    }
    x
}
# Vectorized doubling: multiplies the whole of `x` in a single operation
# and returns the result.
noforloop <- function(x) x * 2
# Time both implementations on the same vector of 1e7 random numbers
x <- runif(1e7)
microbenchmark::microbenchmark(
    forloop   = forloop(x),
    noforloop = noforloop(x),
    times = 10L  # each expression is evaluated 10 times (see neval below)
)
#> Unit: milliseconds
#>       expr        min         lq       mean    median         uq       max
#>    forloop 218.339104 221.693109 225.139569 225.41855 227.184116 233.19697
#>  noforloop   1.402651   2.544378   6.288494   4.76176   7.762776  22.84167
#>  neval cld
#>     10  a 
#>     10   b

10.3.1 The apply family

Loops should therefore be avoided whenever possible. R helps us do this through the base functions apply(), sapply() and lapply(). Operations in the tidyverse are also a very good way of avoiding loops.

Take a look at the help pages of these functions, but in short, apply(df, direction, function) applies a function along the chosen direction (1 for rows, 2 for columns) of the given data.frame (or matrix). Example:

library(tibble)
# y and z are defined from the column x created in the same tibble() call;
# the trailing `;dt` prints the table
dt <- tibble(x=1:5, y=x^2, z=x^3);dt
#> # A tibble: 5 × 3
#>       x     y     z
#>   <int> <dbl> <dbl>
#> 1     1     1     1
#> 2     2     4     8
#> 3     3     9    27
#> 4     4    16    64
#> 5     5    25   125
apply(dt, 1, mean) # mean of the rows
#> [1]  1.000000  4.666667 13.000000 28.000000 51.666667
apply(dt, 2, mean) # mean of the columns
#>  x  y  z 
#>  3 11 45
# row/column means: dedicated shortcuts for the two apply() calls above
rowMeans(dt)
#> [1]  1.000000  4.666667 13.000000 28.000000 51.666667
colMeans(dt)
#>  x  y  z 
#>  3 11 45

lapply() is basically the same thing but applied to each element of a list, and it returns a list; sapply() does the same but tries to simplify the result into a vector or a matrix:

my_list <- list(dt/3, dt/5);my_list
#> [[1]]
#>           x         y          z
#> 1 0.3333333 0.3333333  0.3333333
#> 2 0.6666667 1.3333333  2.6666667
#> 3 1.0000000 3.0000000  9.0000000
#> 4 1.3333333 5.3333333 21.3333333
#> 5 1.6666667 8.3333333 41.6666667
#> 
#> [[2]]
#>     x   y    z
#> 1 0.2 0.2  0.2
#> 2 0.4 0.8  1.6
#> 3 0.6 1.8  5.4
#> 4 0.8 3.2 12.8
#> 5 1.0 5.0 25.0
lapply(my_list, "[", 1, )  # print first row
#> [[1]]
#>           x         y         z
#> 1 0.3333333 0.3333333 0.3333333
#> 
#> [[2]]
#>     x   y   z
#> 1 0.2 0.2 0.2
sapply(my_list, rowSums)   # sum on rows
#>           [,1] [,2]
#> [1,]  1.000000  0.6
#> [2,]  4.666667  2.8
#> [3,] 13.000000  7.8
#> [4,] 28.000000 16.8
#> [5,] 51.666667 31.0
lapply(my_list, round, 1)  # round to first decimal
#> [[1]]
#>     x   y    z
#> 1 0.3 0.3  0.3
#> 2 0.7 1.3  2.7
#> 3 1.0 3.0  9.0
#> 4 1.3 5.3 21.3
#> 5 1.7 8.3 41.7
#> 
#> [[2]]
#>     x   y    z
#> 1 0.2 0.2  0.2
#> 2 0.4 0.8  1.6
#> 3 0.6 1.8  5.4
#> 4 0.8 3.2 12.8
#> 5 1.0 5.0 25.0
# For more complex operations, use it this way:
# iterate over the row indices; seq_len() (unlike 1:nrow(dt)) gives an empty
# sequence for a table with no rows, so the function is never called on it.
# The %% arithmetic wraps the shifted indices back into 1..nrow(dt).
sapply(seq_len(nrow(dt)), function(i){
    dt$x[i] + dt$y[(i+2)%%nrow(dt)+1] - dt$z[(i+4)%%nrow(dt)+1]
})
#> [1]   16   19  -23  -56 -111

10.3.2 The tidyverse way

The package tidyverse offers numerous ways to avoid explicit for loops. To see how to do this, refer to the section on operations in the tidyverse.

10.4 Exercises

Exercise 1

Given x <- runif(1e3, min=-1, max=1), create a tibble like this one:

#> # A tibble: 1,000 × 2
#>         x y    
#>     <dbl> <chr>
#>  1 -0.235 x<=0 
#>  2  0.366 x>0  
#>  3  0.272 x>0  
#>  4  0.838 x>0  
#>  5  0.564 x>0  
#>  6 -0.826 x<=0 
#>  7 -0.557 x<=0 
#>  8  0.859 x>0  
#>  9 -0.607 x<=0 
#> 10  0.347 x>0  
#> # ℹ 990 more rows
Exercise 2

Given:

LL <- list(A = runif(1e2),
           B = rnorm(1e3),
           C = data.frame(x=runif(1e2), y=runif(1e2))
           )

Print the sum of each element of LL, first as a list, then as a vector.

Solution
LL <- list(A = runif(1e2),
           B = rnorm(1e3),
           C = data.frame(x=runif(1e2), y=runif(1e2))
           )
lapply(LL, sum)
#> $A
#> [1] 57.13301
#> 
#> $B
#> [1] 46.07109
#> 
#> $C
#> [1] 106.691
# Two ways to get the sums as a named vector instead of a list
unlist(lapply(LL, sum)); sapply(LL, sum)
#>         A         B         C 
#>  57.13301  46.07109 106.69103
#>         A         B         C 
#>  57.13301  46.07109 106.69103
Exercise 3
  • Download population.csv and load it into a data.frame
  • What is the total population over the years?
  • What is the mean population for each city?
Solution
# Download population.csv and load it into a `data.frame`
df <- read.csv("Data/population.csv")
# What is the total population over the years?
# df[,-1] drops the first column before summing across each row
# (assumes the first column is `year` -- confirm against the CSV)
data.frame(year=df[,"year"],
           pop =rowSums(df[,-1]),     # a first way
           pop2=apply(df[,-1], 1, sum)# another way
          )
#>   year     pop    pop2
#> 1 1962 7349547 7349547
#> 2 1968 7550999 7550999
#> 3 1975 7298183 7298183
#> 4 1982 6933230 6933230
#> 5 1990 6857291 6857291
#> 6 1999 6965325 6965325
#> 7 2007 7235114 7235114
#> 8 2012 7334805 7334805
# A tidy-compatible version 
library(tidyverse)
# Reshape to long format: every column except `year` is stacked into a
# `city` (former column name) / `pop` (value) pair
popul <- pivot_longer(df, cols=-year, names_to="city", values_to="pop")
# Sum `pop` within each year
popul |> 
  group_by(year) |> 
  summarise(totpop = sum(pop))
#> # A tibble: 8 × 2
#>    year  totpop
#>   <int>   <int>
#> 1  1962 7349547
#> 2  1968 7550999
#> 3  1975 7298183
#> 4  1982 6933230
#> 5  1990 6857291
#> 6  1999 6965325
#> 7  2007 7235114
#> 8  2012 7334805
# or equivalently
summarise(group_by(popul, year), totpop = sum(pop))
#> # A tibble: 8 × 2
#>    year  totpop
#>   <int>   <int>
#> 1  1962 7349547
#> 2  1968 7550999
#> 3  1975 7298183
#> 4  1982 6933230
#> 5  1990 6857291
#> 6  1999 6965325
#> 7  2007 7235114
#> 8  2012 7334805
# What is the mean population for each city?
# Base R: mean() over each column (direction 2) once the first column is dropped
apply(df[,-1], 2, mean)
#>        Angers      Bordeaux         Brest         Dijon      Grenoble 
#>      138783.4      234814.9      149125.1      146735.2      157526.4 
#>       LeHavre        LeMans         Lille          Lyon     Marseille 
#>      193989.6      144347.4      220018.4      470371.1      844253.4 
#>   Montpellier        Nantes          Nice         Paris         Reims 
#>      203114.4      260924.9      334311.6     2321031.9      172278.0 
#>        Rennes Saint.Etienne    Strasbourg        Toulon      Toulouse 
#>      193424.9      198134.6      255429.1      169682.6      382264.9
# Same question on the long-format table: average `pop` within each city
popul |> 
  group_by(city) |> 
  summarise(avepop = mean(pop))
#> # A tibble: 20 × 2
#>    city            avepop
#>    <chr>            <dbl>
#>  1 Angers         138783.
#>  2 Bordeaux       234815.
#>  3 Brest          149125.
#>  4 Dijon          146735.
#>  5 Grenoble       157526.
#>  6 LeHavre        193990.
#>  7 LeMans         144347.
#>  8 Lille          220018.
#>  9 Lyon           470371.
#> 10 Marseille      844253.
#> 11 Montpellier    203114.
#> 12 Nantes         260925.
#> 13 Nice           334312.
#> 14 Paris         2321032.
#> 15 Reims          172278 
#> 16 Rennes         193425.
#> 17 Saint.Etienne  198135.
#> 18 Strasbourg     255429.
#> 19 Toulon         169683.
#> 20 Toulouse       382265.