# Compare two scalars and report how x relates to y.
x <- 1
y <- 2
# `if` is an expression in R: select the message first, then print it once.
comparison <- if (x > y) {
  "x is larger than y"
} else if (x < y) {
  "x is smaller than y"
} else {
  "x is equal to y"
}
print(comparison)
#> [1] "x is smaller than y"
Conditional execution in R is written with the usual if / else if / else
statements:
# Compare two scalars and report how x relates to y.
x <- 1
y <- 2
# `if` is an expression in R: select the message first, then print it once.
comparison <- if (x > y) {
  "x is larger than y"
} else if (x < y) {
  "x is smaller than y"
} else {
  "x is equal to y"
}
print(comparison)
#> [1] "x is smaller than y"
Sometimes, it’s useful to be able to do this in one line using ifelse(test, yes, no), which is applied element-wise to a vector:
# Vectorised conditional: one label is produced per element of x.
x <- 3:7
above_five <- x > 5
ifelse(above_five, "larger than 5", "lower than 5")
#> [1] "lower than 5" "lower than 5" "lower than 5" "larger than 5"
#> [5] "larger than 5"
Loops in R are provided through the usual for and while keywords.
However, since R is a vectorized language, explicit loops should be avoided whenever a vectorized alternative exists, because they are much slower:
# Double every element of `x` one index at a time.
# Deliberately written as an explicit loop: it is the slow side of the
# benchmark against the vectorised `noforloop()`.
# Returns the modified copy of `x`.
forloop <- function(x) {
  for (idx in seq_along(x)) {
    doubled <- 2 * x[idx]
    x[idx] <- doubled
  }
  x
}
# Double every element of `x` with a single vectorised multiplication.
# Returns a vector of the same length as `x`.
noforloop <- function(x) x * 2
# Time both implementations on the same vector of 1e7 uniform random
# numbers, running each expression 10 times.
x <- runif(n = 1e7)
microbenchmark::microbenchmark(
  forloop   = forloop(x),
  noforloop = noforloop(x),
  times     = 10L
)
#> Unit: milliseconds
#> expr min lq mean median uq max
#> forloop 231.46361 231.82663 235.123852 233.184200 235.693133 250.000534
#> noforloop 1.46165 2.23409 4.998228 6.297702 6.443765 6.658195
#> neval cld
#> 10 a
#> 10 b
The apply family
Loops should therefore be avoided whenever possible. R helps with this through the base functions apply(), sapply() and lapply(). Operations in the tidyverse are also a very good way of avoiding loops.
Take a look at the help pages of these functions, but in summary, apply(df, direction, function) applies a function along the wanted direction (1 for rows, 2 for columns) of the given data.frame (or matrix). Example:
# Example data used by the apply()/lapply()/sapply() snippets below:
# five rows holding x, its square and its cube.
# NOTE(review): this definition is reconstructed from the printed output
# (x is <int>, y and z are <dbl>) -- confirm against the original tutorial.
dt <- tibble::tibble(x = 1:5, y = x^2, z = x^3)
dt
#> # A tibble: 5 × 3
#> x y z
#> <int> <dbl> <dbl>
#> 1 1 1 1
#> 2 2 4 8
#> 3 3 9 27
#> 4 4 16 64
#> 5 5 25 125
# apply() with MARGIN = 1 runs the function over each row ...
apply(dt, MARGIN = 1, FUN = mean)
#> [1] 1.000000 4.666667 13.000000 28.000000 51.666667
# ... and with MARGIN = 2 over each column.
apply(dt, MARGIN = 2, FUN = mean)
#> x y z
#> 3 11 45
# Dedicated helpers that give the same row/column means
rowMeans(dt)
#> [1] 1.000000 4.666667 13.000000 28.000000 51.666667
colMeans(dt)
#> x y z
#> 3 11 45
lapply() (and similarly sapply()) does basically the same thing, but applied to each element of a list: lapply() always returns a list, whereas sapply() simplifies the result to a vector or matrix when possible:
# Two rescaled copies of dt gathered in a list, then printed.
my_list <- list(dt / 3, dt / 5)
my_list
#> [[1]]
#> x y z
#> 1 0.3333333 0.3333333 0.3333333
#> 2 0.6666667 1.3333333 2.6666667
#> 3 1.0000000 3.0000000 9.0000000
#> 4 1.3333333 5.3333333 21.3333333
#> 5 1.6666667 8.3333333 41.6666667
#>
#> [[2]]
#> x y z
#> 1 0.2 0.2 0.2
#> 2 0.4 0.8 1.6
#> 3 0.6 1.8 5.4
#> 4 0.8 3.2 12.8
#> 5 1.0 5.0 25.0
# Extract the first row of every element of the list.
lapply(my_list, function(tbl) tbl[1, ])
#> [[1]]
#> x y z
#> 1 0.3333333 0.3333333 0.3333333
#>
#> [[2]]
#> x y z
#> 1 0.2 0.2 0.2
# Row sums of every element; sapply() binds them into one column each.
sapply(my_list, function(tbl) rowSums(tbl))
#> [,1] [,2]
#> [1,] 1.000000 0.6
#> [2,] 4.666667 2.8
#> [3,] 13.000000 7.8
#> [4,] 28.000000 16.8
#> [5,] 51.666667 31.0
# Round every element of the list to one decimal place.
lapply(my_list, function(tbl) round(tbl, digits = 1))
#> [[1]]
#> x y z
#> 1 0.3 0.3 0.3
#> 2 0.7 1.3 2.7
#> 3 1.0 3.0 9.0
#> 4 1.3 5.3 21.3
#> 5 1.7 8.3 41.7
#>
#> [[2]]
#> x y z
#> 1 0.2 0.2 0.2
#> 2 0.4 0.8 1.6
#> 3 0.6 1.8 5.4
#> 4 0.8 3.2 12.8
#> 5 1.0 5.0 25.0
# For more complex operations, iterate over the row indices with an
# anonymous function. seq_len() yields an empty sequence for a zero-row
# table (1:nrow(dt) would give c(1, 0)), and vapply() enforces that every
# iteration returns exactly one numeric value.
n_rows <- nrow(dt)
vapply(seq_len(n_rows), function(i) {
  # The modulo wraps the shifted indices around so they stay in 1..n_rows.
  dt$x[i] + dt$y[(i + 2) %% n_rows + 1] - dt$z[(i + 4) %% n_rows + 1]
}, numeric(1))
#> [1] 16 19 -23 -56 -111
The tidyverse way
The tidyverse package offers numerous ways to avoid explicit for loops. To see how to do this, refer to the section on operations in the tidyverse.
Given x <- runif(1e3, min=-1, max=1)
, create a tibble
like this one:
#> # A tibble: 1,000 × 2
#> x y
#> <dbl> <chr>
#> 1 -0.445 x<=0
#> 2 -0.910 x<=0
#> 3 0.278 x>0
#> 4 -0.971 x<=0
#> 5 -0.324 x<=0
#> 6 -0.723 x<=0
#> 7 0.943 x>0
#> 8 -0.885 x<=0
#> 9 0.960 x>0
#> 10 -0.0992 x<=0
#> # ℹ 990 more rows
Given a list LL:
Compute the sum of each element of LL, returning the result first as a list, then as a vector.
data.frame
# Download population.csv into Data/ and load it into a `data.frame`
df <- read.csv("Data/population.csv")
# Total population per year, summing every column except the first one.
# The two sums are computed in different ways and shown side by side.
data.frame(
  year = df[, "year"],
  pop  = rowSums(df[, -1]),                      # dedicated helper
  pop2 = apply(df[, -1], MARGIN = 1, FUN = sum)  # generic apply() over rows
)
#> year pop pop2
#> 1 1962 7349547 7349547
#> 2 1968 7550999 7550999
#> 3 1975 7298183 7298183
#> 4 1982 6933230 6933230
#> 5 1990 6857291 6857291
#> 6 1999 6965325 6965325
#> 7 2007 7235114 7235114
#> 8 2012 7334805 7334805
# Tidyverse version: reshape to one row per (year, city) pair, then sum
# the population within each year.
library(tidyverse)
popul <- df |>
  pivot_longer(cols = -year, names_to = "city", values_to = "pop")
popul |>
  group_by(year) |>
  summarise(totpop = sum(pop))
#> # A tibble: 8 × 2
#> year totpop
#> <int> <int>
#> 1 1962 7349547
#> 2 1968 7550999
#> 3 1975 7298183
#> 4 1982 6933230
#> 5 1990 6857291
#> 6 1999 6965325
#> 7 2007 7235114
#> 8 2012 7334805
#> # A tibble: 8 × 2
#> year totpop
#> <int> <int>
#> 1 1962 7349547
#> 2 1968 7550999
#> 3 1975 7298183
#> 4 1982 6933230
#> 5 1990 6857291
#> 6 1999 6965325
#> 7 2007 7235114
#> 8 2012 7334805
# Mean population of each city: average every column except the first one.
apply(df[, -1], MARGIN = 2, FUN = mean)
#> Angers Bordeaux Brest Dijon Grenoble
#> 138783.4 234814.9 149125.1 146735.2 157526.4
#> LeHavre LeMans Lille Lyon Marseille
#> 193989.6 144347.4 220018.4 470371.1 844253.4
#> Montpellier Nantes Nice Paris Reims
#> 203114.4 260924.9 334311.6 2321031.9 172278.0
#> Rennes Saint.Etienne Strasbourg Toulon Toulouse
#> 193424.9 198134.6 255429.1 169682.6 382264.9
#> # A tibble: 20 × 2
#> city avepop
#> <chr> <dbl>
#> 1 Angers 138783.
#> 2 Bordeaux 234815.
#> 3 Brest 149125.
#> 4 Dijon 146735.
#> 5 Grenoble 157526.
#> 6 LeHavre 193990.
#> 7 LeMans 144347.
#> 8 Lille 220018.
#> 9 Lyon 470371.
#> 10 Marseille 844253.
#> 11 Montpellier 203114.
#> 12 Nantes 260925.
#> 13 Nice 334312.
#> 14 Paris 2321032.
#> 15 Reims 172278
#> 16 Rennes 193425.
#> 17 Saint.Etienne 198135.
#> 18 Strasbourg 255429.
#> 19 Toulon 169683.
#> 20 Toulouse 382265.