Data handling
The first thing to do is to load and regroup all these datasets into
a single one.
- Load the
tidyverse
library and, using
read_csv()
, load the 4 datasets in 4 separate tibbles
called children
, income
, pop
and
religion
.
# Attach the tidyverse, which provides read_csv() and the dplyr/tidyr verbs used below.
library(tidyverse)
# NOTE(review): readxl is attached, but no readxl function is called in the visible document - confirm it is still needed.
library(readxl)
# Read each CSV file into its own tibble.
children <- read_csv("Data/children_per_woman_total_fertility.csv")
income <- read_csv("Data/income_per_person_gdppercapita_ppp_inflation_adjusted.csv")
pop <- read_csv("Data/population_total.csv")
religion <- read_csv("Data/religion.csv")
- To reproduce the chart on the video, we need to determine the
dominant religion in each country. In the
religion
dataset,
add a column Religion
that will give the name of the
dominant religion for each country. For this, you need to make the table
contain just the Country
and all religions, make the table
tidy, and then select the religion with the highest proportion for each
country. We will filter the data to get only the year 2020.
# Dominant religion per country: keep the 2020 rows, reshape the religion
# columns to long form, then retain the row(s) holding the largest
# proportion within each country.
religion <- religion |>
  filter(Year == 2020) |>
  select(Country, Buddhists:Unaffiliated) |>
  pivot_longer(
    cols = -Country,
    names_to = "Religion",
    values_to = "Proportion"
  ) |>
  filter(Proportion == max(Proportion), .by = Country) |>
  select(Country, Religion)
- Using
pivot_longer()
, make all datasets tidy.
children
should now contain 3 columns:
Country
, Year
and Fertility
.
income
should now contain 3 columns:
Country
, Year
and Income
.
pop
should now contain 3 columns: Country
,
Year
and Population
.
We will only consider data from 1900 to 2018. Example of syntax using
the pipe operator |>
:
# Small wide-format table (one column per year) used to illustrate the syntax.
DF <- read_table("name 2010 2011 2012 2014
Kevin 10 11 12 123
Jane 122 56 23 4
"
)
# Print the table to show its wide layout.
DF
## # A tibble: 2 × 5
## name `2010` `2011` `2012` `2014`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Kevin 10 11 12 123
## 2 Jane 122 56 23 4
# Keep the 2010-2012 columns and stack them into (Year, Score) pairs.
# `cols` is spelled out in full; abbreviating it to `col` only works through
# partial argument matching.
DF |>
  select(name, '2010':'2012') |>
  pivot_longer(cols=-name,
               names_to="Year",
               values_to="Score",
               names_transform=list(Year = as.numeric))
## # A tibble: 6 × 3
## name Year Score
## <chr> <dbl> <dbl>
## 1 Kevin 2010 10
## 2 Kevin 2011 11
## 3 Kevin 2012 12
## 4 Jane 2010 122
## 5 Jane 2011 56
## 6 Jane 2012 23
The line names_transform=list(Year = as.numeric)
is here
to convert the character year values to numerical values.
# Reshape each wide table into long form (one row per country and year),
# keeping only the 1900-2018 columns.
# `cols` is spelled out in full; abbreviating it to `col` only works through
# partial argument matching.
children <- children |>
  select(Country, '1900':'2018') |>
  pivot_longer(cols=-Country,
               names_to="Year",
               values_to="Fertility",
               names_transform=list(Year = as.numeric))
income <- income |>
  select(Country, '1900':'2018') |>
  pivot_longer(cols=-Country,
               names_to="Year",
               values_to="Income",
               names_transform=list(Year = as.numeric))
pop <- pop |>
  select(Country, '1900':'2018') |>
  pivot_longer(cols=-Country,
               names_to="Year",
               values_to="Population",
               names_transform=list(Year = as.numeric))
- Now we want to combine all these datasets into a single one called
dat
, containing the columns Country
,
Year
, Population
, Religion
,
Fertility
and Income
. Look into the
inner_join()
function of the dplyr
library
(which is part of the tidyverse
library).
# Combine the four tables. The join keys are stated explicitly so the result
# does not depend on which column names the tables happen to share.
dat <- children |>
  inner_join(income, by = join_by(Country, Year)) |>
  inner_join(pop, by = join_by(Country, Year)) |>
  # religion carries no Year column, so it is matched on Country only
  inner_join(religion, by = join_by(Country))
# write_csv(dat, "Data/dat.csv")
You should end up with a dataset like this one:
## # A tibble: 20,587 × 6
## Country Year Fertility Income Population Religion
## <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Afghanistan 1900 7 793 5020000 Muslims
## 2 Afghanistan 1901 7 796 5050000 Muslims
## 3 Afghanistan 1902 7 798 5090000 Muslims
## 4 Afghanistan 1903 7 801 5120000 Muslims
## 5 Afghanistan 1904 7 804 5150000 Muslims
## 6 Afghanistan 1905 7 807 5180000 Muslims
## 7 Afghanistan 1906 7 809 5220000 Muslims
## 8 Afghanistan 1907 7 812 5250000 Muslims
## 9 Afghanistan 1908 7 815 5280000 Muslims
## 10 Afghanistan 1909 7 818 5320000 Muslims
## # ℹ 20,577 more rows
In case you struggled to get there, download the archive with the
button at the top and get the dat
tibble with dat <- read_csv("Data/dat.csv")
.
Now our dataset is ready, let’s plot it.
Plotting
- Load the library
ggplot2
and set the global theme to
theme_bw()
using theme_set()
# ggplot2 is also attached by library(tidyverse); attaching it again is harmless.
library(ggplot2)
# Make theme_bw() the default theme for the plots created after this point.
theme_set(theme_bw())
- Create a subset of
dat
concerning your origin country.
For me it will be dat_france
# Rows of dat for France only.
dat_france <- filter(dat, Country == "France")
- Plot the evolution of the income per capita and the number of
children per woman as a function of the year, and make it look like
this (notice the kinks during the two world wars):
# Income per capita in France over time; the shaded bands span 1914-1918
# and 1939-1945.
ggplot(dat_france, aes(x = Year, y = Income)) +
  annotate("rect", xmin = 1914, xmax = 1918, ymin = -Inf, ymax = Inf, alpha = .3) +
  annotate("rect", xmin = 1939, xmax = 1945, ymin = -Inf, ymax = Inf, alpha = .3) +
  geom_point(alpha = 0.2, size = 5) +
  geom_smooth() +
  labs(
    title = "Household income in France",
    x = "Year",
    y = "Household income per capita per year [constant $]"
  )
# Children per woman in France over time; the shaded bands span 1914-1918
# and 1939-1945.
ggplot(data = dat_france, aes(x = Year, y = Fertility)) +
  ggtitle("Fertility in France") +
  xlab("Year") +
  lims(y = c(0, 5)) +
  ylab("Children per woman") +
  annotate("rect", xmin = 1914, xmax = 1918, ymin = -Inf, ymax = Inf, alpha = .3) +
  annotate("rect", xmin = 1939, xmax = 1945, ymin = -Inf, ymax = Inf, alpha = .3) +
  # Line thickness is set with `linewidth`; `size` is deprecated for lines
  # since ggplot2 3.4.0.
  geom_line(linewidth = 2, color = "red")
- Create a subset of
dat
containing the data for your
country plus all the neighboring countries (if you come from an island, the
nearest countries…). For me, dat_france_region
will contain
data from France, Spain, Italy, Switzerland, Germany, Luxembourg and
Belgium.
# Rows of dat for France and the other countries listed in the exercise text.
dat_france_region <- filter(
  dat,
  Country %in% c(
    "France", "Spain", "Italy", "Switzerland",
    "Germany", "Luxembourg", "Belgium"
  )
)
- Plot again income and fertility as a function of the years, but add
a color corresponding to the country and a point size to its
population:
# Income over time for the regional subset: colour encodes the country,
# point size encodes its population.
ggplot(
  dat_france_region,
  aes(x = Year, y = Income, col = Country, size = Population)
) +
  labs(
    title = "Household income in France and its region",
    x = "Year",
    y = "Household income per capita per year [constant $]"
  ) +
  annotate("rect", xmin = 1914, xmax = 1918, ymin = -Inf, ymax = Inf, alpha = .3) +
  annotate("rect", xmin = 1939, xmax = 1945, ymin = -Inf, ymax = Inf, alpha = .3) +
  geom_point(alpha = 0.5)
# Same encoding for fertility, with the y axis limited to 0-5.
ggplot(
  dat_france_region,
  aes(x = Year, y = Fertility, col = Country, size = Population)
) +
  labs(
    title = "Fertility in France and its region",
    x = "Year",
    y = "Children per woman"
  ) +
  lims(y = c(0, 5)) +
  annotate("rect", xmin = 1914, xmax = 1918, ymin = -Inf, ymax = Inf, alpha = .3) +
  annotate("rect", xmin = 1939, xmax = 1945, ymin = -Inf, ymax = Inf, alpha = .3) +
  geom_point(alpha = .5)
- Load the library
plotly
and make the previous graphs
interactive. You can make an interactive graph by calling
ggplotly()
, like this:
library(plotly)
# Build a static scatter plot of income against population for France ...
P <- ggplot(dat_france, aes(x = Population, y = Income)) +
  geom_point()
# ... and convert it to an interactive plotly graph.
# Adding dynamicTicks = TRUE allows the ticks to be redrawn when zooming in.
ggplotly(P)
- Finally, you can add a slider to the interactive graph that lets you
select a value for another variable (just like in the video) by
adding the keyword
frame =
in the chart’s aesthetics. So
now, make the graph from the video! (You can also add the aesthetic
id=Country
to show the country name in the popup when
hovering over a point.)
library(plotly)
# Fertility against income for every country: `frame` produces one animation
# frame per year, colour encodes the dominant religion and point size the
# population.
P <- ggplot(
  dat,
  aes(
    x = Income,
    y = Fertility,
    frame = Year,
    col = Religion,
    size = Population,
    id = Country
  )
) +
  geom_point(alpha = 0.5) +
  labs(
    title = "Fertility vs. Income in the World",
    x = "Household income per capita per year [constant $]",
    y = "Children per woman"
  ) +
  lims(y = c(0, 8)) +
  scale_size(range = c(1, 15))
# Convert to plotly, then fix both axis ranges and switch x to a log axis.
# NOTE(review): on a plotly log axis the range is expected in log10 units,
# i.e. list(2, 5.5) would span 10^2 to 10^5.5 - confirm against the plotly docs.
ggplotly(P, dynamicTicks = TRUE) |>
  layout(
    yaxis = list(autorange = FALSE),
    xaxis = list(autorange = FALSE, type = "log", range = list(2, 5.5))
  )
---
title : "R Exercises - Religion and babies"
date  : "`r Sys.Date()`"
output: 
    html_document:
        toc            : true
        toc_float      : true
        toc_depth      : 4
        highlight      : tango
        number_sections: true
        code_download  : true
params: 
    solution:
        value: true
---

```{r echo=FALSE, warning=FALSE, message=FALSE, fig.align="center"}
# Render a download button that links to ./Archive.zip.
library(downloadthis)
download_link(
  link = "./Archive.zip",
  button_label = "Download Data Files",
  output_name = "Data Files",
  button_type = "default",
  icon = "fa fa-save",
  has_icon = TRUE,
  self_contained = FALSE
)
```
<br>

----

In these exercises we will see the power of the libraries `ggplot2` and `plotly` to make sense of statistical data. The goal is to reproduce the moving chart that you can see in this video from Hans Rosling -- I invite you to watch his other videos, they are quite enlightening and inspiring:

<div style="text-align: center;">
<a href="https://www.ted.com/talks/hans_rosling_religions_and_babies?language=en" target="_blank">
<i class="fab fa-youtube fa-5x"></i>
</a>
</div>

<br>
<br>

For this, we will need to gather the data:

- From [Gapminder](https://www.gapminder.org/data/), data per country and per year from 1800 to 2018:
    - [The children per woman total fertility](Data/children_per_woman_total_fertility.csv)
    - [The income per capita](Data/income_per_person_gdppercapita_ppp_inflation_adjusted.csv)
    - [The total population](Data/population_total.csv)
- From the [PEW research center](https://www.pewforum.org/2015/04/02/religious-projection-table/2010/percent/all/), data per country:
    + [The religious composition](Data/religion.csv)

------- 

# Data handling

The first thing to do is to load and regroup all these datasets into a single one.

1. Load the `tidyverse` library and, using `read_csv()`, load the 4 datasets in 4 separate tibbles called `children`, `income`, `pop` and `religion`.

```{r include=params$solution, warning = FALSE, message=FALSE, cache=FALSE}
# Attach the tidyverse, which provides read_csv() and the dplyr/tidyr verbs used below.
library(tidyverse)
# NOTE(review): readxl is attached, but no readxl function is called in the visible document - confirm it is still needed.
library(readxl)
# Read each CSV file into its own tibble.
children <- read_csv("Data/children_per_woman_total_fertility.csv")
income   <- read_csv("Data/income_per_person_gdppercapita_ppp_inflation_adjusted.csv")
pop      <- read_csv("Data/population_total.csv")
religion <- read_csv("Data/religion.csv")
```

2. To reproduce the chart on the video, we need to determine the dominant religion in each country. In the `religion` dataset, add a column `Religion` that will give the name of the dominant religion for each country. For this, you need to make the table contain just the `Country` and all religions, make the table tidy, and then select the religion with the highest proportion for each country. We will filter the data to get only the year 2020.

```{r include=params$solution, warning = FALSE, message=FALSE, cache=FALSE}
# Dominant religion per country: keep the 2020 rows, reshape the religion
# columns to long form, then retain the row(s) holding the largest
# proportion within each country.
religion <- religion |>
  filter(Year == 2020) |>
  select(Country, Buddhists:Unaffiliated) |>
  pivot_longer(
    cols = -Country,
    names_to = "Religion",
    values_to = "Proportion"
  ) |>
  filter(Proportion == max(Proportion), .by = Country) |>
  select(Country, Religion)
```

3. Using `pivot_longer()`, make all datasets tidy. 

- `children` should now contain 3 columns: `Country`, `Year` and `Fertility`. 
- `income` should now contain 3 columns: `Country`, `Year` and `Income`. 
- `pop` should now contain 3 columns: `Country`, `Year` and `Population`. 

We will only consider data from 1900 to 2018. Example of syntax using the pipe operator `|>`:

```{r}
# Small wide-format table (one column per year) used to illustrate the syntax.
DF <- read_table("name  2010  2011  2012  2014
Kevin  10    11   12   123
Jane   122   56   23   4
"
)
DF
# Keep the 2010-2012 columns and stack them into (Year, Score) pairs.
# `cols` is spelled out in full; abbreviating it to `col` only works through
# partial argument matching.
DF |> 
    select(name, '2010':'2012') |> 
    pivot_longer(cols=-name,
                 names_to="Year", 
                 values_to="Score",
                 names_transform=list(Year = as.numeric))
```
The line `names_transform=list(Year = as.numeric)` is here to convert the character year values to numerical values.

```{r include=params$solution, warning = FALSE, message=FALSE, cache=FALSE}
# Reshape each wide table into long form (one row per country and year),
# keeping only the 1900-2018 columns.
# `cols` is spelled out in full; abbreviating it to `col` only works through
# partial argument matching.
children <- children |>
  select(Country, '1900':'2018') |>
  pivot_longer(cols=-Country,
               names_to="Year",
               values_to="Fertility",
               names_transform=list(Year = as.numeric))
income <- income |>
  select(Country, '1900':'2018') |>
  pivot_longer(cols=-Country,
               names_to="Year",
               values_to="Income",
               names_transform=list(Year = as.numeric))
pop <- pop |>
  select(Country, '1900':'2018') |>
  pivot_longer(cols=-Country,
               names_to="Year",
               values_to="Population",
               names_transform=list(Year = as.numeric))
```

4. Now we want to combine all these datasets into a single one called `dat`, containing the columns `Country`, `Year`, `Population`, `Religion`, `Fertility` and `Income`. Look into the `inner_join()` function of the `dplyr` library (which is part of the `tidyverse` library). 

```{r include=params$solution, warning = FALSE, message=FALSE, cache=FALSE}
# Combine the four tables. The join keys are stated explicitly so the result
# does not depend on which column names the tables happen to share.
dat <- children |>
  inner_join(income, by = join_by(Country, Year)) |>
  inner_join(pop, by = join_by(Country, Year)) |>
  # religion carries no Year column, so it is matched on Country only
  inner_join(religion, by = join_by(Country))
# write_csv(dat, "Data/dat.csv")
```

You should end up with a dataset like this one:

```{r echo=FALSE, warning = FALSE, message=FALSE, cache=FALSE}
# Print the combined tibble so the expected result is shown in the document.
dat
```

In case you struggled to get there, download the archive with the button at the top and get the `dat` tibble with `dat <- read_csv("Data/dat.csv")`{.R}.

Now our dataset is ready, let's plot it.

# Plotting

1. Load the library `ggplot2` and set the global theme to `theme_bw()` using `theme_set()`

```{r include=params$solution, warning = FALSE, message=FALSE, cache=FALSE}
# ggplot2 is also attached by library(tidyverse); attaching it again is harmless.
library(ggplot2)
# Make theme_bw() the default theme for the plots created after this point.
theme_set(theme_bw())
```

2. Create a subset of `dat` concerning your origin country. For me it will be `dat_france`

```{r include=params$solution, warning = FALSE, message=FALSE, cache=FALSE}
# Rows of dat for France only.
dat_france <- filter(dat, Country == "France")
```

3. Plot the evolution of the income per capita and the number of children per woman as a function of the year, and make it look like this (notice the kinks during the two world wars):

```{r echo=params$solution, warning = FALSE, message=FALSE, cache=FALSE}
# Income per capita in France over time; the shaded bands span 1914-1918
# and 1939-1945.
ggplot(dat_france, aes(x = Year, y = Income)) +
  annotate("rect", xmin = 1914, xmax = 1918, ymin = -Inf, ymax = Inf, alpha = .3) +
  annotate("rect", xmin = 1939, xmax = 1945, ymin = -Inf, ymax = Inf, alpha = .3) +
  geom_point(alpha = 0.2, size = 5) +
  geom_smooth() +
  labs(
    title = "Household income in France",
    x = "Year",
    y = "Household income per capita per year [constant $]"
  )
# Children per woman in France over time; the shaded bands span 1914-1918
# and 1939-1945.
ggplot(data = dat_france, aes(x = Year, y = Fertility)) +
  ggtitle("Fertility in France") +
  xlab("Year") +
  lims(y = c(0, 5)) +
  ylab("Children per woman") +
  annotate("rect", xmin = 1914, xmax = 1918, ymin = -Inf, ymax = Inf, alpha = .3) +
  annotate("rect", xmin = 1939, xmax = 1945, ymin = -Inf, ymax = Inf, alpha = .3) +
  # Line thickness is set with `linewidth`; `size` is deprecated for lines
  # since ggplot2 3.4.0.
  geom_line(linewidth = 2, color = "red")
```

4. Create a subset of `dat` containing the data for your country plus all the neighboring countries (if you come from an island, the nearest countries...). For me, `dat_france_region` will contain data from France, Spain, Italy, Switzerland, Germany, Luxembourg and Belgium.

```{r include=params$solution, warning = FALSE, message=FALSE, cache=FALSE}
# Rows of dat for France and the other countries listed in the exercise text.
dat_france_region <- filter(
  dat,
  Country %in% c(
    "France", "Spain", "Italy", "Switzerland",
    "Germany", "Luxembourg", "Belgium"
  )
)
```

5. Plot again income and fertility as a function of the years, but add a color corresponding to the country and a point size to its population:

```{r echo=params$solution, warning = FALSE, message=FALSE, cache=FALSE}
# Income over time for the regional subset: colour encodes the country,
# point size encodes its population.
ggplot(
  dat_france_region,
  aes(x = Year, y = Income, col = Country, size = Population)
) +
  labs(
    title = "Household income in France and its region",
    x = "Year",
    y = "Household income per capita per year [constant $]"
  ) +
  annotate("rect", xmin = 1914, xmax = 1918, ymin = -Inf, ymax = Inf, alpha = .3) +
  annotate("rect", xmin = 1939, xmax = 1945, ymin = -Inf, ymax = Inf, alpha = .3) +
  geom_point(alpha = 0.5)
# Same encoding for fertility, with the y axis limited to 0-5.
ggplot(
  dat_france_region,
  aes(x = Year, y = Fertility, col = Country, size = Population)
) +
  labs(
    title = "Fertility in France and its region",
    x = "Year",
    y = "Children per woman"
  ) +
  lims(y = c(0, 5)) +
  annotate("rect", xmin = 1914, xmax = 1918, ymin = -Inf, ymax = Inf, alpha = .3) +
  annotate("rect", xmin = 1939, xmax = 1945, ymin = -Inf, ymax = Inf, alpha = .3) +
  geom_point(alpha = .5)
```

6. Load the library `plotly` and make the previous graphs interactive. You can make an interactive graph by calling `ggplotly()`, like this:

```{r include=TRUE, warning = FALSE, message=FALSE, cache=FALSE}
library(plotly)
# Build a static scatter plot of income against population for France ...
P <- ggplot(dat_france, aes(x = Population, y = Income)) +
  geom_point()
# ... and convert it to an interactive plotly graph.
# Adding dynamicTicks = TRUE allows the ticks to be redrawn when zooming in.
ggplotly(P)
```


7. Finally, you can add a slider to the interactive graph that lets you select a value for another variable (just like in the video) by adding the keyword `frame =` in the chart's aesthetics. So now, make the graph from the video! (You can also add the aesthetic `id=Country` to show the country name in the popup when hovering over a point.)


```{r include=params$solution, warning = FALSE, message=FALSE, cache=FALSE}
library(plotly)
# Fertility against income for every country: `frame` produces one animation
# frame per year, colour encodes the dominant religion and point size the
# population.
P <- ggplot(
  dat,
  aes(
    x = Income,
    y = Fertility,
    frame = Year,
    col = Religion,
    size = Population,
    id = Country
  )
) +
  geom_point(alpha = 0.5) +
  labs(
    title = "Fertility vs. Income in the World",
    x = "Household income per capita per year [constant $]",
    y = "Children per woman"
  ) +
  lims(y = c(0, 8)) +
  scale_size(range = c(1, 15))
# Convert to plotly, then fix both axis ranges and switch x to a log axis.
# NOTE(review): on a plotly log axis the range is expected in log10 units,
# i.e. list(2, 5.5) would span 10^2 to 10^5.5 - confirm against the plotly docs.
ggplotly(P, dynamicTicks = TRUE) |>
  layout(
    yaxis = list(autorange = FALSE),
    xaxis = list(autorange = FALSE, type = "log", range = list(2, 5.5))
  )
```
