-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_wb_data.R
133 lines (125 loc) · 5.54 KB
/
get_wb_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
library(wbstats)
library(dplyr)
library(tidyr)
library(jsonlite)
library(stringr)
# Retrieve gender indicators from the World Bank API.
# `cache` is passed to wb_data() here and is read again further down for the
# indicator descriptions; `countries` is joined in later for region and
# income level.
cache <- wb_cache()
countries <- wb_countries()

# Indicator codes to request. Apart from total population, each topic is a
# female (FE) / male (MA) pair.
indicators <- c(
  'SP.POP.TOTL',                                   # Population
  'SL.TLF.CACT.FE.NE.ZS', 'SL.TLF.CACT.MA.NE.ZS',  # Labour force participation
  'SP.DYN.LE00.FE.IN', 'SP.DYN.LE00.MA.IN',        # Life expectancy
  'SL.UEM.TOTL.FE.ZS', 'SL.UEM.TOTL.MA.ZS',        # Unemployment
  'SP.DYN.IMRT.FE.IN', 'SP.DYN.IMRT.MA.IN'         # Infant mortality
)

# Request everything from 2000 up to the current calendar year in long
# format (return_wide = FALSE).
gender_data <- wb_data(
  indicator = indicators,
  country = 'countries_only',
  start_date = 2000,
  end_date = format(Sys.Date(), '%Y'),
  return_wide = FALSE,
  cache = cache
)
# Keep, for every country/indicator pair, only the row(s) carrying the most
# recent date that has a non-missing value; the footnote column is dropped.
wb_data_latest <- gender_data %>%
  filter(!is.na(value)) %>%
  group_by(country, indicator_id) %>%
  filter(date == max(date)) %>%
  ungroup() %>%
  select(-footnote)
# Workaround for indicators missing iso codes: these are read straight from
# the JSON endpoint instead of through wb_data(), and rows are matched to the
# countries already present in wb_data_latest by country name.
country_names <- unique(wb_data_latest$country)

indicators_pt2 <- c(
  'SP.DYN.SMAM.FE', 'SP.DYN.SMAM.MA',                  # Age at first marriage
  'SH.DTH.INJR.1534.FE.ZS', 'SH.DTH.INJR.1534.MA.ZS'   # Injury
)
url_body <- 'https://api.worldbank.org/v2/en/country/all/indicator/'

gender_data_pt2 <- NULL
for (indicator in indicators_pt2) {
  request_url <- paste0(url_body, indicator, '?format=json&per_page=20000')

  # `country` is used as a nested column here (country$value holds the name).
  # For each country keep the latest non-missing observation since 2000.
  latest_rows <- as.data.frame(fromJSON(request_url)) %>%
    filter(complete.cases(value),
           country$value %in% country_names,
           date >= 2000) %>%
    group_by(country) %>%
    filter(date == max(date)) %>%
    ungroup()

  # Newest result goes first, ahead of what was accumulated so far.
  gender_data_pt2 <- bind_rows(latest_rows, gender_data_pt2)
}
# Flatten the nested `indicator` and `country` columns into the column names
# used by wb_data_latest. Order matters inside mutate(): indicator_id and
# iso3c are taken from the nested columns before those columns are
# overwritten with their plain `value` field.
# NOTE(review): country$id is stored as iso3c - confirm the endpoint returns
# a 3-letter code in that field for these indicators.
gender_data_pt2 <- gender_data_pt2 %>%
  mutate(
    indicator_id = indicator$id,
    indicator = indicator$value,
    iso3c = country$id,
    country = country$value
  ) %>%
  select(indicator_id, indicator, country, iso3c, date, value)

# Dates are converted to character before the two sources are stacked.
wb_data_latest <- wb_data_latest %>%
  mutate(date = as.character(date)) %>%
  bind_rows(gender_data_pt2)
# Reshape so male and female are columns.
# If years are different for male and female data, paste together with a slash.

# Population is kept aside (it has no male/female split) and joined back later.
population_latest <- wb_data_latest %>%
  filter(indicator_id == 'SP.POP.TOTL') %>%
  select(iso3c, date, value)
gender_latest <- filter(wb_data_latest, indicator_id != 'SP.POP.TOTL')

gender_reshape <- gender_latest %>%
  # Attach the long description; the key is stated explicitly so the join
  # cannot silently pick up additional shared columns.
  left_join(select(cache$indicators, indicator_id, indicator_desc),
            by = 'indicator_id') %>%
  # 'female' must be tested first because ' male' is checked second and the
  # first matching case wins.
  mutate(Sex = case_when(
    str_detect(indicator, 'female') ~ 'female',
    str_detect(indicator, ' male') ~ 'male'
  )) %>%
  # Strip the sex wording so both rows of a pair share one indicator name.
  mutate_at(c('indicator', 'indicator_desc'),
            str_remove_all, ' males| females|, female|, male| female| male') %>%
  # Collapse any run of whitespace left behind by the removal into a single
  # space (the previous pattern replaced a space with a space, i.e. nothing).
  mutate(indicator = str_replace_all(indicator, '\\s+', ' ')) %>%
  group_by(country, indicator) %>%
  # One year if male and female agree, otherwise both joined with a slash.
  mutate(indicator_year =
           if (n_distinct(date) == 1) {
             date[1]
           } else {
             paste0(date, collapse = '/')
           }) %>%
  group_by(country) %>%
  # Keep only countries with a row for every requested indicator except
  # total population (hence the - 1).
  filter(n() == length(indicators) + length(indicators_pt2) - 1) %>%
  ungroup() %>%
  select(-indicator_id, -date) %>%
  spread(Sex, value)
# Join gender data, population data and country attributes (region, income
# level); add short names for indicators.

# Full indicator name -> short display name. Names not listed here are left
# unchanged by recode().
short_name_lookup <- c(
  'Mortality rate, infant (per 1,000 live births)' =
    'Infant mortality rate',
  'Cause of death, by injury, ages 15-34 (% of relevant age group)' =
    'Deaths from injury, ages 15-34',
  'Unemployment (% of labor force) (modeled ILO estimate)' =
    'Unemployment rate',
  'Life expectancy at birth (years)' =
    'Life expectancy',
  'Labor force participation rate (% of population ages 15+) (national estimate)' =
    'Labor force participation rate'
)

# Income levels in ascending order; any other value becomes NA in the factor.
income_level_order <- c('Low income', 'Lower middle income',
                        'Upper middle income', 'High income')

bubble_data <- population_latest %>%
  rename(population_year = date, population = value) %>%
  right_join(gender_reshape) %>%
  left_join(select(countries, iso3c, region, income_level)) %>%
  mutate_at(c('male', 'female'), round, 1) %>%
  mutate(short_name = recode(indicator, !!!short_name_lookup)) %>%
  arrange(short_name) %>%
  mutate(income_level = factor(income_level, levels = income_level_order))
# Split off indicator descriptions: one row per distinct
# short name / description combination.
descriptions <- distinct(select(bubble_data, short_name, indicator_desc))
# Add a column for the dotted parity line - the limits
# of the parity line for each indicator set the plot limits.
bubble_countries <- unique(bubble_data$country)
bubble_indicators <- unique(bubble_data$short_name)

# One row per indicator; every indicator starts at 0-100 and selected ones
# are overridden below.
axes_lims <- matrix(
  rep(c(0, 100), times = length(bubble_indicators)),
  ncol = 2,
  byrow = TRUE,
  dimnames = list(bubble_indicators, c('min', 'max'))
)
axes_lims['Age at first marriage', ] <- c(16, 36)
axes_lims['Infant mortality rate', 'max'] <- 90
axes_lims['Life expectancy', ] <- c(50, 90)
axes_lims['Unemployment rate', 'max'] <- 40

# For each indicator, an evenly spaced sequence from its min to its max with
# one value per country, concatenated in indicator order.
line_column <- unlist(lapply(bubble_indicators, function(ind) {
  seq(axes_lims[ind, 'min'], axes_lims[ind, 'max'],
      length.out = length(bubble_countries))
}))
# Sort so rows are grouped by indicator (the same grouping line_column was
# built in), attach the parity-line values and the retrieval date, and drop
# the columns that are not saved.
bubble_data <- bubble_data %>%
  arrange(short_name, income_level, region) %>%
  mutate(parity_line = line_column,
         get_date = format(Sys.Date(), '%B %d, %Y')) %>%
  select(-indicator_desc, -iso2c, -unit, -obs_status, -last_updated)

# Make sure the output directory exists so the save does not fail after all
# the data has been fetched; this is a no-op when it is already there.
dir.create('data', showWarnings = FALSE, recursive = TRUE)
saveRDS(bubble_data, 'data/bubble_data.rds')
saveRDS(descriptions, 'data/descriptions.rds')