The data this week comes from nu3 and was contributed by Kasia Kulma.
Kasia has put together a great guide on webscraping along with data cleaning and organization! Make sure to check out her blog post, and the raw code is duplicated as part of the cleaning script.
# Get the Data
# Read the CSV straight from the TidyTuesday repository.
# NOTE(review): the address was blank in this copy; it has been rebuilt from
# the dataset date (2020-02-18) and file name -- confirm it resolves.
food_consumption <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-02-18/food_consumption.csv"
)

# Or read in with the tidytuesdayR package.
# PLEASE NOTE: to use 2020 data you need a recent tidytuesdayR from GitHub
# (the exact version number was lost from this note).
# Install via pak::pak("dslc-io/tidytuesdayR")

# Either an ISO-8601 date or a year/week pair works -- the two calls below are
# alternatives that load the same week.
tuesdata <- tidytuesdayR::tt_load("2020-02-18")
tuesdata <- tidytuesdayR::tt_load(2020, week = 8)

food_consumption <- tuesdata$food_consumption
|variable      |class     |description                       |
|:-------------|:---------|:---------------------------------|
|country       |character |Country Name                      |
|food_category |character |Food Category                     |
|consumption   |double    |Consumption (kg/person/year)      |
|co2_emmission |double    |CO2 Emission (kg CO2/person/year) |
# Credit to Kasia; minorly edited to create an output file and a test plot.
# Her blog post walks through these steps (the link is not preserved here).

# NOTE(review): the page address is blank in this copy -- set `url` to the
# page to scrape before running.
url <- ""
if (!nzchar(url)) {
  stop("`url` is empty: set it to the page to scrape.", call. = FALSE)
}

# scrape the website
url_html <- read_html(url)

# extract the HTML table
# html_table() on a node set yields one data frame per <table> node, so a
# single element has to be pulled out before the dplyr verbs below can run.
# NOTE(review): taking the first table is an assumption -- confirm on the page.
whole_table <- url_html %>%
  html_nodes("table") %>%
  html_table(fill = TRUE) %>%
  .[[1]]

table_content <- whole_table %>%
  select(-X1) %>% # remove redundant column
  filter(!dplyr::row_number() %in% 1:3) # remove redundant rows
# Collect the header labels attached to the table's header icons.
# NOTE(review): the step after html_nodes() was missing (dangling pipe).
# Reading the `title` attribute is assumed here -- confirm against the page
# markup.
raw_headers <- url_html %>%
  html_nodes(".thead-icon") %>%
  html_attr("title")

# Everything after the first 27 entries is the bottom (metric) header row.
# Negative indexing stays well-defined even if fewer than 28 entries came back,
# unlike 28:length(raw_headers), which would count backwards.
tidy_bottom_header <- raw_headers[-seq_len(27)]

# Entries 17-27 are the middle (food category) header row.
raw_middle_header <- raw_headers[17:27]

# Each food category spans two metric columns, hence `each = 2`.
# NOTE(review): the three "*_total" labels are reconstructed -- the closing of
# this c() call was missing. They are needed so that the downstream filter on
# "total" has something to drop; confirm the names and positions against the
# page.
tidy_headers <- c(
  rep(raw_middle_header[1:7], each = 2),
  "animal_total",
  rep(raw_middle_header[8:length(raw_middle_header)], each = 2),
  "non_animal_total",
  "country_total"
)

# paste() recycles silently, so fail loudly if the two header rows disagree.
stopifnot(length(tidy_headers) == length(tidy_bottom_header))

# "<food category>;<metric>" -- the ';' is split on again when reshaping.
combined_colnames <- paste(tidy_headers, tidy_bottom_header, sep = ";")
colnames(table_content) <- c("Country", combined_colnames)
glimpse(table_content[, 1:10])
long_table <- table_content %>%
  # make column names observations of Category variable
  tidyr::pivot_longer(cols = -Country, names_to = "Category", values_to = "Values") %>%
  # separate food-related information from the metric
  tidyr::separate(col = Category, into = c("Food Category", "Metric"), sep = ";")

# One column per metric. The identifier columns are renamed to snake_case here
# because every later step refers to `country` and `food_category`.
tidy_table <- long_table %>%
  tidyr::pivot_wider(names_from = Metric, values_from = Values) %>%
  rename(country = Country, food_category = `Food Category`)

# The two metric columns are renamed by position (3rd and 4th). The
# `co2_emmission` spelling is kept because it is the published column name.
final_table <- tidy_table %>%
  rename(
    consumption = 3,
    co2_emmission = 4
  ) %>%
  # drop the aggregate rows, keeping only individual food categories
  filter(!stringr::str_detect(food_category, "total"))

# Convert the scraped text values to numbers.
clean_table <- final_table %>%
  mutate(across(c(consumption, co2_emmission), parse_number))

clean_table %>%
  write_csv(here::here("2020/2020-02-18", "food_consumption.csv"))
clean_table %>%
ggplot(aes(x = fct_reorder(food_category, consumption), y = consumption, color = country)) +
geom_jitter() +
theme(legend.position = "none") +