diff --git a/.gitignore b/.gitignore index e75435c..fd984d4 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ # RStudio files .Rproj.user/ +*.Rproj # produced vignettes vignettes/*.html @@ -47,3 +48,7 @@ po/*~ # RStudio Connect folder rsconnect/ + +# Intellij Idea files +.idea/ +.DS_Store diff --git a/2023birthregistrations.xlsx b/2023birthregistrations.xlsx new file mode 100644 index 0000000..533d500 Binary files /dev/null and b/2023birthregistrations.xlsx differ diff --git a/2023birthsbyparentscountryofbirth.xlsx b/2023birthsbyparentscountryofbirth.xlsx new file mode 100644 index 0000000..694e44e Binary files /dev/null and b/2023birthsbyparentscountryofbirth.xlsx differ diff --git a/INSTRUCTION.md b/INSTRUCTION.md new file mode 100644 index 0000000..fe2c950 --- /dev/null +++ b/INSTRUCTION.md @@ -0,0 +1,11 @@ +## Instructions on running the code + +1. Install R programming language +2. Install IDE that supports R programming language (e.g., RStudio) +3. Clone this GitHub repository to your local environment +4. With your preferred IDE, open the directory where you clone the repository[^1] +5. Install the required R packages +6. Run the commands in any of the four R script files from top to bottom[^2] + +[^1]: Make sure that you do not change the folder structure +[^2]: Each R script file runs independently diff --git a/box_plot.R b/box_plot.R new file mode 100644 index 0000000..9a368cd --- /dev/null +++ b/box_plot.R @@ -0,0 +1,111 @@ +# ======================================= Load Libraries and Data ======================================= + +# Import required libraries +library(tidyverse) +library(readxl) +library(ggstatsplot) +library(MetBrewer) + +# Load data from file 2023birthregistrations.xlsx from sheet "Table_9" and start read the data from the sixth row +# This is England and Wales data in 2023 +imdStillBirth2023EW <- read_excel("2023birthregistrations.xlsx", sheet = "Table_9", skip = 5) + +# Load data from file cim2022deathcohortworkbook.xlsx from sheet "21" and start read the data from the eight row +# This is England data from 2010 to 2022 +imdStillBirth2022E <- read_excel("cim2022deathcohortworkbook.xlsx", sheet = "21", skip = 7) + +# Load data from file cim2022deathcohortworkbook.xlsx from sheet "25" and start read the data from the tenth row +# This is Wales data from 2010 to 2022 +imdStillBirth2022W <- read_excel("cim2022deathcohortworkbook.xlsx", sheet = "25", skip = 9) + +# ======================================= Data Pre-Processing ======================================= + +# For table imdStillBirth2023EW, add column Year with value 2023 +imdStillBirth2023EW <- imdStillBirth2023EW %>% mutate(Year = 2023) + +# For table imdStillBirth2023EW, rename column IMD Decile to IMD +imdStillBirth2023EW <- imdStillBirth2023EW %>% rename(IMD = `IMD Decile`) + +# Drop all columns besides Year, IMD and Stillbirths +imdStillBirth2023EW <- imdStillBirth2023EW %>% select(`Year`, `IMD`, Stillbirths) +imdStillBirth2022E <- imdStillBirth2022E %>% select(Year, IMD, Stillbirths) +imdStillBirth2022W <- imdStillBirth2022W %>% select(Year, IMD, Stillbirths) + +# Merge the data from all three tables +imdStillBirth <- rbind(imdStillBirth2022E, imdStillBirth2022W, imdStillBirth2023EW) + +# Remove rows with values "All deciles" or "Total" in column IMD +imdStillBirth <- imdStillBirth %>% filter(`IMD` != "All deciles" & `IMD` != "Total") + +# Convert the data in column IMD to numeric class +imdStillBirth <- imdStillBirth %>% mutate(across(2, as.numeric)) + +# Sum the data based on column Year and IMD +imdStillBirth <- imdStillBirth %>% + group_by(Year, IMD) %>% + summarise(across(everything(), sum)) + +# ======================================= Data Visualisation ======================================= + +# Generate combination of box plot, violin plot and jitter plot +ggbetweenstats( + data = imdStillBirth, + x = IMD, + y = Stillbirths, + title = "Number of stillbirths by IMD decile", + xlab = "Index of Multiple Deprivation", + ylab = "Number of Stillbirths", + package = "MetBrewer", + palette = "Redon", + type = "np", + centrality.point.args = list(size = 0), + point.args = list( + position = position_jitterdodge(dodge.width = 0.7), + alpha = 0.7, + size = 3.5, + stroke = 0 + ), + boxplot.args = list( + width = 0.2, + alpha = 0.3, + fill = "grey85", + colour = "black", + linewidth = 0.7 + ), + violin.args = list( + width = 0.67, + alpha = 0.1, + colour = "grey30", + linetype = 5 + ), + partial = FALSE, + results.subtitle = FALSE +) + + geom_segment( + data = imdStillBirth %>% + group_by(IMD) %>% + summarise(median = median(Stillbirths)), + aes( + x = IMD - 0.1, + xend = IMD + 0.1, + y = median, + yend = median + ), + colour = "#BF2F24", + size = 1.3 + ) + + coord_cartesian( + ylim = c(100, 670), + xlim = c(1, 10.1) + ) + + theme( + panel.grid.major.x = element_line(color = "grey95"), + panel.grid.major.y = element_line(color = "grey90"), + panel.grid.minor.y = element_line(linetype = 3, color = "grey50"), + plot.title = element_text(face = "bold", size = 30, hjust = 0.5, margin = margin(b = 20)), + axis.title.x = element_text(size = 22, margin = margin(t = 20)), + axis.text.x = element_text(face = "bold", size = 15), + axis.title.y = element_text(size = 22, margin = margin(r = 20)), + axis.text.y = element_text(face = "bold", size = 15), + plot.margin = margin(l = 25, r = -8, b = 15, t = 25) + ) diff --git a/butterfly_chart.R b/butterfly_chart.R new file mode 100644 index 0000000..5e21660 --- /dev/null +++ b/butterfly_chart.R @@ -0,0 +1,156 @@ +# ======================================= Load Libraries and Data ======================================= + +# Import required libraries +library(tidyverse) +library(readxl) +library(reshape2) +library(ggtext) + +# Load data from file 2023birthsbyparentscountryofbirth.xlsx from sheet "Table_2a" +# and start read the data from the ninth row +parentsCountryOfBirth <- read_excel("2023birthsbyparentscountryofbirth.xlsx", sheet = "Table_2a", skip = 8) + +# ======================================= Data Pre-Processing ======================================= + +# Remove all but first row +parentsCountryOfBirth <- parentsCountryOfBirth[1,] + +# Get all column name that contain "Percentage of all live births" +selectedColumnName <- grep("Percentage of all live births", colnames(parentsCountryOfBirth)) + +# Create a list of number from 2023 to 2003 decrement by 5 +selectedYears <- 2023 - 5 * (0:4) + +# Filter only columns with year in selectedYears +selectedColumnName <- colnames(parentsCountryOfBirth[, selectedColumnName]) %>% + str_subset(paste(selectedYears, collapse = "|")) + +# Remove all columns except the columns in selectedColumnName +parentsCountryOfBirth <- parentsCountryOfBirth %>% select(all_of(selectedColumnName)) + +# Flip the column to become row +parentsCountryOfBirth <- t(parentsCountryOfBirth) + +# Rename column name to "Non_UK" +colnames(parentsCountryOfBirth) <- "Non_UK" + +# Rename row to number from 1 to 5 +rownames(parentsCountryOfBirth) <- c(1:5) + +# Convert above matrix to data frame +parentsCountryOfBirth <- as.data.frame(parentsCountryOfBirth) + +# Add new column named Year with value from selectedYears +parentsCountryOfBirth <- parentsCountryOfBirth %>% + mutate(Year = selectedYears) + +# Change value in column "Non_UK" to numeric +parentsCountryOfBirth <- parentsCountryOfBirth %>% + mutate(across(Non_UK, as.numeric)) + +# Round all value in column "Non_UK" to 1 decimal place +parentsCountryOfBirth <- parentsCountryOfBirth %>% + mutate(across(Non_UK, ~round(., 1))) + +# Create new column named "UK" with value "Non_UK" - 100 +parentsCountryOfBirth <- parentsCountryOfBirth %>% + mutate(`UK` = `Non_UK` - 100) + +# Transpose the data frame +parentsCountryOfBirth <- melt(parentsCountryOfBirth, id.vars = "Year") + +# Sort the data frame by Year +parentsCountryOfBirth <- parentsCountryOfBirth %>% + arrange(Year) + +# Update column name +colnames(parentsCountryOfBirth) <- c("Year", "Country of Birth", "Percentage") + +# Create new table that only consists of data with "Non_UK" countries +countryNonUK <- subset(parentsCountryOfBirth, `Country of Birth` == "Non_UK") + +# Create new table that only consists of data with "UK" countries +countryUK <- subset(parentsCountryOfBirth, `Country of Birth` == "UK") + +# Update the Percentage value to positive +countryUK$Percentage <- abs(countryUK$Percentage) + +# ======================================= Data Visualisation ======================================= + +# Generate butterfly chart +ggplot(parentsCountryOfBirth, aes(x = Year, color = `Country of Birth`)) + + geom_linerange( + data = parentsCountryOfBirth[parentsCountryOfBirth$`Country of Birth` == "UK",], + aes(ymin = -2, ymax = -2 + `Percentage` + 66), + linewidth = 20 + ) + + geom_linerange(data = parentsCountryOfBirth[parentsCountryOfBirth$`Country of Birth` == "Non_UK",], + aes(ymin = 2, ymax = 2 + `Percentage` - 16), + linewidth = 20 + ) + + geom_label( + aes(x = Year, y = 0, label = Year), + inherit.aes = F, + fontface = "bold", + size = 8, + label.padding = unit(0.0, "lines"), + label.size = 0, + fill = "#ffffff", + color = "black" + ) + + geom_text( + data = countryNonUK, + aes(x = Year, y = 2, label = paste0(Percentage, "%")), + nudge_y = 0.37, + family = "Arial Narrow", + fontface = "bold", + colour = "white", + hjust = 0, + size = 6.5 + ) + + geom_text( + data = countryUK, + aes(x = Year, y = -2, label = paste0(Percentage, "%")), + nudge_y = -0.37, + family = "Arial Narrow", + fontface = "bold", + colour = "white", + hjust = 1, + size = 6.5 + ) + + scale_color_manual( + name = "", + values = c(`UK` = "#7B2C3CFF", `Non_UK` = "#294F5EFF"), + labels = c("`UK`", "Non_UK") + ) + + scale_x_reverse( + breaks = c(seq(2003, 2023, 5)) + ) + + scale_y_continuous( + limits = c(-17.8, 17.8), + breaks = c(c(-16, -12, -8, -4, 0) + -2, c(0, 4, 8, 12, 16) + 2), + labels = c("82", "78", "74", "70", "66", "16", "20", "24", "28", "32") + ) + + coord_flip() + + labs( + title = "Live birth percentage by mother's country of birth", + subtitle = "Red bar represents \"UK\" countries. Blue bar + represents \"Non-UK\" countries.", + x = "Number of Live Births", + y = "Year" + ) + + theme_minimal() + + theme( + legend.position = "none", + plot.title = element_text(face = "bold", size = 28, hjust = 0, margin = margin(l = 55, b = 12)), + plot.subtitle = element_markdown(size = 19, hjust = 0, margin = margin(l = 55, b = 23), color = "grey35"), + panel.grid.major.x = element_line(linetype = 5, color = "grey83"), + panel.grid.minor.x = element_blank(), + panel.grid.major.y = element_blank(), + panel.grid.minor.y = element_blank(), + axis.title = element_blank(), + axis.text.x = element_text(face = "bold", size = 18.5, color = "black", margin = margin(t = 15)), + axis.text.y = element_blank(), + plot.margin = margin(l = 0, r = 0, b = 20, t = 30), + ) diff --git a/cim2022deathcohortworkbook.xlsx b/cim2022deathcohortworkbook.xlsx new file mode 100644 index 0000000..0af7b0c Binary files /dev/null and b/cim2022deathcohortworkbook.xlsx differ diff --git a/dumbbell_plot.R b/dumbbell_plot.R new file mode 100644 index 0000000..22f9c24 --- /dev/null +++ b/dumbbell_plot.R @@ -0,0 +1,180 @@ +# ======================================= Load Libraries and Data ======================================= + +# Import required libraries +library(tidyverse) +library(readxl) +library(ggtext) +library(patchwork) + +# Load data from file 2023birthregistrations.xlsx from sheet "Table_10" and start read the data from the sixth row +maternalAge <- read_excel("2023birthregistrations.xlsx", sheet = "Table_10", skip = 5) + +# ======================================= Data Pre-Processing ======================================= + +# Select rows where country is "England, Wales and Elsewhere" and parent is "Mother" +# and year is greater than or equal to "2004" +maternalAge <- maternalAge %>% filter(Country == "England, Wales and Elsewhere" & + Parent == "Mother" & + Year >= 2004) + +# Drop the columns for country, parent and age-specific fertility rate +maternalAge <- maternalAge %>% select(-Country, -Parent, -`Age-specific fertility rate`) + +# Update data type for column "Number of live births" to numeric +maternalAge <- maternalAge %>% mutate(across(3, as.numeric)) + +# Merge data for value "Under 20" and "20 to 24" and "25 to 29" to become "Under 30" +maternalAge <- maternalAge %>% mutate(`Age group (years)` = + ifelse(`Age group (years)` %in% c("Under 20", "20 to 24", "25 to 29"), "Under 30", "30 and over")) + +# Sum the data based on "Age group (years)" and "Year" +maternalAge <- maternalAge %>% + group_by(Year, `Age group (years)`) %>% + summarise(across(everything(), sum)) + +# Create new column "30 and over" and "Under 30" based on value in column "Number of live births" +maternalAgeFlip <- maternalAge %>% + mutate( + `30 and over` = ifelse(`Age group (years)` == "30 and over", `Number of live births`, 0), + `Under 30` = ifelse(`Age group (years)` == "Under 30", `Number of live births`, 0) + ) + +# Drop column "Age group (years)" and "Number of live births" +maternalAgeFlip <- maternalAgeFlip %>% select(-`Age group (years)`, -`Number of live births`) + +# Merge 2 rows of data based on column "Year" +maternalAgeFlip <- maternalAgeFlip %>% + group_by(Year) %>% + summarise(across(everything(), sum)) + +# Create new column "Gap", to store the gap between the live birth counts of the two age groups in same year +maternalAge <- maternalAge %>% + group_by(Year) %>% + mutate(Gap = abs((sum(`Number of live births`) - 2 * `Number of live births`))) + +# Create new column "Max", to store the maximum value of "Number of live births" for each year +maternalAge <- maternalAge %>% + group_by(Year) %>% + mutate(Max = max(`Number of live births`)) + +# Create new column "Gap", to store the gap between the live birth counts of the two age groups in same year +maternalAgeFlip <- maternalAgeFlip %>% + mutate(Gap = `Under 30` - `30 and over`) + +# Create new column "Max", to store the maximum value between column "Under 30" and column "30 and over" for each year +maternalAgeFlip <- maternalAgeFlip %>% + mutate(Max = pmax(`Under 30`, `30 and over`)) + +# ======================================= Data Visualisation ======================================= + +# Visualise the dumbbell plot and store it to mainPlot variable +mainPlot <- + maternalAge %>% + ggplot(aes(x = `Number of live births`, y = Year)) + + geom_line(aes(group = Year), color = "#E7E7E7", linewidth = 3.5) + + geom_point(aes(color = `Age group (years)`), size = 8) + + geom_text( + aes(label = `Number of live births`), + color = ifelse( + maternalAge$`Age group (years)` == "30 and over", + "#763181FF", + "#025F79FF" + ), + nudge_x = ifelse( + maternalAge$`Number of live births` == maternalAge$Max, + 3000, + -3000 + ), + hjust = ifelse( + maternalAge$`Number of live births` == maternalAge$Max, + 0, + 1 + ), + fontface = "bold", + size = 4.7 + ) + + scale_color_manual(values = c("#8f519a", "#1380A1")) + + scale_y_reverse(breaks = seq(2004, 2023, 1)) + + scale_x_continuous( + limits = c(225000, 400000), + breaks = seq(225000, 400000, 25000) + ) + + coord_cartesian( + xlim = c(230000, 383000), + ylim = c(2023, 2004.5) + ) + + labs( + title = "Number of live births by age group of mothers", + subtitle = "Blue dot shows age group of \"Under 30\". Purple dot + shows age group of \"30 and Over\".", + x = "Number of Live Births", + y = "Year" + ) + + theme_minimal() + + theme( + legend.position = "none", + panel.grid.major.x = element_line(color = "grey93"), + panel.grid.minor.x = element_line(linetype = 2, color = "grey85"), + panel.grid.major.y = element_line(color = "grey95"), + panel.grid.minor.y = element_blank(), + plot.title = element_text(face = "bold", size = 26, hjust = 0, margin = margin(b = 0)), + plot.subtitle = element_markdown(size = 18, hjust = 0, margin = margin(b = 15), color = "grey35"), + axis.title.x = element_text(size = 20, margin = margin(t = 15)), + axis.text.x = element_text(face = "bold", size = 16), + axis.title.y = element_text(size = 22, margin = margin(r = 15)), + axis.text.y = element_text(face = "bold", size = 15), + plot.margin = margin(l = 15, r = 0, b = 10, t = 25) + ) + +# Visualise the gap between age groups and store it to gapPlot variable +gapPlot <- + maternalAgeFlip %>% + ggplot(aes(x = Gap, y = Year)) + + geom_text( + aes( + x = 0, + label = ifelse(Gap < 0, Gap, paste0("+", Gap)) + ), + color = ifelse( + maternalAgeFlip$`Under 30` < maternalAgeFlip$`30 and over`, + "#991300FF", + "#097D5AFF" + ), + fontface = "bold", + size = 5 + ) + + labs( + title = "Gap between age groups", + subtitle = "( #Younger Group + - #Older Group )", + ) + + scale_y_reverse(breaks = seq(2004, 2023, 1)) + + coord_cartesian( + xlim = c(-.05, 0.05), + ylim = c(2023, 2004.5) + ) + + theme_minimal() + + theme( + legend.position = "none", + plot.title = element_text(face = "bold", size = 18, hjust = 0.5, margin = margin(t = 7, b = 11)), + plot.subtitle = element_markdown(size = 12, hjust = 0.5, margin = margin(b = 13), color = "black"), + panel.grid.major.x = element_blank(), + panel.grid.minor.x = element_blank(), + panel.grid.minor.y = element_blank(), + panel.grid.major.y = element_line(color = "grey93"), + axis.text.y = element_blank(), + axis.title.y = element_blank(), + axis.text.x = element_blank(), + axis.title.x = element_blank(), + plot.margin = margin(l = 0, r = 0, b = 0, t = 0), + ) + +# Generate the final chart by combining mainPlot (on the left) and gapPlot (on the right) +mainPlot + + gapPlot + + plot_layout(design = + c( + area(l = 0, r = 50, t = 0, b = 1), # defines the main figure area + area(l = 51, r = 61, t = 0, b = 1) # defines the gap figure area + )) diff --git a/lollipop_chart.R b/lollipop_chart.R new file mode 100644 index 0000000..780e121 --- /dev/null +++ b/lollipop_chart.R @@ -0,0 +1,105 @@ +# ======================================= Load Libraries and Data ======================================= + +# Import required libraries +library(tidyverse) +library(readxl) + +# Load data from file 2023birthregistrations.xlsx from tab "Table_12" and start read the data from the sixth row +dailyLiveBirths <- read_excel("2023birthregistrations.xlsx", sheet = "Table_12", skip = 5) + +# ======================================= Data Pre-Processing ======================================= + +# Remove the column for years before 10 years ago +dailyLiveBirths <- dailyLiveBirths %>% select(3:12) + +# Handle missing values by replacing all value of "[z]" with "0" +dailyLiveBirths <- dailyLiveBirths %>% mutate(across(everything(), ~str_replace(., "\\[z\\]", "0"))) + +# Convert all data to numeric class +dailyLiveBirths <- dailyLiveBirths %>% mutate(across(everything(), as.numeric)) + +# Calculate the total number of live births for each year +yearlyLiveBirths <- dailyLiveBirths %>% summarise(across(everything(), sum)) + +# Reorder the column name ascendingly from 2014 to 2023 +yearlyLiveBirths <- yearlyLiveBirths[, order(colnames(yearlyLiveBirths))] + +# Transform the structure to table with two columns: Year and LiveBirths +yearlyLiveBirths <- yearlyLiveBirths %>% + pivot_longer(cols = everything(), names_to = "Year", values_to = "LiveBirths") + +# Calculate the mean live births across all 10 years +meanLiveBirths <- mean(yearlyLiveBirths$LiveBirths) + +# Add new column to assign different color based on +# whether the number of live birth is above or below the mean +yearlyLiveBirths <- yearlyLiveBirths %>% + mutate(Color = ifelse(LiveBirths > meanLiveBirths, "#f8766d", "#619cff")) + +# ======================================= Data Visualisation ======================================= + +# Generate lollipop chart +yearlyLiveBirths %>% + ggplot(aes(x = Year, y = LiveBirths)) + + geom_segment( + aes( + x = Year, + xend = Year, + y = meanLiveBirths, + yend = LiveBirths + ), + color = ifelse(yearlyLiveBirths$Year == "2023", "#BF2F24", + ifelse(yearlyLiveBirths$LiveBirths > meanLiveBirths, "#4f6f7c", "#f8766d")), + alpha = 0.8, + linewidth = ifelse(yearlyLiveBirths$Year == "2023", 2.5, 1.8) + ) + + geom_point( + shape = 21, + color = "#ffffff", + fill = ifelse(yearlyLiveBirths$Year == "2023", "#BF2F24", "#69b3a2"), + size = ifelse(yearlyLiveBirths$Year == "2023", 13, 10.5), + stroke = ifelse(yearlyLiveBirths$Year == "2023", 3.5, 2.5) + ) + + geom_text( + aes(label = ifelse(Year == "2023", "", LiveBirths)), + fontface = "bold", + size = 5, + nudge_y = ifelse(yearlyLiveBirths$LiveBirths > meanLiveBirths, 5500, -5500) + ) + + geom_hline( + yintercept = meanLiveBirths, + color = "#00ba38", + linewidth = 1.3, + linetype = 5 + ) + + annotate( + "text", + x = grep("2023", yearlyLiveBirths$Year) - 4.51, + y = yearlyLiveBirths$LiveBirths[which(yearlyLiveBirths$Year == "2023")], + label = paste( + "Lowest number of live births in the last 10 years:", + yearlyLiveBirths$LiveBirths[which(yearlyLiveBirths$Year == "2023")], + sep = " " + ), + color = "#BF2F24", + size = 7, + fontface = "bold", + hjust = 0 + ) + + labs( + title = "Yearly number of live births in England and Wales", + x = "Year", + y = "Number of Live Births" + ) + + theme_minimal() + + theme( + panel.grid.major.x = element_blank(), + panel.grid.major.y = element_line(color = "grey90"), + panel.grid.minor.y = element_line(linetype = 3, color = "grey55"), + plot.title = element_text(face = "bold", size = 30, hjust = 0.5, margin = margin(b = 15)), + axis.title.x = element_text(size = 24, margin = margin(t = 15)), + axis.text.x = element_text(size = 18), + axis.title.y = element_text(size = 26, margin = margin(r = 20)), + axis.text.y = element_text(size = 16.5), + plot.margin = margin(l = 25, r = 5, b = 15, t = 30) + )