Add datasets and R code files

calvin-wirawan · Jan 16, 2025 · f401e9f · f401e9f
1 parent 85a4894
commit f401e9f
Show file tree

Hide file tree

Showing 9 changed files with 568 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -20,6 +20,7 @@
 
 # RStudio files
 .Rproj.user/
+*.Rproj
 
 # produced vignettes
 vignettes/*.html
@@ -47,3 +48,7 @@ po/*~
 
 # RStudio Connect folder
 rsconnect/
+
+# IntelliJ IDEA and macOS files
+.idea/
+.DS_Store
diff --git a/2023birthregistrations.xlsx b/2023birthregistrations.xlsx
diff --git a/2023birthsbyparentscountryofbirth.xlsx b/2023birthsbyparentscountryofbirth.xlsx
diff --git a/INSTRUCTION.md b/INSTRUCTION.md
@@ -0,0 +1,11 @@
+## Instructions on running the code
+
+1. Install the R programming language
+2. Install an IDE that supports the R programming language (e.g., RStudio)
+3. Clone this GitHub repository to your local environment
+4. With your preferred IDE, open the directory where you cloned the repository[^1]
+5. Install the required R packages
+6. Run the commands in any of the four R script files from top to bottom[^2]
+
+[^1]: Make sure that you do not change the folder structure  
+[^2]: Each R script file runs independently
diff --git a/box_plot.R b/box_plot.R
@@ -0,0 +1,111 @@
+# ======================================= Load Libraries and Data =======================================
+
+# Import required libraries
+library(tidyverse)
+library(readxl)
+library(ggstatsplot)
+library(MetBrewer)
+
+# England and Wales, 2023: sheet "Table_9" of 2023birthregistrations.xlsx, skipping the first five rows
+imdStillBirth2023EW <-
+	read_excel(path = "2023birthregistrations.xlsx", sheet = "Table_9", skip = 5)
+
+# England, 2010 to 2022: sheet "21" of cim2022deathcohortworkbook.xlsx, skipping the first seven rows
+imdStillBirth2022E <-
+	read_excel(path = "cim2022deathcohortworkbook.xlsx", sheet = "21", skip = 7)
+
+# Wales, 2010 to 2022: sheet "25" of cim2022deathcohortworkbook.xlsx, skipping the first nine rows
+imdStillBirth2022W <-
+	read_excel(path = "cim2022deathcohortworkbook.xlsx", sheet = "25", skip = 9)
+
+# ======================================= Data Pre-Processing =======================================
+
+# Add Year = 2023 to the 2023 table and rename its "IMD Decile" column to IMD so that it
+# has the same column names as the 2010 to 2022 tables
+imdStillBirth2023EW <- imdStillBirth2023EW %>%
+	mutate(Year = 2023) %>%
+	rename(IMD = `IMD Decile`)
+
+# Keep only the columns Year, IMD and Stillbirths
+imdStillBirth2023EW <- imdStillBirth2023EW %>% select(Year, IMD, Stillbirths)
+imdStillBirth2022E <- imdStillBirth2022E %>% select(Year, IMD, Stillbirths)
+imdStillBirth2022W <- imdStillBirth2022W %>% select(Year, IMD, Stillbirths)
+
+# Stack the three tables into one
+imdStillBirth <- rbind(imdStillBirth2022E, imdStillBirth2022W, imdStillBirth2023EW)
+
+# Drop the aggregate rows ("All deciles" / "Total"), then convert IMD to numeric. IMD is named
+# explicitly rather than addressed by column position, so a change in column order cannot convert the wrong column
+imdStillBirth <- imdStillBirth %>%
+	filter(IMD != "All deciles" & IMD != "Total") %>%
+	mutate(IMD = as.numeric(IMD))
+
+# Total Stillbirths per Year and IMD; .groups = "drop" returns an ungrouped table (not one still grouped by Year)
+imdStillBirth <- imdStillBirth %>%
+	group_by(Year, IMD) %>%
+	summarise(Stillbirths = sum(Stillbirths), .groups = "drop")
+
+# ======================================= Data Visualisation =======================================
+
+# Box, violin and jitter plot of Stillbirths per IMD value, with a short horizontal segment at each median
+ggbetweenstats(
+	data = imdStillBirth,
+	x = IMD,
+	y = Stillbirths,
+	title = "Number of stillbirths by IMD decile",
+	xlab = "Index of Multiple Deprivation",
+	ylab = "Number of Stillbirths",
+	package = "MetBrewer",
+	palette = "Redon",
+	type = "np", # non-parametric
+	centrality.point.args = list(size = 0), # size 0: the built-in centrality point is not visible
+	point.args = list(
+		position = position_jitterdodge(dodge.width = 0.7),
+		alpha = 0.7,
+		size = 3.5,
+		stroke = 0
+	),
+	boxplot.args = list(
+		width = 0.2,
+		alpha = 0.3,
+		fill = "grey85",
+		colour = "black",
+		linewidth = 0.7
+	),
+	violin.args = list(
+		width = 0.67,
+		alpha = 0.1,
+		colour = "grey30",
+		linetype = 5
+	),
+	partial = FALSE,
+	results.subtitle = FALSE # no statistical-results subtitle
+) +
+	geom_segment(
+		data = imdStillBirth %>%
+			group_by(IMD) %>%
+			summarise(median = median(Stillbirths)), # one median per IMD value
+		aes(
+			x = IMD - 0.1, # the segment spans 0.1 either side of IMD
+			xend = IMD + 0.1,
+			y = median,
+			yend = median
+		),
+		colour = "#BF2F24",
+		linewidth = 1.3 # linewidth, not size: size for lines is deprecated since ggplot2 3.4.0
+	) +
+	coord_cartesian(
+		ylim = c(100, 670),
+		xlim = c(1, 10.1)
+	) +
+	theme(
+		panel.grid.major.x = element_line(color = "grey95"),
+		panel.grid.major.y = element_line(color = "grey90"),
+		panel.grid.minor.y = element_line(linetype = 3, color = "grey50"),
+		plot.title = element_text(face = "bold", size = 30, hjust = 0.5, margin = margin(b = 20)),
+		axis.title.x = element_text(size = 22, margin = margin(t = 20)),
+		axis.text.x = element_text(face = "bold", size = 15),
+		axis.title.y = element_text(size = 22, margin = margin(r = 20)),
+		axis.text.y = element_text(face = "bold", size = 15),
+		plot.margin = margin(l = 25, r = -8, b = 15, t = 25)
+	)
diff --git a/butterfly_chart.R b/butterfly_chart.R
@@ -0,0 +1,156 @@
+# ======================================= Load Libraries and Data =======================================
+
+# Import required libraries
+library(tidyverse)
+library(readxl)
+library(reshape2)
+library(ggtext)
+
+# Read sheet "Table_2a" of 2023birthsbyparentscountryofbirth.xlsx, skipping the first eight rows
+parentsCountryOfBirth <-
+	read_excel(path = "2023birthsbyparentscountryofbirth.xlsx", sheet = "Table_2a", skip = 8)
+
+# ======================================= Data Pre-Processing =======================================
+
+# Reshape the first data row into a long table with columns Year, Country of Birth and Percentage,
+# then split it into one table per country-of-birth group for the value labels in the chart
+
+# Keep only the first row
+parentsCountryOfBirth <- parentsCountryOfBirth[1, ]
+
+# Years to plot: every fifth year from 2023 back to 2003
+selectedYears <- 2023 - 5 * (0:4)
+
+# Regular expression matching any of the selected years
+yearPattern <- paste(selectedYears, collapse = "|")
+
+# Names of the "Percentage of all live births" columns that mention one of the selected years
+selectedColumnName <- colnames(parentsCountryOfBirth) %>%
+	str_subset("Percentage of all live births") %>%
+	str_subset(yearPattern)
+
+# Take each column's year from its own name rather than assuming that the columns
+# appear in the same order as selectedYears
+columnYears <- as.numeric(str_extract(selectedColumnName, yearPattern))
+
+# Stop early if the sheet does not provide exactly one column for each selected year
+if (!identical(sort(columnYears), sort(selectedYears))) {
+	stop("Expected exactly one percentage column for each selected year", call. = FALSE)
+}
+
+# Keep only the selected columns and turn the single row into a one-column matrix
+parentsCountryOfBirth <- parentsCountryOfBirth %>%
+	select(all_of(selectedColumnName)) %>%
+	t()
+
+# Name the column "Non_UK" and number the rows
+colnames(parentsCountryOfBirth) <- "Non_UK"
+rownames(parentsCountryOfBirth) <- seq_along(selectedColumnName)
+
+# Convert the matrix to a data frame
+parentsCountryOfBirth <- as.data.frame(parentsCountryOfBirth)
+
+# Add Year, convert Non_UK to numeric rounded to 1 decimal place, and derive UK.
+# UK is stored as Non_UK - 100, i.e. the UK percentage with a negative sign; the chart
+# code relies on that sign, and countryUK below takes the absolute value for its labels
+parentsCountryOfBirth <- parentsCountryOfBirth %>%
+	mutate(
+		Year = columnYears,
+		Non_UK = round(as.numeric(Non_UK), 1),
+		UK = Non_UK - 100
+	)
+
+# Reshape to long format: one row per Year and country-of-birth group
+parentsCountryOfBirth <- melt(parentsCountryOfBirth, id.vars = "Year")
+
+# Sort by Year and give the columns their final names
+parentsCountryOfBirth <- parentsCountryOfBirth %>%
+	arrange(Year)
+colnames(parentsCountryOfBirth) <- c("Year", "Country of Birth", "Percentage")
+
+# Rows of the "Non_UK" group
+countryNonUK <- subset(parentsCountryOfBirth, `Country of Birth` == "Non_UK")
+
+# Rows of the "UK" group, with Percentage made positive
+countryUK <- subset(parentsCountryOfBirth, `Country of Birth` == "UK")
+countryUK$Percentage <- abs(countryUK$Percentage)
+
+# ======================================= Data Visualisation =======================================
+
+# Butterfly chart: UK bars extend to the left of the central year labels, Non_UK bars to the right
+ggplot(parentsCountryOfBirth, aes(x = Year, color = `Country of Birth`)) +
+	geom_linerange(
+		data = parentsCountryOfBirth[parentsCountryOfBirth$`Country of Birth` == "UK",],
+		aes(ymin = -2, ymax = -2 + `Percentage` + 66), # UK Percentage is negative here; -2 stands for 66%
+		linewidth = 20
+	) +
+	geom_linerange(data = parentsCountryOfBirth[parentsCountryOfBirth$`Country of Birth` == "Non_UK",],
+		aes(ymin = 2, ymax = 2 + `Percentage` - 16), # 2 stands for 16%
+		linewidth = 20
+	) +
+	geom_label(
+		aes(x = Year, y = 0, label = Year), # year labels in the gap between the two sets of bars
+		inherit.aes = FALSE,
+		fontface = "bold",
+		size = 8,
+		label.padding = unit(0.0, "lines"),
+		label.size = 0,
+		fill = "#ffffff",
+		color = "black"
+	) +
+	geom_text(
+		data = countryNonUK,
+		aes(x = Year, y = 2, label = paste0(Percentage, "%")),
+		nudge_y = 0.37,
+		family = "Arial Narrow",
+		fontface = "bold",
+		colour = "white",
+		hjust = 0,
+		size = 6.5
+	) +
+	geom_text(
+		data = countryUK,
+		aes(x = Year, y = -2, label = paste0(Percentage, "%")),
+		nudge_y = -0.37,
+		family = "Arial Narrow",
+		fontface = "bold",
+		colour = "white",
+		hjust = 1,
+		size = 6.5
+	) +
+	scale_color_manual(
+		name = "",
+		values = c(`UK` = "#7B2C3CFF", `Non_UK` = "#294F5EFF")
+		# no labels argument: the default labels are the level names, so they cannot be mismatched
+	) +
+	scale_x_reverse(
+		breaks = c(seq(2003, 2023, 5))
+	) +
+	scale_y_continuous(
+		limits = c(-17.8, 17.8),
+		breaks = c(c(-16, -12, -8, -4, 0) + -2, c(0, 4, 8, 12, 16) + 2),
+		labels = c("82", "78", "74", "70", "66", "16", "20", "24", "28", "32")
+	) +
+	coord_flip() +
+	labs(
+		title = "Live birth percentage by mother's country of birth",
+		subtitle = "<b><span style='color:#7B2C3CFF '>Red bar</span></b> represents <span
+		style='color:black'><i>\"UK\"</i></span> countries. <b><span style='color:#294F5EFF'>Blue bar</span></b>
+		represents <span style='color:black'><i>\"Non-UK\"</i></span> countries.",
+		x = "Year", # x is mapped to Year; both axis titles are hidden by axis.title below
+		y = "Percentage of Live Births"
+	) +
+	theme_minimal() +
+	theme(
+		legend.position = "none",
+		plot.title = element_text(face = "bold", size = 28, hjust = 0, margin = margin(l = 55, b = 12)),
+		plot.subtitle = element_markdown(size = 19, hjust = 0, margin = margin(l = 55, b = 23), color = "grey35"),
+		panel.grid.major.x = element_line(linetype = 5, color = "grey83"),
+		panel.grid.minor.x = element_blank(),
+		panel.grid.major.y = element_blank(),
+		panel.grid.minor.y = element_blank(),
+		axis.title = element_blank(),
+		axis.text.x = element_text(face = "bold", size = 18.5, color = "black", margin = margin(t = 15)),
+		axis.text.y = element_blank(),
+		plot.margin = margin(l = 0, r = 0, b = 20, t = 30)
+	)
diff --git a/cim2022deathcohortworkbook.xlsx b/cim2022deathcohortworkbook.xlsx