Skip to content

Commit

Permalink
Add datasets and R code files
Browse files Browse the repository at this point in the history
  • Loading branch information
calvin-wirawan committed Jan 16, 2025
1 parent 85a4894 commit f401e9f
Show file tree
Hide file tree
Showing 9 changed files with 568 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

# RStudio files
.Rproj.user/
*.Rproj

# produced vignettes
vignettes/*.html
Expand Down Expand Up @@ -47,3 +48,7 @@ po/*~

# RStudio Connect folder
rsconnect/

# Intellij Idea files
.idea/
.DS_Store
Binary file added 2023birthregistrations.xlsx
Binary file not shown.
Binary file added 2023birthsbyparentscountryofbirth.xlsx
Binary file not shown.
11 changes: 11 additions & 0 deletions INSTRUCTION.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
## Instructions on running the code

1. Install the R programming language
2. Install an IDE that supports the R programming language (e.g., RStudio)
3. Clone this GitHub repository to your local environment
4. With your preferred IDE, open the directory where you cloned the repository[^1]
5. Install the required R packages
6. Run the commands in any of the four R script files from top to bottom[^2]

[^1]: Make sure that you do not change the folder structure; the scripts load the data files by relative path.
[^2]: Each R script file runs independently of the others.
111 changes: 111 additions & 0 deletions box_plot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# ======================================= Load Libraries and Data =======================================

# Import required libraries
library(tidyverse)
library(readxl)
library(ggstatsplot)
library(MetBrewer)

# Load sheet "Table_9" of 2023birthregistrations.xlsx; skip = 5 makes reading
# start at the sixth row
# This is England and Wales data in 2023
imdStillBirth2023EW <- read_excel(
  "2023birthregistrations.xlsx",
  sheet = "Table_9",
  skip = 5
)

# Load sheet "21" of cim2022deathcohortworkbook.xlsx; skip = 7 makes reading
# start at the eighth row
# This is England data from 2010 to 2022
imdStillBirth2022E <- read_excel(
  "cim2022deathcohortworkbook.xlsx",
  sheet = "21",
  skip = 7
)

# Load sheet "25" of cim2022deathcohortworkbook.xlsx; skip = 9 makes reading
# start at the tenth row
# This is Wales data from 2010 to 2022
imdStillBirth2022W <- read_excel(
  "cim2022deathcohortworkbook.xlsx",
  sheet = "25",
  skip = 9
)

# ======================================= Data Pre-Processing =======================================

# The 2023 table has no Year column and names its decile column "IMD Decile":
# add Year = 2023, rename the decile column to IMD, then keep only the three
# columns shared by all tables
imdStillBirth2023EW <- imdStillBirth2023EW %>%
  mutate(Year = 2023) %>%
  rename(IMD = `IMD Decile`) %>%
  select(Year, IMD, Stillbirths)

# Drop all columns besides Year, IMD and Stillbirths in the 2010-2022 tables
imdStillBirth2022E <- imdStillBirth2022E %>% select(Year, IMD, Stillbirths)
imdStillBirth2022W <- imdStillBirth2022W %>% select(Year, IMD, Stillbirths)

# Merge the data from all three tables
imdStillBirth <- rbind(
  imdStillBirth2022E,
  imdStillBirth2022W,
  imdStillBirth2023EW
)

imdStillBirth <- imdStillBirth %>%
  # Remove the total rows ("All deciles" or "Total" in column IMD)
  filter(IMD != "All deciles" & IMD != "Total") %>%
  # Convert IMD to numeric; selected by name so the conversion does not depend
  # on the column order
  mutate(IMD = as.numeric(IMD)) %>%
  # Sum the stillbirths for each combination of Year and IMD
  group_by(Year, IMD) %>%
  # .groups = "drop" returns an ungrouped table; without it the result stays
  # grouped by Year and every later verb silently works per year
  summarise(Stillbirths = sum(Stillbirths), .groups = "drop")

# ======================================= Data Visualisation =======================================

# Median number of stillbirths per IMD decile, drawn below as a red segment
# across each box
medianStillBirthByImd <- imdStillBirth %>%
  group_by(IMD) %>%
  summarise(median = median(Stillbirths))

# Generate combination of box plot, violin plot and jitter plot
ggbetweenstats(
  data = imdStillBirth,
  x = IMD,
  y = Stillbirths,
  title = "Number of stillbirths by IMD decile",
  xlab = "Index of Multiple Deprivation",
  ylab = "Number of Stillbirths",
  package = "MetBrewer",
  palette = "Redon",
  type = "np",
  # Size 0 hides the built-in centrality point; the median is drawn by the
  # geom_segment() layer instead
  centrality.point.args = list(size = 0),
  point.args = list(
    position = position_jitterdodge(dodge.width = 0.7),
    alpha = 0.7,
    size = 3.5,
    stroke = 0
  ),
  boxplot.args = list(
    width = 0.2,
    alpha = 0.3,
    fill = "grey85",
    colour = "black",
    linewidth = 0.7
  ),
  violin.args = list(
    width = 0.67,
    alpha = 0.1,
    colour = "grey30",
    linetype = 5
  ),
  partial = FALSE,
  results.subtitle = FALSE
) +
  geom_segment(
    data = medianStillBirthByImd,
    # The segment spans 0.1 either side of the decile position, i.e. the full
    # box width of 0.2 set in boxplot.args
    aes(
      x = IMD - 0.1,
      xend = IMD + 0.1,
      y = median,
      yend = median
    ),
    colour = "#BF2F24",
    # linewidth replaces the size aesthetic for lines (deprecated in
    # ggplot2 3.4.0) and matches boxplot.args above
    linewidth = 1.3
  ) +
  coord_cartesian(
    ylim = c(100, 670),
    xlim = c(1, 10.1)
  ) +
  theme(
    panel.grid.major.x = element_line(color = "grey95"),
    panel.grid.major.y = element_line(color = "grey90"),
    panel.grid.minor.y = element_line(linetype = 3, color = "grey50"),
    plot.title = element_text(
      face = "bold",
      size = 30,
      hjust = 0.5,
      margin = margin(b = 20)
    ),
    axis.title.x = element_text(size = 22, margin = margin(t = 20)),
    axis.text.x = element_text(face = "bold", size = 15),
    axis.title.y = element_text(size = 22, margin = margin(r = 20)),
    axis.text.y = element_text(face = "bold", size = 15),
    plot.margin = margin(l = 25, r = -8, b = 15, t = 25)
  )
156 changes: 156 additions & 0 deletions butterfly_chart.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# ======================================= Load Libraries and Data =======================================

# Import required libraries
library(tidyverse)
library(readxl)
library(reshape2)
library(ggtext)

# Load sheet "Table_2a" of 2023birthsbyparentscountryofbirth.xlsx; skip = 8
# makes reading start at the ninth row
parentsCountryOfBirth <- read_excel(
  "2023birthsbyparentscountryofbirth.xlsx",
  sheet = "Table_2a",
  skip = 8
)

# ======================================= Data Pre-Processing =======================================

# Remove all but first row
parentsCountryOfBirth <- parentsCountryOfBirth[1, ]

# Years to plot: 2023 down to 2003 in steps of 5
selectedYears <- 2023 - 5 * (0:4)
yearPattern <- paste(selectedYears, collapse = "|")

# Names of the columns that contain "Percentage of all live births" and one of
# the selected years
selectedColumnName <- colnames(parentsCountryOfBirth) %>%
  str_subset("Percentage of all live births") %>%
  str_subset(yearPattern)

# Take each year from its own column name instead of assuming that the columns
# appear in the same order as selectedYears
selectedColumnYear <- as.numeric(str_extract(selectedColumnName, yearPattern))

# Stop early when the sheet does not provide exactly one column per selected
# year, rather than building a mislabelled table
if (length(selectedColumnName) != length(selectedYears) ||
  !setequal(selectedColumnYear, selectedYears)) {
  stop(
    "Expected exactly one 'Percentage of all live births' column for each of: ",
    paste(selectedYears, collapse = ", "),
    call. = FALSE
  )
}

# Keep only the selected columns and flip them to become rows
parentsCountryOfBirth <- parentsCountryOfBirth %>%
  select(all_of(selectedColumnName)) %>%
  t()

# Rename column name to "Non_UK" and number the rows from 1
colnames(parentsCountryOfBirth) <- "Non_UK"
rownames(parentsCountryOfBirth) <- seq_along(selectedColumnName)

# Convert above matrix to data frame and derive the remaining columns
parentsCountryOfBirth <- as.data.frame(parentsCountryOfBirth) %>%
  mutate(
    Year = selectedColumnYear,
    # Numeric percentage rounded to 1 decimal place
    Non_UK = round(as.numeric(Non_UK), 1),
    # "UK" is stored as Non_UK - 100, i.e. as a negative percentage, so that
    # its bar is drawn on the opposite side of the butterfly chart
    UK = Non_UK - 100
  )

# Reshape from wide to long: one row per Year and country-of-birth group,
# sorted by Year
parentsCountryOfBirth <- parentsCountryOfBirth %>%
  melt(id.vars = "Year") %>%
  arrange(Year)

# Update column name
colnames(parentsCountryOfBirth) <- c("Year", "Country of Birth", "Percentage")

# Create new table that only consists of data with "Non_UK" countries
countryNonUK <- parentsCountryOfBirth %>%
  filter(`Country of Birth` == "Non_UK")

# Create new table that only consists of data with "UK" countries; the
# percentage is made positive because this table is used for the text labels
countryUK <- parentsCountryOfBirth %>%
  filter(`Country of Birth` == "UK") %>%
  mutate(Percentage = abs(Percentage))

# ======================================= Data Visualisation =======================================

# Generate butterfly chart
# The y scale is mirrored and truncated: per the breaks/labels below, y = -2 is
# labelled 66 for the "UK" side and y = 2 is labelled 16 for the "Non_UK" side.
# The gap between -2 and 2 holds the year labels.
ggplot(parentsCountryOfBirth, aes(x = Year, color = `Country of Birth`)) +
  # "UK" percentages are negative in parentsCountryOfBirth, so adding 66 gives
  # the (negative) bar length beyond the 66 baseline
  geom_linerange(
    data = filter(parentsCountryOfBirth, `Country of Birth` == "UK"),
    aes(ymin = -2, ymax = -2 + Percentage + 66),
    linewidth = 20
  ) +
  # "Non_UK" bars start at the 16 baseline
  geom_linerange(
    data = filter(parentsCountryOfBirth, `Country of Birth` == "Non_UK"),
    aes(ymin = 2, ymax = 2 + Percentage - 16),
    linewidth = 20
  ) +
  # Year labels in the central gap; countryUK has one row per year, so each
  # label is drawn once instead of once per country-of-birth group
  geom_label(
    data = countryUK,
    aes(x = Year, y = 0, label = Year),
    inherit.aes = FALSE,
    fontface = "bold",
    size = 8,
    label.padding = unit(0.0, "lines"),
    label.size = 0,
    fill = "#ffffff",
    color = "black"
  ) +
  # Percentage text at the inner end of each "Non_UK" bar
  geom_text(
    data = countryNonUK,
    aes(x = Year, y = 2, label = paste0(Percentage, "%")),
    nudge_y = 0.37,
    family = "Arial Narrow",
    fontface = "bold",
    colour = "white",
    hjust = 0,
    size = 6.5
  ) +
  # Percentage text at the inner end of each "UK" bar
  geom_text(
    data = countryUK,
    aes(x = Year, y = -2, label = paste0(Percentage, "%")),
    nudge_y = -0.37,
    family = "Arial Narrow",
    fontface = "bold",
    colour = "white",
    hjust = 1,
    size = 6.5
  ) +
  scale_color_manual(
    name = "",
    values = c(UK = "#7B2C3CFF", Non_UK = "#294F5EFF"),
    # Named labels are matched to the groups by name; the previous positional
    # labels were swapped relative to the factor levels (Non_UK, UK)
    labels = c(UK = "UK", Non_UK = "Non_UK")
  ) +
  scale_x_reverse(
    breaks = c(seq(2003, 2023, 5))
  ) +
  scale_y_continuous(
    limits = c(-17.8, 17.8),
    breaks = c(c(-16, -12, -8, -4, 0) + -2, c(0, 4, 8, 12, 16) + 2),
    labels = c("82", "78", "74", "70", "66", "16", "20", "24", "28", "32")
  ) +
  coord_flip() +
  labs(
    title = "Live birth percentage by mother's country of birth",
    subtitle = "<b><span style='color:#7B2C3CFF '>Red bar</span></b> represents <span
style='color:black'><i>\"UK\"</i></span> countries. <b><span style='color:#294F5EFF'>Blue bar</span></b>
represents <span style='color:black'><i>\"Non-UK\"</i></span> countries.",
    # Axis titles are hidden by axis.title = element_blank() below; they are
    # kept consistent with the mapped aesthetics (x is Year)
    x = "Year",
    y = "Percentage of Live Births"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(
      face = "bold",
      size = 28,
      hjust = 0,
      margin = margin(l = 55, b = 12)
    ),
    plot.subtitle = element_markdown(
      size = 19,
      hjust = 0,
      margin = margin(l = 55, b = 23),
      color = "grey35"
    ),
    panel.grid.major.x = element_line(linetype = 5, color = "grey83"),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    axis.title = element_blank(),
    axis.text.x = element_text(
      face = "bold",
      size = 18.5,
      color = "black",
      margin = margin(t = 15)
    ),
    axis.text.y = element_blank(),
    plot.margin = margin(l = 0, r = 0, b = 20, t = 30)
  )
Binary file added cim2022deathcohortworkbook.xlsx
Binary file not shown.
Loading

0 comments on commit f401e9f

Please sign in to comment.