.Rhistory

Count = 0
)
# Iterate over tags and update Count using regex
for (i in seq_along(year_df$Tags)) {
tag <- year_df$Tags[i]
regex_pattern <- paste0("\\b", tag, "\\b")  # Use word boundaries to match whole tags
year_df$Count[i] <- sum(grepl(regex_pattern, DF$Manual.Tags[DF$Publication.Year == year]))
}
# Bind the dataframe to DDFF
DDFF <- rbind(DDFF, year_df)
}
# Reset row names
rownames(DDFF) <- NULL
# Stacked bar chart: one bar per publication year, one segment per tag.
# geom_col() plots the Count values as they are (no counting of rows).
gg_plot <- ggplot(DDFF, aes(x = Publication.Year, y = Count, fill = Tags)) +
  geom_col(position = "stack") +
  labs(
    title = "Distribution of tags throughout the years",
    x = "Publication Year",
    y = "Count"
  ) +
  theme_minimal()
print(gg_plot)
# Save the chart to a PDF file
file_path <- file.path(output_folder, "tags_distributed_by_year.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)
# Export the per-year tag counts shown in the chart to a CSV file
file_path <- file.path(output_folder, "tags_distributed_by_year.csv")
write.csv(DDFF, file = file_path, row.names = FALSE)
# ANALYSIS : the distribution of 3 tags only
# Work on subset1_ZOTEROLIB
DF <- subset1_ZOTEROLIB
# Keep only the 3 tags of interest from the tag count table
small_set_tags <- tags_counts_df %>%
  filter(Tag %in% c("_therapy", "_mental disorder", "_well-being"))
# For every publication year, count the items whose Manual.Tags field matches
# each selected tag. One data frame is produced per year and they are bound
# together once at the end instead of growing DDFF inside a loop.
year_frames <- lapply(unique(DF$Publication.Year), function(year) {
  tags_in_year <- DF$Manual.Tags[DF$Publication.Year == year]
  data.frame(
    Publication.Year = rep(year, length(small_set_tags$Tag)),
    Tags = small_set_tags$Tag,
    Count = vapply(
      small_set_tags$Tag,
      # Word boundaries keep the tag from matching inside a longer word
      function(tag) sum(grepl(paste0("\\b", tag, "\\b"), tags_in_year)),
      numeric(1),
      USE.NAMES = FALSE
    )
  )
})
DDFF <- bind_rows(year_frames)
# Stacked bar chart: one bar per publication year, one segment per tag
gg_plot <- ggplot(DDFF, aes(x = Publication.Year, y = Count, fill = Tags)) +
  geom_col(position = "stack") +
  labs(
    title = "Distribution of 3 selected tags throughout the years",
    x = "Publication Year",
    y = "Count"
  ) +
  theme_minimal()
print(gg_plot)
# Save the chart to a PDF file
file_path <- file.path(output_folder, "small_set_tags_distributed_by_year.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)
# Export the counts to a CSV file (file name spelled like the PDF one)
file_path <- file.path(output_folder, "small_set_tags_distributed_by_year.csv")
write.csv(DDFF, file = file_path, row.names = FALSE)
# ANALYSIS : Number of authors per work
# Load dplyr library if not already loaded
if (!requireNamespace("dplyr", quietly = TRUE)) {
install.packages("dplyr")
}
library(dplyr)
# Function to count authors
# Count the authors listed in a ';'-separated author string.
#
# authors: character vector (typically a single string such as
#          "Doe, Jane; Roe, Richard"); when several strings are given the
#          authors of all of them are counted together.
# Returns an integer count. Missing values and blank entries (for example the
# empty piece left by "Doe, Jane; ") are not counted as authors.
count_authors <- function(authors) {
  # ';' is a literal separator, so match it as fixed text
  author_list <- unlist(strsplit(as.character(authors), ";", fixed = TRUE))
  author_list <- trimws(author_list)
  sum(!is.na(author_list) & nzchar(author_list))
}
DF <- subset1_ZOTEROLIB
# Add the number of authors of each work as a new column 'AuthorCount'.
# vapply() guarantees one unnamed integer per row, whatever the input.
DF <- DF %>%
  mutate(
    AuthorCount = vapply(Author, count_authors, integer(1), USE.NAMES = FALSE)
  )
# Number of items for each (Publication.Year, AuthorCount) pair.
# .groups = "drop" returns an ungrouped table.
bubble_data <- DF %>%
  group_by(Publication.Year, AuthorCount) %>%
  summarize(Number_of_Items = n(), .groups = "drop")
# Bubble chart: the size of each point is the number of items for that pair
gg_plot <- ggplot(
  bubble_data,
  aes(x = Publication.Year, y = AuthorCount, size = Number_of_Items)
) +
  geom_point(alpha = 0.7) +
  scale_size_continuous(range = c(3, 15)) +
  labs(
    title = "Bubble Chart - Number of Authors per work",
    x = "Publication Year",
    y = "Number of Authors per work",
    size = "Number of articles"
  )
print(gg_plot)
# Save the chart to a PDF file
file_path <- file.path(output_folder, "nb_author_per_work_per_year.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)
# Export the data behind the bubble chart to a CSV file
file_path <- file.path(output_folder, "nb_author_per_work_per_year.csv")
write.csv(bubble_data, file = file_path, row.names = FALSE)
## RECONCILIATION with WIKIDATA
# We are going to use OpenRefine and Wikidata to match the names of the journals with their
# respective QIDs and then retrieve the date of inception (creation date) of the journals,
# if that data is indexed in Wikidata
# 1. Install and open the software OpenRefine.
# 2. Create a new project with the file output/all_journal_titles_counts.csv
# 3. Reconcile Publication.Title (details not described, a tutorial here https://www.wikidata.org/wiki/User:Pmartinolli/Tutoriel_chercheur)
# 4. Add a new column based on this reconciled data : Qid
# 5. Add a new column based on this reconciled data : Inception
# 6. Add a new column based on this reconciled data : CountryOfOrigin
# 7. Add a new column based on CountryOfOrigin : Qid_CoO
# 8. Export the results into a file journal_titles_reconciled.csv and put that file in the working folder (at the same root level as "My library.csv")
# Load the reconciled journal titles
file_path <- "journal_titles_reconciled.csv"
# read.csv() is used instead of read.table(): it treats only double quotes as
# field quotes and has no comment character, so a journal title containing an
# apostrophe or a '#' is read intact.
journal_titles_reconciled <- read.csv(
  file_path,
  header = TRUE,
  encoding = "UTF-8"
)
# Turn the date in this format "2014-08-01T00:00:00Z" into "YYYY"
# install.packages("lubridate")
library(lubridate)
# Convert the 'Inception' column to a date-time object
journal_titles_reconciled$Inception <- ymd_hms(journal_titles_reconciled$Inception)
# Extract the year into a new column 'Year'
journal_titles_reconciled$Year <- year(journal_titles_reconciled$Inception)
# Add to ZOTEROLIB the year the journal was created (Publication.Inception)
# and a calculated field:
#   PubAfterInception = Publication.Year of the article - Publication.Inception
# It is supposed to reflect whether an article is published in a new journal
# or in an old traditional one.
#
# match() gives, for every item, the row of journal_titles_reconciled holding
# its journal title (NA when there is none), so the lookup needs no row loop
# and also works when ZOTEROLIB has no rows.
journal_row <- match(
  ZOTEROLIB$Publication.Title,
  journal_titles_reconciled$Publication.Title
)
ZOTEROLIB$Publication.Inception <-
  as.numeric(journal_titles_reconciled$Year[journal_row])
# Stays NA wherever Publication.Inception is NA
ZOTEROLIB$PubAfterInception <-
  ZOTEROLIB$Publication.Year - ZOTEROLIB$Publication.Inception
# Rebuild subset1 (ZOTEROLIB has changed since it was last built).
# A row is kept when all three conditions hold:
#   Item.Type is "journalArticle"
#   Manual.Tags contains "_peer reviewed"
#   Manual.Tags contains "_TTRPG"
subset1_ZOTEROLIB <- ZOTEROLIB[
  which(with(
    ZOTEROLIB,
    Item.Type == "journalArticle" &
      grepl("_peer reviewed", Manual.Tags) &
      grepl("_TTRPG", Manual.Tags)
  )),
  ,
  drop = FALSE
]
# Keep only the rows that have a PubAfterInception value
subset1_ZOTEROLIB <-
  subset1_ZOTEROLIB[!is.na(subset1_ZOTEROLIB$PubAfterInception), ]
# Bar plot of how many articles fall on each PubAfterInception value
ggplot(subset1_ZOTEROLIB, aes(x = factor(PubAfterInception))) +
  geom_bar() +
  theme_minimal() +
  labs(
    title = "Article's publication year following the establishment of the journal",
    x = "Years after creation",
    y = "Count"
  )
# Add to ZOTEROLIB the country of origin of the journal (CountryOfOrigin) and
# the Qid of that country (Qid_CoO), both taken from journal_titles_reconciled.
#
# match() gives, for every item, the row of journal_titles_reconciled holding
# its journal title (NA when there is none); indexing with it leaves NA for
# unmatched items and needs no row loop.
journal_row <- match(
  ZOTEROLIB$Publication.Title,
  journal_titles_reconciled$Publication.Title
)
ZOTEROLIB$CountryOfOrigin <- journal_titles_reconciled$CountryOfOrigin[journal_row]
ZOTEROLIB$Qid_CoO <- journal_titles_reconciled$Qid_CoO[journal_row]
# Rebuild subset1 once more (ZOTEROLIB has changed again).
# A row is kept when all three conditions hold:
#   Item.Type is "journalArticle"
#   Manual.Tags contains "_peer reviewed"
#   Manual.Tags contains "_TTRPG"
subset1_ZOTEROLIB <- ZOTEROLIB[
  which(with(
    ZOTEROLIB,
    Item.Type == "journalArticle" &
      grepl("_peer reviewed", Manual.Tags) &
      grepl("_TTRPG", Manual.Tags)
  )),
  ,
  drop = FALSE
]
# Keep only the rows that have a CountryOfOrigin value
subset1_ZOTEROLIB <-
  subset1_ZOTEROLIB[!is.na(subset1_ZOTEROLIB$CountryOfOrigin), ]
# Number of publications per country, without missing or empty country names
CountryOfOrigin_count <- subset1_ZOTEROLIB %>%
  group_by(CountryOfOrigin) %>%
  summarise(Count = n()) %>%
  filter(!is.na(CountryOfOrigin) & CountryOfOrigin != "")
# Horizontal bar plot of CountryOfOrigin_count, countries ordered by Count
gg_plot <- ggplot(
  CountryOfOrigin_count,
  aes(y = reorder(`CountryOfOrigin`, Count), x = Count)
) +
  geom_col(fill = "skyblue", width = 0.7) +
  labs(
    title = "Journal's country of origin",
    y = "Country of origin",
    x = "Count"
  ) +
  theme_minimal()
print(gg_plot)
# Save the chart to a PDF file
file_path <- file.path(output_folder, "journals_CountryOfOrigin.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)
# Export the counts as a CSV file with column names
file_path <- file.path(output_folder, "journals_CountryOfOrigin.csv")
write.csv(CountryOfOrigin_count, file = file_path, row.names = FALSE)
## Creating a word cloud
# based on the code by Céline Van den Rul at https://towardsdatascience.com/create-a-word-cloud-with-r-bde3e7422e8a
# Load wordcloud library if not already loaded
if (!requireNamespace("wordcloud", quietly = TRUE)) {
install.packages("wordcloud")
}
library(wordcloud)
# Load RColorBrewer library if not already loaded
if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
install.packages("RColorBrewer")
}
library(RColorBrewer)
# Load wordcloud2 library if not already loaded
if (!requireNamespace("wordcloud2", quietly = TRUE)) {
install.packages("wordcloud2")
}
library(wordcloud2)
# Load tm library if not already loaded
if (!requireNamespace("tm", quietly = TRUE)) {
install.packages("tm")
}
library(tm)
# The item titles are the text the word cloud is built from
text <- subset1_ZOTEROLIB$Title
# Wrap the titles in a tm corpus
docs <- Corpus(VectorSource(text))
# Clean the texts in a single pipeline, in this order: remove numbers, remove
# punctuation, collapse whitespace, lower-case, then remove English and French
# stop words
library(dplyr)
docs <- docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removeWords, stopwords("french"))
# Term-document matrix; the row sums give the frequency of each word
dtm <- TermDocumentMatrix(docs)
matrix <- as.matrix(dtm)
words <- sort(rowSums(matrix), decreasing = TRUE)
df <- data.frame(word = names(words), freq = words)
# Word cloud, method 1 (wordcloud); the seed is fixed before drawing
set.seed(1234)
wordcloud(
  words = df$word,
  freq = df$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)
# Word cloud, method 2 (wordcloud2)
wordcloud2(data = df, size = 1.6, color = "random-dark")
## Analysis : Thesis
# Load tidyr library if not already loaded
if (!requireNamespace("tidyr", quietly = TRUE)) {
install.packages("tidyr")
}
library(tidyr)
# Load dplyr library if not already loaded
if (!requireNamespace("dplyr", quietly = TRUE)) {
install.packages("dplyr")
}
library(dplyr)
# Theses whose Manual.Tags contain "_TTRPG"
subset2_ZOTEROLIB <- subset(
  ZOTEROLIB,
  Item.Type == "thesis" & grepl("_TTRPG", Manual.Tags)
)
# 1. Drop every thesis whose Type starts with B, R, D, H, C or S (per the
#    original note these are the Bachelor-level works).
# 2. Normalise Type into TypeNormalized: "PhD" when Type starts with T, P or D,
#    "Master" when it starts with M, otherwise Type unchanged.
# 3. Keep only the PhD and Master rows.
# NOTE(review): step 1 already removes every Type starting with "D", so the
# "D" in "^[TPD]" can never match. Confirm that types such as "Doctoral ..."
# are really meant to be dropped.
DF <- subset2_ZOTEROLIB %>%
  filter(!grepl("^[BRDHCS]", Type)) %>%
  mutate(TypeNormalized = case_when(
    grepl("^[TPD]", Type) ~ "PhD",
    grepl("^M", Type) ~ "Master",
    TRUE ~ Type
  )) %>%
  filter(TypeNormalized %in% c("PhD", "Master"))
# Stacked bar plot: number of theses per year, split by TypeNormalized
gg_plot <- ggplot(DF, aes(x = Publication.Year, fill = TypeNormalized)) +
  geom_bar(position = position_stack(reverse = TRUE), stat = "count") +
  labs(
    title = "Master and Doctoral Thesis Distributed by Year",
    x = "Year",
    y = "Count"
  ) +
  theme_minimal()
# After running the line below, the chart is displayed
print(gg_plot)
# Save the chart to a PDF file
file_path <- file.path(output_folder, "thesis_by_year.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)
# Export the data as a CSV file with one count column per thesis type.
# .groups = "drop" returns an ungrouped table, so nothing stays grouped
# by Publication.Year in the pivoted result.
DF_counts <- DF %>%
  group_by(Publication.Year, TypeNormalized) %>%
  summarise(Count = n(), .groups = "drop")
# Pivot the data to wide format (tidyr)
DF_counts_wide <- pivot_wider(
  DF_counts,
  names_from = TypeNormalized,
  values_from = Count
)
file_path <- file.path(output_folder, "thesis_by_year.csv")
write.csv(DF_counts_wide, file = file_path, row.names = FALSE)
# Load tidyverse library if not already loaded
if (!requireNamespace("tidyverse", quietly = TRUE)) {
install.packages("tidyverse")
}
library(tidyverse)
# PhD theses with a known number of pages
filtered_df_phd <- DF %>% filter(TypeNormalized == "PhD" & !is.na(Num.Pages))
# Master theses with a known number of pages
filtered_df_master <- DF %>% filter(TypeNormalized == "Master" & !is.na(Num.Pages))
# Average number of pages for each thesis type
average_num_pages_phd <- mean(filtered_df_phd$Num.Pages, na.rm = TRUE)
average_num_pages_master <- mean(filtered_df_master$Num.Pages, na.rm = TRUE)
# Scatter plot: Master theses around x = 1, PhD theses around x = 2 (x is
# jittered so the points do not overlap), with the two averages as labels.
# The labels use annotate() so each one is drawn exactly once; geom_text()
# fed with a data frame would draw the same label once per row, on top of
# each other.
gg_plot <- ggplot() +
  geom_point(
    data = filtered_df_phd,
    aes(x = jitter(rep(2, nrow(filtered_df_phd)), amount = 0.1), y = Num.Pages),
    color = "red"
  ) +
  annotate(
    "text",
    x = 1, y = average_num_pages_master,
    label = sprintf("Avg (Master): %.2f", average_num_pages_master),
    vjust = -0.5, hjust = -0.5, color = "darkgreen", size = 6
  ) +
  geom_point(
    data = filtered_df_master,
    aes(x = jitter(rep(1, nrow(filtered_df_master)), amount = 0.1), y = Num.Pages),
    color = "green"
  ) +
  annotate(
    "text",
    x = 2, y = average_num_pages_phd,
    label = sprintf("Avg (PhD): %.2f", average_num_pages_phd),
    vjust = -0.5, hjust = 1.3, color = "darkred", size = 6
  ) +
  labs(
    title = "Average Number of Pages for PhD and Master Thesis (excluding NA)",
    x = " ",
    y = "Number of Pages"
  )
print(gg_plot)
# Save the chart to a PDF file
file_path <- file.path(output_folder, "avg_nb_pages_per_thesis.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)
# Export the list of universities for another round of reconciliation with
# Wikidata, in order to retrieve their countries.
# The Publisher column of DF is the column being exported.
column_to_export <- DF$Publisher
# Put it in a one-column data frame named 'Universities'
df_to_export <- data.frame(Universities = column_to_export)
# Build the output path with file.path(), like the other exports of this script
file_name <- file.path(output_folder, "universities2reconcile.csv")
# Export the data frame to a CSV file
write.csv(df_to_export, file = file_name, row.names = FALSE)
# Open this csv with OpenRefine
# Duplicate the first column into a new column named UniversitiesWD
# Reconcile the UniversitiesWD column (to keep the first column identical and allow joining the data later)
# Add a new column based on the reconciled data : UniversitiesWD.QID
# Add a new column based on the reconciled data : Country
# Add a new column based on the reconciled data : Country.QID
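# Export the file as comma-separated values "universities-reconciled.csv", with the columns in this order
# (the file is read with header = FALSE, so the names below are assigned by position):
# "Universities", "UniversitiesWD", "UCountry", "UniversitiesWD.QID", "UCountry.QID"
# and put it at the root of the working folder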
# Load the reconciled universities
file_path <- "universities-reconciled.csv"
# NOTE(review): header = FALSE means a header line in the file, if any, is
# read as a data row; confirm the file is exported without one.
universities_reconciled <- read.csv(
  file_path,
  header = FALSE,
  col.names = c(
    "Universities", "UniversitiesWD", "UCountry",
    "UniversitiesWD.QID", "UCountry.QID"
  )
)
# Attach the reconciled columns to the theses (only theses whose Publisher is
# found among the universities are kept), then keep one row per Key
DF <- merge(
  DF, universities_reconciled,
  by.x = "Publisher", by.y = "Universities", all = FALSE
)
DF <- distinct(DF, Key, .keep_all = TRUE)
# Number of theses per country and thesis type, returned ungrouped
count_data <- DF %>%
  group_by(UCountry, TypeNormalized) %>%
  summarize(count = n(), .groups = "drop")
# Stacked bar plot, one bar per country. FUN = sum orders the countries by the
# total height of their bar; the default of reorder() would order them by the
# mean of their per-type counts instead.
gg_plot <- ggplot(
  count_data,
  aes(x = reorder(UCountry, count, FUN = sum), y = count, fill = TypeNormalized)
) +
  geom_col(position = position_stack(reverse = TRUE)) +
  labs(title = "Country Distribution", x = "Country", y = "Count") +
  coord_flip() # flip the x and y axes
print(gg_plot)
# Save the chart to a PDF file
file_path <- file.path(output_folder, "thesis_by_countries.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)
# Export the counts to a CSV file
file_path <- file.path(output_folder, "thesis_by_countries.csv")
write.csv(count_data, file = file_path, row.names = FALSE)
## Final export of the enriched ZOTEROLIB data frame into a csv
# Start again from the raw library export
file_name <- "My library.csv"
ZOTEROLIB <- read.table(file_name, header = TRUE, sep = ",", encoding = "UTF-8")
# Add the journal columns. A left join keeps exactly the library items: a full
# join would also add a row (with an NA Key) for every reconciled journal that
# matches no item.
ZOTEROLIB <- left_join(
  ZOTEROLIB, journal_titles_reconciled,
  by = c("Publication.Title" = "Publication.Title"),
  relationship = "many-to-many"
)
# Add the university columns, for the same reason with a left join
ZOTEROLIB <- left_join(
  ZOTEROLIB, universities_reconciled,
  by = c("Publisher" = "Universities"),
  relationship = "many-to-many"
)
# A title or university listed several times in the reconciled tables
# duplicates the matching items; keep one row per Key
ZOTEROLIB <- distinct(ZOTEROLIB, Key, .keep_all = TRUE)
# Export the data as a CSV file with column names
file_path <- file.path(output_folder, "My_library_enriched.csv")
write.csv(ZOTEROLIB, file = file_path, row.names = FALSE)
# ANALYSIS : the distribution of 4 tags only
# Start from subset1_ZOTEROLIB and from fresh results: without this the counts
# would be computed on whatever DF currently holds and appended to the rows
# already stored in DDFF by a previous analysis.
DF <- subset1_ZOTEROLIB
# Create a small set of 4 tags
small_set_tags <- tags_counts_df %>%
  filter(Tag %in% c("_therapy", "_mental disorder", "_well-being", "_anxiety"))
# For every publication year, count the items whose Manual.Tags field matches
# each selected tag. One data frame is produced per year and they are bound
# together once at the end.
year_frames <- lapply(unique(DF$Publication.Year), function(year) {
  tags_in_year <- DF$Manual.Tags[DF$Publication.Year == year]
  data.frame(
    Publication.Year = rep(year, length(small_set_tags$Tag)),
    Tags = small_set_tags$Tag,
    Count = vapply(
      small_set_tags$Tag,
      # Word boundaries keep the tag from matching inside a longer word
      function(tag) sum(grepl(paste0("\\b", tag, "\\b"), tags_in_year)),
      numeric(1),
      USE.NAMES = FALSE
    )
  )
})
DDFF <- bind_rows(year_frames)
# Stacked bar chart: one bar per publication year, one segment per tag
gg_plot <- ggplot(DDFF, aes(x = Publication.Year, y = Count, fill = Tags)) +
  geom_col(position = "stack") +
  labs(
    title = "Distribution of 4 selected tags throughout the years",
    x = "Publication Year",
    y = "Count"
  ) +
  theme_minimal()
print(gg_plot)
# Save the chart to a PDF file
file_path <- file.path(output_folder, "small_set_tags_distributed_by_year.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)
# Export the counts to a CSV file (written once, under the correctly spelled
# file name)
file_path <- file.path(output_folder, "small_set_tags_distributed_by_year.csv")
write.csv(DDFF, file = file_path, row.names = FALSE)
# ANALYSIS : the distribution of the "_moral panic" tag
# Single run of the analysis, starting from subset1_ZOTEROLIB and from fresh
# results so that nothing is appended to the rows of a previous analysis.
DF <- subset1_ZOTEROLIB
small_set_tags <- tags_counts_df %>%
  filter(Tag %in% c("_moral panic"))
# For every publication year, count the items whose Manual.Tags field matches
# each selected tag. One data frame is produced per year and they are bound
# together once at the end.
year_frames <- lapply(unique(DF$Publication.Year), function(year) {
  tags_in_year <- DF$Manual.Tags[DF$Publication.Year == year]
  data.frame(
    Publication.Year = rep(year, length(small_set_tags$Tag)),
    Tags = small_set_tags$Tag,
    Count = vapply(
      small_set_tags$Tag,
      # Word boundaries keep the tag from matching inside a longer word
      function(tag) sum(grepl(paste0("\\b", tag, "\\b"), tags_in_year)),
      numeric(1),
      USE.NAMES = FALSE
    )
  )
})
DDFF <- bind_rows(year_frames)
# Bar chart: one bar per publication year
gg_plot <- ggplot(DDFF, aes(x = Publication.Year, y = Count, fill = Tags)) +
  geom_col(position = "stack") +
  labs(
    title = "Distribution of the \"_moral panic\" tag throughout the years",
    x = "Publication Year",
    y = "Count"
  ) +
  theme_minimal()
print(gg_plot)
# Save the chart to a PDF file
# NOTE(review): this is the same file name as the chart of the 3 or 4 selected
# tags, so that PDF is overwritten; confirm this is intended.
file_path <- file.path(output_folder, "small_set_tags_distributed_by_year.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)
library(readr)
# NOTE(review): absolute, machine-specific path; the other inputs of this
# script are read relative to the working folder.
game_citation_data <- read_csv("C:/00-Yragatext/_working_directory/game_citation_data.csv")
# View() opens the data viewer, which only makes sense in an interactive
# session; skip it when the script is run non-interactively
if (interactive()) {
  View(game_citation_data)
}