-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtext_mining_dracula.R
112 lines (82 loc) · 2.69 KB
/
text_mining_dracula.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
##------------------------------------------------------------------------------------------##
## TEXT MINING: DRACULA (BRAM STOKER) ##
##------------------------------------------------------------------------------------------##
## R version 3.3.1 (2016-06-21)
#-------#
# Setup #
#-------#
# Install pacman on first use. requireNamespace() only checks whether the
# package is available (it does not attach it), so the library() call below
# does the actual loading and fails loudly if the installation did not work.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)
# Load the packages used by the analysis (p_load() also installs missing ones)
p_load(
  dplyr, ggplot2, gutenbergr, magrittr, reshape2, stringr, tidyr, tidytext,
  wordcloud
)
#---------------#
# Download text #
#---------------#
# Look up Bram Stoker's works in the Project Gutenberg catalogue
stoker <- gutenberg_works(author == "Stoker, Bram")
# Pick Dracula by title rather than by row position, so the lookup does not
# depend on how the catalogue happens to be ordered. which() drops NA titles;
# indexing an empty result with [1] yields NA, which is caught below.
dracula_id <- stoker$gutenberg_id[which(stoker$title == "Dracula")][1]
if (is.na(dracula_id)) {
  stop(
    "Could not find \"Dracula\" among Bram Stoker's Project Gutenberg works",
    call. = FALSE
  )
}
# Download text
dracula <- gutenberg_download(dracula_id)
#---------------#
# Text cleaning #
#---------------#
# Keep only the "text" column of rows 162 to 15482. The range is hard-coded
# for the downloaded edition (presumably it trims front and back matter --
# verify if the source text changes).
dracula_df <- dracula[162:15482, "text"]
# Drop empty lines
dracula_df <- filter(dracula_df, text != "")
# Split the text into one word per row and discard stop words
dracula_tidy <- anti_join(unnest_tokens(dracula_df, word, text), stop_words)
# Discard every token that contains an underscore or a digit
dracula_tidy <- filter(dracula_tidy, !grepl("[_0-9]", word))
#------------------#
# Word frequencies #
#------------------#
# Tally how often each word occurs, most frequent first
dracula_wordfreq <- count(dracula_tidy, word, sort = TRUE)
# Bar chart of the words that occur more than 150 times, ordered by frequency
dracula_wordfreq %>%
  filter(n > 150) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Word", y = "Frequency", title = "Most common words in Dracula")
#--------------------#
# Sentiment analysis #
#--------------------#
# Get the NRC lexicon and tag every word of the book with its NRC categories.
# The join key is spelled out so the join does not rely on column guessing.
nrc <- get_sentiments("nrc")
dracula_sentiments <- dracula_tidy %>%
  inner_join(nrc, by = "word")
# Number of joined rows per word, most frequent first (a word tagged with
# several NRC categories contributes one row per category)
dracula_nrc <- dracula_sentiments %>%
  count(word, sort = TRUE)
# Total occurrences of each sentiment category in the book. This must be
# tabulated from the joined book data: tabulating nrc$sentiment would only
# describe the lexicon itself, not Dracula.
nrc_counts <- data.frame(table(dracula_sentiments$sentiment))
# Plot sentiment scores (table() names the columns Var1 and Freq)
ggplot(data = nrc_counts, aes(x = Var1, y = Freq)) +
  geom_bar(aes(fill = Var1), stat = "identity") +
  xlab("Sentiment") + ylab("Count") + ggtitle("Sentiment scores in Dracula") +
  theme(legend.position = "none")
#------------#
# Wordclouds #
#------------#
# Colour ramp: the five darkest shades of the 9-step "Reds" palette
pal <- brewer.pal(9, "Reds")[5:9]
# Basic wordcloud of per-word counts, limited to 80 words
cloud_counts <- dracula_tidy %>%
  anti_join(stop_words) %>%
  count(word)
wordcloud(
  words = cloud_counts$word, freq = cloud_counts$n,
  max.words = 80, colors = pal
)
# Comparison cloud split by bing sentiment: build a word x sentiment matrix
# of counts (0 where a word does not occur with a sentiment), then plot it
sentiment_matrix <- dracula_tidy %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0)
comparison.cloud(
  sentiment_matrix,
  colors = c("darkred", "red2"), max.words = 80
)