-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtidyscrooge.R
73 lines (63 loc) · 2.42 KB
/
tidyscrooge.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
library(rprojroot)
library(gutenbergr)
library(hrbrthemes)
library(stringi)
library(tidytext)
library(tidyverse)
#' Load the text of A Christmas Carol, caching it as an RDS file under
#' `<project root>/data`.
#'
#' The book is downloaded from Project Gutenberg only when the cache file is
#' missing; otherwise the cached copy is read back.
rt <- find_rstudio_root_file()
carol_rds <- file.path(rt, "data", "carol.rds")

if (file.exists(carol_rds)) {
  carol_df <- read_rds(carol_rds)
} else {
  # Running this will show that the ID of A Christmas Carol is "46"
  # gutenberg_works(author=="Dickens, Charles")
  carol_df <- gutenberg_download("46")

  # Make sure the cache directory exists before saving: on a fresh checkout
  # `data/` may be missing, and the write would then fail after the download.
  dir.create(dirname(carol_rds), recursive = TRUE, showWarnings = FALSE)
  write_rds(carol_df, carol_rds)
}
#' Convenience only: work with the raw lines as a plain character vector,
#' keeping just the chapters (staves).
#'
#' Everything before the first line containing "STAVE I:" is dropped; that
#' heading line itself is kept as the first element.
carol_txt <- local({
  txt_lines <- carol_df$text
  stave_one <- which(grepl("STAVE I:", txt_lines, fixed = TRUE))

  if (length(stave_one) == 0L) {
    stop("Could not find a \"STAVE I:\" heading in the text.", call. = FALSE)
  }

  # Slice from the first match to the end. This avoids negating
  # `1:(i) - 1`, which relies on `:` binding tighter than `-` and yields an
  # empty vector when the heading is already the first line.
  txt_lines[stave_one[1L]:length(txt_lines)]
})
#' We'll need this later to make prettier facet titles.
#'
#' One row per stave: its number (`stave`) and a "Stave <n>: <Title>" label
#' (`title`) built from the lines of the text that contain "STAVE", with the
#' "STAVE <numeral>: " prefix removed and the remainder title-cased.
#' The code assumes the text has exactly five such heading lines.
stave_titles <- tibble(
  stave = 1:5,
  title = sprintf(
    "Stave %s: %s",
    stave,
    carol_txt[stri_detect_fixed(carol_txt, "STAVE")] %>%
      stri_replace_first_regex("STAVE [[:alpha:]]{1,3}: ", "") %>%
      stri_trans_totitle()
  )
)
#' Break the text up into chapters, paragraphs, sentences, and words,
#' preserving the hierarchy so we can use it later.
#'
#' Each level is numbered as it is split: `stave` per chapter, `para` within
#' a stave, and `sent` within a paragraph. The result has one row per word.
carol_tokens <- tibble(txt = carol_txt) %>%
  # Split the text on the stave heading lines.
  unnest_tokens(
    chapter, txt,
    token = "regex",
    pattern = "STAVE [[:alpha:]]{1,3}: [[:alpha:] [:punct:]]+"
  ) %>%
  # row_number() is used instead of 1:n() so an empty input yields an empty
  # index rather than c(1, 0).
  mutate(stave = row_number()) %>%
  unnest_tokens(paragraph, chapter, token = "paragraphs") %>%
  group_by(stave) %>%
  mutate(para = row_number()) %>%
  ungroup() %>%
  unnest_tokens(sentence, paragraph, token = "sentences") %>%
  group_by(stave, para) %>%
  mutate(sent = row_number()) %>%
  ungroup() %>%
  unnest_tokens(word, sentence)
#' Retrieve the NRC sentiments and score each paragraph.
#'
#' Words are matched against the lexicon, counted per stave, paragraph and
#' sentiment, spread into one column per sentiment, and scored as
#' `positive - negative`. The stave titles are then attached for plotting.
#'
#' The paragraph number is exposed as `index` rather than `para` so that this
#' block is easier to reuse (it is not reused here, but it might have been).
carol_with_sent <- carol_tokens %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  count(stave, index = para, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  left_join(stave_titles, by = "stave")
#' Make the plot: one thin vertical segment per paragraph, drawn from zero to
#' that paragraph's `sentiment` value, coloured and faceted by stave title.
ggplot(data = carol_with_sent) +
  geom_segment(
    mapping = aes(
      x = index, y = sentiment,
      xend = index, yend = 0,
      color = title
    ),
    size = 0.33
  ) +
  scale_x_comma(limits = range(carol_with_sent$index)) +
  scale_y_comma() +
  scale_color_ipsum() +
  facet_wrap(~title, scales = "free_x", ncol = 5) +
  labs(
    x = NULL,
    y = "Sentiment",
    title = "Sentiment Analysis of A Christmas Carol",
    subtitle = "By stave & ¶",
    caption = "Humbug!"
  ) +
  theme_ipsum_rc(
    grid = "Y",
    axis_text_size = 8,
    strip_text_face = "italic",
    strip_text_size = 10.5
  ) +
  # Colour only repeats the facet title, so the legend is hidden.
  theme(legend.position = "none")