# YT_ETL_LDA.R
# Author: Suryateja Chalapati
# Date: 5/1/2021

# Load libraries
library(tm)
library(tidytext)
library(topicmodels)
library(ggplot2)
library(dplyr)
library(tidyr)
#install.packages('reshape2')
# Parent folder of the YouTube transcript corpus
parent.folder <- "C:/Users/surya/Downloads/Career/M.S. [2020-22]/3.Spring_2021/[ISM6905.001S21.12437 Independent Study]/YT_Corpus [All]/YT_Corpus [2020-2021]"
# All sub-folders under the parent (the parent itself is dropped with [-1])
sub.folders <- list.dirs(parent.folder, recursive = TRUE)[-1]
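# Optional sanity check (sketch, not part of the original pipeline): confirm that
# transcript sub-folders were actually found before looping. Uncomment to run.
# cat(length(sub.folders), "sub-folders found under", parent.folder, "\n")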
# Process each sub-folder: build a corpus from its transcripts, fit an LDA model, and write the results
for (i in sub.folders) {
tryCatch({
setwd(i)
# Load the transcript files in this folder
filenames <- list.files(getwd(), recursive = TRUE, pattern = "\\.txt$")
# Read each file and collapse its lines into a single string (one element per document)
files <- sapply(filenames, function(f) paste(readLines(f), collapse = " "))
# Create corpus from vector
docs <- Corpus(VectorSource(files))
# Replace runs of characters outside the ASCII range 0x30-0x7A (keeps digits, letters, and a few symbols) with a space
remove_alphanum <- content_transformer(function(x) gsub("[^\x30-\x7A]+", " ", x))
docs <- tm_map(docs, remove_alphanum)
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Strip extra whitespace
docs <- tm_map(docs, stripWhitespace)
# Stem document - remove word suffixes
docs <- tm_map(docs, stemDocument)
# Define and remove custom stopwords
# (note: this runs after stemming, so unstemmed forms such as "looking" or "going" may already have been reduced to "look"/"go")
myStopwords <- c("logsdon", "baab", "baabs", "baabre", "can", "one", "and", "like", "just", "gonna", "know",
                 "really", "right", "going", "get", "well", "lot", "actually", "new",
                 "will", "much", "way", "see", "make", "look",
                 "also", "able", "say", "back", "got", "take", "great",
                 "many", "next", "using", "around", "thing", "two",
                 "looking", "said", "kind", "come", "put", "yeah",
                 "even", "still", "ago", "every", "three", "five",
                 "okay", "whether", "seen", "six", "there", "this",
                 "but", "you", "want", "thats",
                 "folks", "sure", "run")
docs <- tm_map(docs, removeWords, myStopwords)
# Create document-term matrix (term-frequency weighting)
params <- list(minDocFreq = 1, removeNumbers = TRUE, stopwords = TRUE, stemming = TRUE, weighting = weightTf)
dtm <- DocumentTermMatrix(docs, control = params)
# Use the original file names as document (row) names
rownames(dtm) <- filenames
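# Optional guard (sketch, not in the original pipeline): LDA() below requires every document
# to contain at least one term, so a transcript that ends up empty after cleaning would stop
# this folder's run. Uncomment to drop empty documents before modeling:
# nonempty <- apply(as.matrix(dtm), 1, sum) > 0
# dtm <- dtm[nonempty, ]
# filenames <- filenames[nonempty]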
# Collapse matrix by summing over columns to get corpus-wide term frequencies
dtm.matrix <- as.matrix(dtm)
freq <- colSums(dtm.matrix)
# Length equals the total number of distinct terms
# (inside a loop the value is not auto-printed, so print it explicitly)
print(length(freq))
# Sort terms in decreasing order of frequency and write to disk
ord <- order(freq, decreasing = TRUE)
write.csv(freq[ord], "word_freq.csv")
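# Optional visual check (sketch using the already-loaded ggplot2; "top_terms.png" is an
# illustrative file name, not part of the original outputs). Uncomment to run:
# top_terms <- data.frame(term = names(freq[ord])[1:20], count = freq[ord][1:20])
# p <- ggplot(top_terms, aes(x = reorder(term, count), y = count)) +
#   geom_col() +
#   coord_flip() +
#   labs(x = "Term", y = "Frequency", title = "Top 20 terms in this sub-folder")
# ggsave("top_terms.png", p)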
# Set parameters for Gibbs sampling
burnin <- 4000    # iterations discarded at the start of the chain
iter <- 2000      # number of Gibbs iterations
thin <- 500       # keep only every 500th iteration
seed <- list(2003, 5, 63, 100001, 765)   # one seed per start
nstart <- 5       # number of independent starts
best <- TRUE      # return only the run with the highest posterior likelihood
# Number of topics
k <- 5
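# Choosing k (optional sketch, not part of the original pipeline): fit a few candidate topic
# counts on this DTM and compare their log-likelihoods as a rough heuristic; the candidate
# values and the shorter chain settings below are illustrative. Uncomment to run:
# candidate_k <- c(3, 5, 7, 10)
# fits <- lapply(candidate_k, function(kk)
#   LDA(dtm, kk, method = "Gibbs", control = list(seed = 2003, burnin = 1000, iter = 1000)))
# print(data.frame(k = candidate_k, logLik = sapply(fits, logLik)))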
# Run LDA using Gibbs sampling
# The fitted object (class "TopicModel"; see ?"TopicModel-class" in topicmodels) contains, among others:
#   call: the matched call
#   Dim: number of documents and terms
#   control: options used for estimating the topic model
#   k: number of topics
#   terms / documents: vectors of term and document names
#   beta: matrix of logarithmized word distributions for each topic
#   gamma: matrix of posterior topic distributions for each document
#   iter: number of iterations made
#   logLiks: intermediate log-likelihood values kept for the corpus
#   n: number of words in the data used
#   wordassignments: which topic each word in each document was assigned to
ldaOut <- LDA(dtm, k, method="Gibbs", control=list(nstart=nstart, seed = seed, best=best, burnin = burnin, iter = iter, thin=thin))
# Write out results
# Document-to-topic assignments
ldaOut.topics <- as.matrix(topics(ldaOut))
write.csv(ldaOut.topics, file = paste("LDAGibbs", k, "DocsToTopics.csv"))
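# Optional sanity check (sketch, not in the original script): how many documents were assigned to each topic.
# print(table(topics(ldaOut)))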
# Top 100 terms in each topic
ldaOut.terms <- as.matrix(terms(ldaOut, 100))
write.csv(ldaOut.terms, file = paste("LDAGibbs", k, "TopicsToTerms.csv"))
# Per-document topic probabilities (the gamma matrix)
topicProbabilities <- as.data.frame(ldaOut@gamma)
write.csv(topicProbabilities, file = paste("LDAGibbs", k, "TopicProbabilities.csv"))
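# Optional check (sketch): each row of gamma is a probability distribution over the k topics,
# so the row sums should all be ~1. Uncomment to inspect:
# print(summary(rowSums(topicProbabilities)))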
# Relative importance of the top topic vs. the second topic for each document
# (convert each row to a numeric vector before sorting; sorting a data.frame row directly errors in current R)
topic1ToTopic2 <- lapply(1:nrow(dtm), function(x) {
  probs <- sort(as.numeric(topicProbabilities[x, ]))
  probs[k] / probs[k - 1]
})
# Relative importance of the second vs. third most important topics
topic2ToTopic3 <- lapply(1:nrow(dtm), function(x) {
  probs <- sort(as.numeric(topicProbabilities[x, ]))
  probs[k - 1] / probs[k - 2]
})
# Write to file
write.csv(topic1ToTopic2, file = paste("LDAGibbs", k, "Topic1ToTopic2.csv"))
write.csv(topic2ToTopic3, file = paste("LDAGibbs", k, "Topic2ToTopic3.csv"))
# Extract the per-topic word probabilities (beta) in tidy form for visualization
ap_topics <- tidy(ldaOut, matrix = "beta")
# ap_top_terms <- ap_topics %>%
# group_by(topic) %>%
# top_n(20, beta) %>%
# ungroup() %>%
# arrange(topic, -beta)
#
# ap_top_terms %>%
# mutate(term = reorder(term, beta)) %>%
# ggplot(aes(term, beta, fill = factor(topic))) +
# geom_col(show.legend = FALSE) +
# facet_wrap(~ topic, scales = "free") +
# coord_flip()
# Spread beta into one column per topic and compute the log2 ratio of topic 2 to topic 1
# for terms with probability above 0.001 in either of those topics
beta_spread <- ap_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))
write.csv(beta_spread,"beta_spread.csv")
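# Optional visualization (sketch using the already-loaded dplyr/ggplot2; "beta_log_ratio.png"
# is an illustrative file name, not part of the original outputs). Terms with the largest
# absolute log ratio are the most distinctive between topic 1 and topic 2. Uncomment to run:
# top_ratio <- beta_spread %>%
#   top_n(20, abs(log_ratio)) %>%
#   mutate(term = reorder(term, log_ratio))
# p2 <- ggplot(top_ratio, aes(term, log_ratio)) +
#   geom_col() +
#   coord_flip() +
#   labs(x = "Term", y = "log2(topic2 / topic1)")
# ggsave("beta_log_ratio.png", p2)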
}, error=function(e) {cat("ERROR :", conditionMessage(e), "\n")} )
}