# script for topic modelling based on scikit-learn and an NLTK stopword list
# adapted from a DARIAH-DE tutorial
# import necessary modules
import os
import numpy as np
import sklearn.feature_extraction.text as text # submodule gathers utilities to build feature vectors from text documents
from sklearn import decomposition
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords') # fetch the standard stopword corpus (skipped if already up to date)
# load the custom multilingual stopword list 'en_fr_de',
# stored locally in the nltk_data/corpora/stopwords folder
my_stopwords=stopwords.words('en_fr_de')
# extend list if necessary
my_stopwords.extend(['@', 'forschungsbereich', 'frage'])
print(my_stopwords)
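# If no custom 'en_fr_de' file is installed, a comparable multilingual list can be
# assembled from NLTK's bundled lists (a sketch using NLTK's standard language
# identifiers; uncomment to use instead of the custom file):
# my_stopwords=stopwords.words('english')+stopwords.words('french')+stopwords.words('german')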
# read .txt files to be analysed from directory
CORPUS_PATH=("C:\\Users\\mobarget\\Google Drive\\ACADEMIA\\FBIII_OCRTexts")
filenames=sorted([os.path.join(CORPUS_PATH, filename) for filename in os.listdir(CORPUS_PATH)])
print(len(filenames)) # count files in corpus
print(filenames[:10]) # print names of 1st ten files in corpus
# apply stopword list and vectorise data
vectorizer=text.CountVectorizer(input='filename', stop_words=my_stopwords, min_df=0.1)
# min_df: float in range [0.0, 1.0] or int, default=1
# When building the vocabulary ignore terms that have a document frequency
# strictly lower than the given threshold. This value is also called cut-off
# in the literature. If float, the parameter represents a proportion of documents,
# integer absolute counts. This parameter is ignored if vocabulary is not "None".
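# Worked example (hypothetical corpus size): with min_df=0.1 and 40 files,
# a term must occur in at least 0.1*40=4 documents to enter the vocabulary.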
# build the document-term matrix
dtm=vectorizer.fit_transform(filenames).toarray()
vocab=np.array(vectorizer.get_feature_names_out()) # get_feature_names() was renamed in scikit-learn 1.0 and removed in 1.2
print(dtm.shape)
# vocabulary size of corpus
print(len(vocab))
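# dtm.shape is (number of documents, vocabulary size);
# len(vocab) equals the number of columns of the document-term matrix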
# generate topics and select representative words
num_topics=6 # no. of topics
num_top_words=30 # no. of words per topic to be displayed
clf=decomposition.NMF(n_components=num_topics, random_state=1)
# random_state : int, RandomState instance or None, optional (default=None)
# If int, random_state is the seed used by the random number generator;
# If RandomState instance, random_state is the random number generator;
# If None, the random number generator is the RandomState instance used by np.random.
doctopic=clf.fit_transform(dtm)
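# doctopic has shape (number of documents, num_topics); entry [d, t] is the raw,
# unnormalised weight of topic t in document d as estimated by NMF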
topic_words=[] # list of most prominent words associated with topics
for topic in clf.components_:
    # argsort gives word indices ordered by ascending weight; [::-1] reverses
    # to descending order, and the slice keeps the num_top_words strongest words
    word_idx=np.argsort(topic)[::-1][0:num_top_words]
    topic_words.append([vocab[i] for i in word_idx])
# output results
print(topic_words)
doctopic=doctopic/np.sum(doctopic,axis=1,keepdims=True)
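# after this row-wise normalisation each row of doctopic sums to 1,
# i.e. it holds the topic shares (proportions) for one document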
# derive names of texts to associate with each topic (analogous to topic shares in MALLET output)
text_names=[]
for fn in filenames:
    basename=os.path.basename(fn)
    name, ext=os.path.splitext(basename)
    name=name.rstrip('0123456789')
    text_names.append(name)
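# rstrip removes all trailing digits, so hypothetical files such as
# 'report01.txt' and 'report02.txt' both end up under the group name 'report'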
text_names=np.asarray(text_names) # turn into array to use NumPy function
# list the derived text names (one entry per file in the corpus)
print(text_names)
doctopic_orig=doctopic.copy()
print(doctopic_orig)
num_groups=len(set(text_names))
print(num_groups)
doctopic_grouped=np.zeros((num_groups, num_topics))
for i, name in enumerate(sorted(set(text_names))):
    doctopic_grouped[i,:]=np.mean(doctopic[text_names==name,:],axis=0)
doctopic=doctopic_grouped
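# doctopic now has one row per distinct name; each row is the mean of the
# topic shares of all files sharing that name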
texts=sorted(set(text_names))
# show top three topics per text in corpus
print("Top NMF topics in ...")
for i in range(len(doctopic)):
    # the slice [0:3] selects the three highest-ranked topics per text
    top_topics=np.argsort(doctopic[i,:])[::-1][0:3]
    top_topics_str=' '.join(str(t) for t in top_topics)
    print("{}:{}".format(texts[i],top_topics_str))
# show top words
for t in range(len(topic_words)):
    # display at most num_top_words words per topic
    print("Topic {}: {}".format(t, ' '.join(topic_words[t][:num_top_words])))
# Print "done" to indicate successful completion
print("done")