lda1.py
import datetime
import time

import pymysql
from nltk.corpus import stopwords

# Record the start time so we can report the total runtime at the end.
ts = time.time()
print(ts)
print(datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'))
# Connect to the local MySQL instance (port 8889 is MAMP's default).
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root',
                       port=8889, db='AlgebraNationWall', charset='utf8')
cursor = conn.cursor()
sql = 'SELECT id, postGroup FROM PostUnlabelled;'  # add LIMIT 1000 while testing
cursor.execute(sql)
doclist = []
try:
    for row in cursor:
        sentence = row[1]
        # With charset='utf8' pymysql returns text columns as str; decode any
        # raw bytes rows, falling back to windows-1252 when they are not
        # valid UTF-8.
        if isinstance(sentence, bytes):
            try:
                sentence = sentence.decode('utf-8')
            except UnicodeDecodeError:
                sentence = sentence.decode('windows-1252', 'ignore')
        sentence = sentence.replace('\n', ' ').replace('\r', '')
        doclist.append(sentence)
except Exception as e:
    # Report the error, then re-raise.
    print(e)
    raise
print(row[0])  # id of the last row fetched, as a quick progress check
#print(doclist)
doc_complete = doclist
from nltk.stem.wordnet import WordNetLemmatizer
import string
# Importing gensim, which provides the LDA implementation.
import gensim
from gensim import corpora
stop = set(stopwords.words('english'))
#exclude = set(string.punctuation)
#lemma = WordNetLemmatizer()
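# If the stopwords corpus has not been downloaded yet, stopwords.words()
# raises a LookupError; a one-time setup (not in the original script) is:
#   import nltk
#   nltk.download('stopwords')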
def clean(doc):
    # Lowercase the document and drop English stopwords; the punctuation and
    # lemmatization steps below are left disabled.
    stop_free = " ".join(i for i in doc.lower().split() if i not in stop)
    #punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    #normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return stop_free
doc_clean = [clean(doc).split() for doc in doc_complete]
#print(doc_clean)
# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)
# Ignore words that appear in fewer than 3 documents.
dictionary.filter_extremes(no_below=3)
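# filter_extremes also accepts no_above for dropping very frequent tokens,
# e.g. words appearing in more than 10% of documents (left disabled here):
#dictionary.filter_extremes(no_below=3, no_above=0.1)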
print('doc2bow started')
# Converting the list of documents (corpus) into a document-term matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
#print(doc_term_matrix)
# Creating the object for the LDA model using the gensim library.
Lda = gensim.models.ldamodel.LdaModel
print('lda training started')
# Running and training the LDA model on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=25, id2word=dictionary, passes=15)
print(ldamodel.print_topics(num_topics=25, num_words=20))
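# Optional sanity check, not in the original script: inspect the inferred
# topic mixture of a single document via gensim's get_document_topics().
if doc_term_matrix:
    print(ldamodel.get_document_topics(doc_term_matrix[0]))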
ts2 = time.time()
print(datetime.datetime.fromtimestamp(ts2).strftime('%Y-%m-%d %H:%M:%S'))
# Report the elapsed time as a duration (timedelta), not a calendar timestamp.
print('\n\nTime taken to run:', datetime.timedelta(seconds=ts2 - ts))
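# Optional, not in the original script: persist the dictionary and model so a
# later run can reload them with Dictionary.load() / LdaModel.load() instead
# of retraining. The file names below are placeholders.
dictionary.save('lda1.dict')
ldamodel.save('lda1.model')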