import numpy as np
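
# Fuzzy c-means clustering of documents over a term-count matrix: every
# document i gets a soft membership u[i][k] in each cluster k (controlled
# by the fuzzifier m > 1) instead of a single hard assignment.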


class Corpus(object):
    '''
    A collection of documents.
    '''

    def __init__(self):
        '''
        Initialize an empty corpus.
        '''
        self.m = 1.1          # fuzzifier; must be > 1, values near 1 give near-hard clusters
        self.kmeans_set = []  # cluster centroids in term space
        self.documents = []

    def add_document(self, document):
        '''
        Add a document to the corpus.
        '''
        # Tokenize with an empty stop-word list; Document.split is
        # defined in the Document class elsewhere in this project.
        document.split([])
        self.documents.append(document)

    def build_vocabulary(self):
        '''
        Construct a list of unique words in the corpus.
        '''
        discrete_set = set()
        for document in self.documents:
            for word in document.words:
                discrete_set.add(word)
        self.vocabulary = list(discrete_set)

    def count_term_doc_matrix(self):
        '''
        Build the document-term count matrix.
        '''
        self.build_vocabulary()
        number_of_documents = len(self.documents)
        vocabulary_size = len(self.vocabulary)
        # Map each word to its column once instead of calling list.index()
        # for every occurrence.
        word_index = {word: i for i, word in enumerate(self.vocabulary)}
        self.term_doc_matrix = np.zeros([number_of_documents, vocabulary_size], dtype=np.double)
        for d_index, doc in enumerate(self.documents):
            for word in doc.words:
                if word in word_index:
                    self.term_doc_matrix[d_index, word_index[word]] += 1
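        # Example: for documents "apple banana" and "banana banana", and
        # assuming the vocabulary happened to come out as ['apple', 'banana']
        # (set order is arbitrary), term_doc_matrix would be [[1., 1.], [0., 2.]].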

    def kmeans(self, k):
        '''
        Cluster the documents into k fuzzy clusters.
        '''
        self.k = k
        # Initialize centroids with random term weights in [0, 2).
        for _ in range(k):
            self.kmeans_set.append(2 * np.random.rand(len(self.vocabulary)))
        self.u = np.random.rand(len(self.documents), k)
        # Alternate membership and centroid updates for a fixed number of iterations.
        for _ in range(700):
            self.update_u()
            self.update_c()
        self.show_result_clusters()

    def update_u(self):
        '''
        Update memberships: u[i][k] = 1 / sum_j (||x_i - c_k|| / ||x_i - c_j||) ** (2 / (m - 1)).
        '''
        for k in range(self.k):
            for i in range(len(self.documents)):
                self.u[i][k] = 1 / self.Edcm(self.term_doc_matrix[i], k)

    def Edcm(self, document, k):
        '''
        Denominator of the membership update for one document and cluster k.
        '''
        suma = 0.0
        for j in range(self.k):
            suma += ((np.linalg.norm(document - self.kmeans_set[k])
                      / np.linalg.norm(document - self.kmeans_set[j])) ** (2 / (self.m - 1)))
        return suma
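
    # As m approaches 1 from above, the memberships computed here approach
    # hard 0/1 assignments; the m = 1.1 set in __init__ keeps them close to
    # crisp while still being a genuinely fuzzy weighting.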

    def update_c(self):
        '''
        Update each centroid: c_k = sum_i u[i][k]**m * x_i / sum_i u[i][k]**m.
        '''
        for ck in range(len(self.kmeans_set)):
            self.kmeans_set[ck] = self.Euk(ck) / self.Eum(ck)

    def Euk(self, k):
        '''
        Membership-weighted sum of document vectors for cluster k.
        '''
        suma = 0
        for i in range(len(self.documents)):
            suma += (self.u[i][k] ** self.m) * self.term_doc_matrix[i]
        return suma

    def Eum(self, k):
        '''
        Sum of the m-th powers of the memberships for cluster k.
        '''
        suma = 0.0
        for i in range(len(self.documents)):
            suma += self.u[i][k] ** self.m
        return suma

    def get_cluster_index(self, document):
        '''
        Return the index of the centroid closest to the document vector.
        '''
        distances = [np.linalg.norm(centroid - document) for centroid in self.kmeans_set]
        return int(np.argmin(distances))

    def show_result_clusters(self):
        '''
        Print each document's nearest cluster, then the top words per centroid.
        '''
        result = []
        for d_index in range(len(self.term_doc_matrix)):
            result.append([self.documents[d_index].filepath,
                           self.get_cluster_index(self.term_doc_matrix[d_index])])
        for item in sorted(result, key=lambda x: x[1]):
            print(item)
        for cluster in self.kmeans_set:
            self.get_most_important_word_in_cluster(cluster)

    def get_most_important_word_in_cluster(self, cluster):
        '''
        Print the 10 highest-weighted vocabulary words for a centroid.
        '''
        # Work on a flattened copy so zeroing out each found maximum does
        # not destroy the centroid itself, and so indexing works whatever
        # the centroid's array shape is.
        weights = np.ravel(cluster).copy()
        cluster_most_important_list = []
        for _ in range(10):
            best_word = int(weights.argmax())
            cluster_most_important_list.append(self.vocabulary[best_word])
            weights[best_word] = 0
        print(cluster_most_important_list)
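

# A minimal usage sketch, assuming a Document class (defined elsewhere in
# this project) that takes a file path, exposes .filepath and .words, and
# implements split(stop_words); the module path and file names below are
# hypothetical placeholders.
if __name__ == '__main__':
    from document import Document  # hypothetical import

    corpus = Corpus()
    for path in ['doc1.txt', 'doc2.txt', 'doc3.txt']:
        corpus.add_document(Document(path))
    corpus.count_term_doc_matrix()
    corpus.kmeans(2)  # runs fuzzy c-means and prints the resulting clusters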