# artist_clustering.py
# Forked from aliostad/WikiRockWord2Vec.
from __future__ import unicode_literals
import gensim
import codecs
from scipy import cluster
import time
import numpy as np
def load_model(path='data/rock_music.w2v'):
    """Load a saved gensim Word2Vec model from disk.

    Args:
        path: Location of the saved model file. Defaults to
            ``data/rock_music.w2v``, resolved against the current working
            directory.

    Returns:
        The object returned by ``gensim.models.Word2Vec.load(path)``.
    """
    return gensim.models.Word2Vec.load(path)
def get_all_artists(path='data/all_artists.txt', encoding='utf-8'):
    """Return the distinct artist names listed in a text file.

    The file is read one name per line; surrounding whitespace is stripped,
    blank lines are ignored and repeated names are collapsed.

    Args:
        path: Text file holding one artist name per line. Defaults to
            ``data/all_artists.txt``, resolved against the current working
            directory.
        encoding: Encoding used to decode the file. UTF-8 is assumed by
            default; pass a different value if the file is stored otherwise.

    Returns:
        A dict keys collection of unique, stripped names (a view on
        Python 3, a list on Python 2).
    """
    # The context manager guarantees the handle is closed even if decoding
    # fails part-way through; an explicit encoding keeps the result
    # independent of the platform locale.
    with codecs.open(path, encoding=encoding) as handle:
        stripped = (line.strip() for line in handle)
        # Dict keys de-duplicate; on Python 3.7+ they also keep first-seen
        # order. Empty strings (blank lines) are not artist names.
        unique = dict.fromkeys(name for name in stripped if name)
    return unique.keys()
def build_vectors(model, artists=None):
    """Collect the embedding vector of every artist known to the model.

    Args:
        model: A gensim ``Word2Vec`` model, its ``KeyedVectors``, or any
            mapping that supports ``in`` and ``[]`` keyed by artist name.
        artists: Optional iterable of artist names to look up. When omitted,
            the names come from ``get_all_artists()``.

    Returns:
        A ``(vecs, names)`` tuple of two parallel lists: ``vecs[i]`` is the
        vector stored for ``names[i]``. Names missing from the model are
        skipped.
    """
    # gensim 4.0 dropped the deprecated ``in`` / ``[]`` support on Word2Vec
    # itself; lookups live on ``model.wv``. Falling back to ``model`` keeps
    # plain mappings and bare KeyedVectors working.
    vectors = getattr(model, 'wv', model)
    if artists is None:
        artists = get_all_artists()
    vecs = []
    names = []
    for artist in artists:
        if artist in vectors:
            vecs.append(vectors[artist])
            names.append(artist)
    return vecs, names
def build_clusters(num_clusters=300):
    """Group artists into clusters by running k-means on their vectors.

    Loads the model via ``load_model()``, gathers the vectors of the known
    artists via ``build_vectors()`` and partitions them with
    ``scipy.cluster.vq.kmeans2``.

    Args:
        num_clusters: Number of clusters requested from ``kmeans2``.
            Defaults to 300.

    Returns:
        A dict mapping each cluster label produced by ``kmeans2`` to the
        list of artist names assigned to it. Labels that received no artist
        are absent from the dict.

    Note:
        ``kmeans2`` is called with its default initialisation, which is
        randomised, so labels and membership may differ between runs.
    """
    model = load_model()
    vectors, names = build_vectors(model)
    # kmeans2 returns (centroids, labels); only the per-row labels are used.
    _, labels = cluster.vq.kmeans2(np.array(vectors), num_clusters)
    clusters = {}
    # labels[i] is the cluster assigned to the i-th vector, i.e. names[i].
    for label, artist in zip(labels, names):
        clusters.setdefault(label, []).append(artist)
    return clusters
def report_clusters(clusters):
    """Print each cluster as one comma-separated line of artist names.

    Args:
        clusters: Mapping of cluster label to a list of artist-name strings.

    Every cluster line is followed by a printed run of newlines so that
    consecutive clusters are visually separated in the output.
    """
    spacer = '\n\n\n'
    for members in clusters.values():
        print(', '.join(members))
        print(spacer)