-
Notifications
You must be signed in to change notification settings - Fork 91
/
Copy pathkeras_item2vec.py
171 lines (156 loc) · 6.97 KB
/
keras_item2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import keras
from keras.models import Sequential, Model
from keras.layers import Activation, Merge, Reshape
from keras.layers import Input, Embedding, Dense, dot
from keras.layers.core import Lambda
from keras import optimizers
from keras import backend as K
import numpy as np
import random
import utils.process as process
from utils.log_tool import data_process_logger as logger
def skipgram_model(vocab_size, embedding_dim=100, paradigm='Functional'):
    """Build and compile a skip-gram (target, context) pair classifier.

    Both variants embed the two item ids, take the dot product of the two
    embeddings and squash it to a sigmoid score. The model is compiled with
    the 'adam' optimizer and 'binary_crossentropy' loss.

    Args:
        vocab_size: number of rows of each embedding table.
        embedding_dim: width of each embedding vector.
        paradigm: 'Sequential' builds two independent embedding branches
            joined by a dot-mode ``Merge`` layer; 'Functional' builds a
            two-input ``Model`` whose inputs go through one shared
            ``Embedding`` layer followed by a 1-unit sigmoid ``Dense``.

    Returns:
        The compiled Keras model, or None (after printing 'paradigm error')
        when ``paradigm`` is neither 'Sequential' nor 'Functional'.
    """
    if paradigm == 'Sequential':
        # One embedding table per side; each branch consumes a single id.
        pivot_branch = Sequential()
        pivot_branch.add(Embedding(vocab_size, embedding_dim, input_length=1))

        context_branch = Sequential()
        context_branch.add(Embedding(vocab_size, embedding_dim, input_length=1))

        # Join both branches with a dot product, flatten the (1, 1) result
        # to (1,) and turn it into a probability.
        net = Sequential()
        net.add(Merge([pivot_branch, context_branch], mode='dot'))
        net.add(Reshape((1,), input_shape=(1, 1)))
        net.add(Activation('sigmoid'))
        net.compile(optimizer='adam', loss='binary_crossentropy')
        return net

    if paradigm == 'Functional':
        target_in = Input(shape=(1,), name='target')
        context_in = Input(shape=(1,), name='context')

        # A single embedding layer is applied to both inputs, so target and
        # context ids share the same weights.
        item_embedding = Embedding(vocab_size, embedding_dim, input_length=1,
                                   name='shared_embedding')
        target_vec = item_embedding(target_in)
        context_vec = item_embedding(context_in)

        # Dot product along the embedding axis, flattened to one value.
        similarity = dot([target_vec, context_vec], axes=-1)
        flat_similarity = Reshape((1,), input_shape=(1, 1))(similarity)
        score = Dense(1, input_shape=(1,), activation='sigmoid')(flat_similarity)

        net = Model(inputs=[target_in, context_in], outputs=score)
        net.compile(optimizer='adam', loss='binary_crossentropy')
        return net

    # Unsupported paradigm: report it and hand back nothing.
    print('paradigm error')
    return None
def skipgram_reader_generator(movie_dict, file_name=process.DoulistCorpusNameFile, context_window=2):
    """Create a reader that yields labelled skip-gram training pairs.

    Each line of ``file_name`` is split on tabs and every token is mapped to
    an id through ``movie_dict`` (unknown tokens map to
    ``movie_dict['<unk>']``). For every position in the line the reader
    yields one positive pair per neighbour within ``context_window``
    positions, followed by the same number of randomly drawn negative pairs.

    Args:
        movie_dict: mapping from token to integer id; must contain '<unk>'.
            Negative ids are drawn from ``range(len(movie_dict))``, so ids are
            assumed to lie in that range -- TODO confirm against the
            dictionary builder.
        file_name: path of the tab-separated corpus file.
        context_window: how many positions on each side of the target count
            as context.

    Returns:
        A zero-argument function; calling it returns a generator of
        ``((target_id, other_id), label)`` tuples where label is 1 for a
        real context pair and 0 for a sampled negative pair.
    """
    def reader():
        vocabulary_size = len(movie_dict)
        # Loop-invariant: resolve the fallback id once instead of per token.
        unk_id = movie_dict['<unk>']
        with open(file_name) as fopen:
            for line in fopen:
                line_list = line.strip().split('\t')
                movie_ids = [movie_dict.get(name, unk_id) for name in line_list]
                for i, target in enumerate(movie_ids):
                    # Window clipped to the line, excluding position i itself.
                    start = max(0, i - context_window)
                    stop = min(len(movie_ids), i + context_window + 1)
                    context_list = movie_ids[start:i] + movie_ids[i + 1:stop]

                    # generate positive samples
                    for context_id in context_list:
                        yield ((target, context_id), 1)

                    # generate negative samples: never a real context id and
                    # never the target itself (a (t, t) pair labelled 0 is
                    # pure label noise).
                    excluded = set(context_list)
                    excluded.add(target)
                    # If every id in the vocabulary is excluded, rejection
                    # sampling below could never terminate; skip negatives.
                    if len(excluded) >= vocabulary_size and all(
                            idx in excluded for idx in range(vocabulary_size)):
                        continue
                    for _ in context_list:
                        ne_idx = random.randrange(0, vocabulary_size)
                        while ne_idx in excluded:
                            ne_idx = random.randrange(0, vocabulary_size)
                        yield ((target, ne_idx), 0)
    return reader
def cbow_base_model(dict_size, emb_size=100, context_window_size=4):
    """Build and compile the CBOW baseline network.

    The stack is: an embedding lookup over ``context_window_size`` ids, a
    mean over axis 1 of the embedded ids, a dense projection to
    ``dict_size`` units and a softmax.

    Args:
        dict_size: vocabulary size; used for both the embedding table and
            the output layer width.
        emb_size: width of each embedding vector.
        context_window_size: number of context ids fed in per sample.

    Returns:
        A Sequential model compiled with Nesterov-momentum SGD and
        categorical cross-entropy loss.
    """
    embedding_init = keras.initializers.TruncatedNormal(mean=0.0, stddev=0.2)
    layer_stack = [
        Embedding(dict_size, emb_size,
                  input_length=context_window_size,
                  embeddings_initializer=embedding_init),
        # Average the context embeddings into a single emb_size vector.
        Lambda(lambda x: K.mean(x, axis=1), output_shape=(emb_size,)),
        Dense(dict_size),
        Activation('softmax'),  # TODO: use nce
    ]
    net = keras.models.Sequential(layer_stack)

    optimizer = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    net.compile(optimizer=optimizer, loss='categorical_crossentropy')
    return net
def train_cbow_base_model(min_word_freq=5, emb_size=100, context_window_size=4,
                          epochs=20, batch_size=128,
                          output_path='./models/keras_0804_09_cbow'):
    """Train the CBOW baseline and write its embedding table to a text file.

    N-grams of ``context_window_size + 1`` ids are read from the project
    reader; the first ``context_window_size`` ids are the model input and the
    last id is the label. After training, one line per vocabulary id is
    written to ``output_path`` in the form ``"<id> <v0> <v1> ..."``.

    Args:
        min_word_freq: forwarded to ``process.get_movie_name_id_dict``.
        emb_size: embedding width.
        context_window_size: number of input ids per sample.
        epochs: number of passes over the reader.
        batch_size: number of samples between two training steps.
        output_path: destination file for the learned embeddings. The parent
            directory is not created here and must already exist.

    Returns:
        None.
    """
    word_dict = process.get_movie_name_id_dict(min_word_freq=min_word_freq)
    dict_size = len(word_dict)
    model = cbow_base_model(dict_size, emb_size, context_window_size)

    def as_arrays(x_rows, y_ids):
        # Stack the inputs and convert labels to categorical one-hot encoding.
        return (np.array(x_rows),
                keras.utils.to_categorical(y_ids, num_classes=dict_size))

    # range (not xrange) so the loop runs on both Python 2 and Python 3.
    for epoch_id in range(epochs):
        # NOTE: batch_id counts samples seen in this epoch, not batches; the
        # name and the log text are kept for continuity with existing logs.
        batch_id = 0
        x_batch = []
        y_batch = []
        # NOTE(review): presumably a buffered shuffle with buffer size 10000;
        # confirm against utils.process.
        reader = process.shuffle(
            process.reader_creator(word_dict, ngram=context_window_size + 1), 10000)
        for movie_ids in reader():
            batch_id += 1
            # The `x_batch and` guards only matter for batch_size == 1, where
            # the pending batch would otherwise be empty at this point.
            if x_batch and batch_id % (batch_size * 50) == 0:
                # Evaluate on the pending batch before it is trained on.
                features, labels = as_arrays(x_batch, y_batch)
                score = model.evaluate(features, labels)
                logger.info('[epoch #%d] batch #%d, train loss:%s' % (epoch_id, batch_id, score))
            if x_batch and batch_id % batch_size == 0:
                features, labels = as_arrays(x_batch, y_batch)
                model.train_on_batch(features, labels)
                x_batch = []
                y_batch = []
            x_batch.append(np.array(movie_ids[:context_window_size]))
            y_batch.append(movie_ids[-1])
        # Train on the samples left over after the last full batch instead of
        # silently discarding them at the end of the epoch.
        if x_batch:
            features, labels = as_arrays(x_batch, y_batch)
            model.train_on_batch(features, labels)
    logger.info('model train done')

    # store word embedding: layer 0 is the Embedding layer of the CBOW model.
    embedding_rows = model.layers[0].get_weights()[0].tolist()
    with open(output_path, 'w') as fwrite:
        for idx, vec in enumerate(embedding_rows):
            fwrite.write('%d %s\n' % (idx, ' '.join(str(value) for value in vec)))
if __name__ == '__main__':
    # Script entry point: train the skip-gram model on (target, context)
    # pairs and dump the learned item embeddings to a text file.

    # network conf
    paradigm = 'Functional'
    min_word_freq = 10
    word_dict = process.get_movie_name_id_dict(min_word_freq=min_word_freq)
    dict_size = len(word_dict)
    emb_size = 100
    context_window_size = 2
    epochs = 50
    batch_size = 256
    model = skipgram_model(dict_size, emb_size, paradigm)

    # range (not xrange) so the loop runs on both Python 2 and Python 3.
    for epoch_id in range(epochs):
        # train by batch
        # NOTE: batch_id counts samples seen in this epoch, not batches.
        batch_id = 0
        x_batch = [[], []]  # [target ids, context ids]
        y_batch = []
        loss_list = []
        # NOTE(review): presumably a buffered shuffle with buffer size 10000;
        # confirm against utils.process.
        reader = process.shuffle(
            skipgram_reader_generator(word_dict, context_window=context_window_size), 10000)
        for movie_ids, label in reader():
            batch_id += 1
            x_batch[0].append(movie_ids[0])
            x_batch[1].append(movie_ids[1])
            y_batch.append(label)
            if batch_id % (batch_size * 1000) == 0:
                # Log the mean loss of the training steps since the last log.
                logger.info('[epoch #%d] batch #%d, train loss:%s' % (epoch_id, batch_id, np.mean(loss_list)))
                loss_list = []
            if batch_id % batch_size == 0:
                X = [np.array(x_batch[0]), np.array(x_batch[1])]
                loss = model.train_on_batch(X, np.array(y_batch))
                loss_list.append(loss)
                x_batch = [[], []]
                y_batch = []
        # Train on the samples left over after the last full batch instead of
        # silently discarding them at the end of the epoch.
        if y_batch:
            X = [np.array(x_batch[0]), np.array(x_batch[1])]
            model.train_on_batch(X, np.array(y_batch))
    logger.info('model train done')

    # store word embedding
    # layers[2] is expected to be the 'shared_embedding' layer of the
    # Functional model (after the two Input layers); this index is not valid
    # for the 'Sequential' paradigm.
    with open('./models/keras_0804_09_skipgram', 'w') as fwrite:
        for idx, vec in enumerate(model.layers[2].get_weights()[0].tolist()):
            fwrite.write('%d %s\n' % (idx, ' '.join(str(value) for value in vec)))