#encoding:utf8
import os
import sys
import math
import time
import numpy as np
import tensorflow as tf
import data_03_utils
import s2s_model
# Hyper-parameters and paths (TensorFlow 1.x tf.app.flags)
tf.app.flags.DEFINE_float('learning_rate', 0.0003, 'learning rate')
tf.app.flags.DEFINE_float('max_gradient_norm', 5.0, 'clip gradients to this maximum norm')
tf.app.flags.DEFINE_float('dropout', 1.0, 'keep probability of each layer output; 1.0 disables dropout, which is appropriate at test time')
tf.app.flags.DEFINE_integer('batch_size', 32, 'batch size for mini-batch gradient descent')
tf.app.flags.DEFINE_integer('size', 512, 'number of units per LSTM layer; lower it (e.g. 100-300) if you run into OOM errors')
tf.app.flags.DEFINE_integer('num_layers', 2, 'number of LSTM layers')
tf.app.flags.DEFINE_integer('num_epoch', 41, 'number of training epochs')
tf.app.flags.DEFINE_integer('num_samples', 512, 'sample size for sampled softmax (same idea as in word2vec; unrelated to the 512 LSTM units above)')
tf.app.flags.DEFINE_integer('num_per_epoch', 10000, 'number of random samples per epoch (samples, not batches)')
# A bucket is a pair (max question length, max answer length); a QA pair is stored in the
# smallest bucket that both of its lengths fit into. Downside: bucketing discards
# conversational context, so this setup is limited to single-turn QA-style dialogue.
tf.app.flags.DEFINE_string('buckets_dir', 'D:\\oo_自然语言处理\\aa_seq2seqModel\\tf_Seq2Seq_Chatbot\\Seq2Seq_chatbot\\bucket_dbs', 'folder containing the sqlite3 bucket databases')
tf.app.flags.DEFINE_string('model_dir', 'D:\\oo_自然语言处理\\aa_seq2seqModel\\tf_Seq2Seq_Chatbot\\Seq2Seq_chatbot\\model', 'directory where the model is saved')
tf.app.flags.DEFINE_string('model_name', 'model3', 'file name the model is saved under')
tf.app.flags.DEFINE_boolean('use_fp16', False, 'use 16-bit floats (default is 32-bit)')
tf.app.flags.DEFINE_integer('bleu', -1, 'number of samples for BLEU evaluation, the standard machine-translation metric; -1 disables it')
tf.app.flags.DEFINE_boolean('test', True, 'run in interactive test mode')
tf.app.flags.DEFINE_string('DICTIONARY_PATH', 'db/dictionary.json', 'path of the dictionary to load')
tf.app.flags.DEFINE_string('train_device', '/gpu:0', 'device to train on')
tf.app.flags.DEFINE_string('test_device', '/cpu:0', 'device to test on')
FLAGS = tf.app.flags.FLAGS
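# Example invocations (illustrative; adjust the Windows paths above to your setup):
#   python s2s.py --test=False    # train (FLAGS.test defaults to True)
#   python s2s.py                 # interactive chat test
#   python s2s.py --bleu=1000     # evaluate BLEU on 1000 random samples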
# Bucket definitions: (max question length, max answer length)
buckets = [(5, 15), (10, 20), (15, 25), (20, 30)]
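# Example: a pair with question length 8 and answer length 18 goes into bucket (10, 20),
# the smallest bucket whose limits exceed both lengths; a pair longer than (20, 30) fits no bucket.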
# Load the dictionary and build the character-to-index / index-to-character mappings
EOS = '<eos>'
UNK = '<unk>'
PAD = '<pad>'
GO = '<go>'
dictDim, dictionary, index_word, word_index = data_03_utils.load_dictionary(FLAGS.DICTIONARY_PATH, EOS, UNK, PAD, GO)
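# A sketch of the expected mappings (illustrative indices only; the real ones come from
# dictionary.json via data_03_utils.load_dictionary):
#   word_index = {'<pad>': 0, '<go>': 1, '<eos>': 2, '<unk>': 3, ...}
#   index_word = {index: word for word, index in word_index.items()}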
def create_model(session, forward_only):
    dtype = tf.float16 if FLAGS.use_fp16 else tf.float32  # fp16 halves storage, but not necessarily compute time: some hardware paths are natively 32-bit
    model = s2s_model.S2SModel(dictDim, dictDim, buckets, FLAGS.size, FLAGS.dropout,
                               FLAGS.num_layers, FLAGS.max_gradient_norm, FLAGS.batch_size,
                               FLAGS.learning_rate, FLAGS.num_samples, forward_only, dtype)
    return model
def train():
"""训练模型"""
print('准备数据')
bucket_dbs = data_03_utils.read_bucket_dbs(FLAGS.buckets_dir,buckets)#
bucket_sizes = []
for i in range(len(buckets)):
bucket_size = bucket_dbs[i].size
bucket_sizes.append(bucket_size)
print('bucket {} 中有数据 {} 条'.format(i, bucket_size))
total_size = sum(bucket_sizes)#列表求和
print('共有数据 {} 条'.format(total_size))
    # Build the model and start training
    with tf.Session() as sess:
        # Build the model
        model = create_model(sess, False)
        print('model is ready!')
        # Initialize variables
        sess.run(tf.global_variables_initializer())
        buckets_scale = [sum(bucket_sizes[:i + 1]) / total_size for i in range(len(bucket_sizes))]  # cumulative fractions of the data, ending at 1.0
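        # Sketch of the sampling scheme used below (illustrative sizes): with
        # bucket_sizes = [100, 300, 400, 200], buckets_scale = [0.1, 0.4, 0.8, 1.0];
        # drawing u ~ Uniform(0, 1) and taking the first index with buckets_scale[i] > u
        # picks each bucket with probability proportional to its share of the data.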
        # Start training
        metrics = ' '.join(['\r[{}]', '{:.1f}%', '{}/{}', 'loss={:.3f}', '{}/{}'])
        bars_max = 20
        with tf.device(FLAGS.train_device):
            for epoch_index in range(1, FLAGS.num_epoch):
                print('Epoch {}:'.format(epoch_index))
                time_start = time.time()
                epoch_trained = 0  # metric 1: samples seen this epoch
                batch_loss = []    # metric 2: per-step losses
                while True:
                    # Randomly pick a bucket to train on, weighted by its share of the data
                    random_number = np.random.random_sample()
                    bucket_id = min([i for i in range(len(buckets_scale)) if buckets_scale[i] > random_number])
                    data, _ = model.get_batch_data(bucket_dbs, bucket_id)  # draw a full batch of raw QA pairs
                    encoder_inputs, decoder_inputs, decoder_weights = model.get_batch(bucket_dbs, bucket_id, data, word_index)  # convert to model inputs
                    _, step_loss, output = model.step(sess, encoder_inputs, decoder_inputs, decoder_weights, bucket_id, False)  # one training step
                    batch_loss.append(step_loss)
                    epoch_trained += FLAGS.batch_size
                    # Render the progress bar
                    time_now = time.time()
                    time_spend = time_now - time_start
                    time_estimate = time_spend / (epoch_trained / FLAGS.num_per_epoch)
                    percent = min(100.0, epoch_trained / FLAGS.num_per_epoch * 100)
                    bars = math.floor(percent / 100 * bars_max)
                    sys.stdout.write(metrics.format(  # the '/' separators come from the '{}/{}' fields of the template above
                        '=' * bars + '-' * (bars_max - bars),
                        percent,
                        epoch_trained, FLAGS.num_per_epoch,
                        np.mean(batch_loss),
                        data_03_utils.time(time_spend), data_03_utils.time(time_estimate)
                    ))
                    sys.stdout.flush()
                    if epoch_trained >= FLAGS.num_per_epoch:  # an epoch covers num_per_epoch samples, then stops
                        break
                print('\n')
                if not os.path.exists(FLAGS.model_dir):
                    os.makedirs(FLAGS.model_dir)
                if epoch_index % 40 == 0:  # checkpoint every 40 epochs
                    model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
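# Note: train() always starts from freshly initialized variables. To resume from the
# checkpoint saved above, restore it before the loop, e.g. (hypothetical resume logic):
#   ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
#   if ckpt and ckpt.model_checkpoint_path:
#       model.saver.restore(sess, ckpt.model_checkpoint_path)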
def test_bleu(count):
"""测试bleu"""
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm
# 准备数据
print('准备数据')
bucket_dbs = data_03_utils.read_bucket_dbs(FLAGS.buckets_dir,buckets)
bucket_sizes = []
for i in range(len(buckets)):
bucket_size = bucket_dbs[i].size
bucket_sizes.append(bucket_size)
print('bucket {} 中有数据 {} 条'.format(i, bucket_size))
total_size = sum(bucket_sizes)
print('共有数据 {} 条'.format(total_size))
# bleu设置0的话,默认对所有样本采样
if count <= 0:
count = total_size
buckets_scale = [
sum(bucket_sizes[:i + 1]) / total_size
for i in range(len(bucket_sizes))
]
with tf.Session() as sess:
        # Build the model
        model = create_model(sess, True)
        model.batch_size = 1
        # Initialize variables, then restore the trained weights
        sess.run(tf.global_variables_initializer())  # tf.initialize_all_variables() is deprecated
        model.saver.restore(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
total_score = 0.0
for i in tqdm(range(count)):
            # Pick a bucket at random, weighted by its share of the data
random_number = np.random.random_sample()
bucket_id = min([
i for i in range(len(buckets_scale))
if buckets_scale[i] > random_number
])
data, _ = model.get_batch_data(bucket_dbs,bucket_id)
encoder_inputs, decoder_inputs, decoder_weights = model.get_batch(bucket_dbs,bucket_id,data,word_index)
_, _, output_logits = model.step(
sess,
encoder_inputs,
decoder_inputs,
decoder_weights,
bucket_id,
True
)
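            # Greedy decoding: at each timestep, emit the vocabulary index with the highest logit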
outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
ask, _ = data[0]
all_answers = bucket_dbs[bucket_id].all_answers(ask)
ret = data_03_utils.indice_sentence(outputs,index_word)
if not ret:
continue
references = [list(x) for x in all_answers]
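            # weights=(1.0,) restricts BLEU to unigram precision (BLEU-1), which is more
            # forgiving for short chatbot replies than the default 4-gram BLEU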
score = sentence_bleu(
references,
list(ret),
weights=(1.0,)
)
total_score += score
        print('BLEU: {:.2f} in {} samples'.format(total_score / count * 10, count))
def test():
    class TestBucket(object):
        def __init__(self, sentence):
            self.sentence = sentence
        def random(self):  # could be extended to split multi-sentence input into a list
            return self.sentence, ''
    with tf.Session() as sess:  # the test loop boils down to three steps: build, restore, step
        # Build the model: an empty skeleton at this point
        model = create_model(sess, True)  # True = forward_only, i.e. inference mode
        model.batch_size = 1
        # Initialize variables, then restore the trained weights
        sess.run(tf.global_variables_initializer())
        model.saver.restore(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Find the smallest bucket the input fits into (the length includes the trailing '\n')
            bucket_id = min([b for b in range(len(buckets)) if buckets[b][0] > len(sentence)])
            # Wrap the input; data receives the (question, answer) pairs, the discarded value the (answer, question) pairs
            data, _ = model.get_batch_data({bucket_id: TestBucket(sentence)}, bucket_id)
            # Convert characters to ids; the encoder input is also reversed here
            encoder_inputs, decoder_inputs, decoder_weights = model.get_batch({bucket_id: TestBucket(sentence)}, bucket_id, data, word_index)
            # One forward pass, then greedy argmax decoding
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, decoder_weights, bucket_id, True)
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            ret = data_03_utils.indice_sentence(outputs, index_word)
            print(ret)  # print the answer
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def main(_):  # tf.app.run() passes argv, unused here
if FLAGS.bleu > -1:
test_bleu(FLAGS.bleu)
elif FLAGS.test:
test()
else:
train()
if __name__ == '__main__':
np.random.seed(0)
# tf.set_random_seed(0)
tf.app.run()
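# Note: this script targets the TensorFlow 1.x API (tf.app.flags, tf.Session). A minimal
# way to run it under TensorFlow 2.x, assuming the model code is otherwise 1.x-compatible:
#   import tensorflow.compat.v1 as tf
#   tf.disable_v2_behavior()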