# diarization_debug.py (forked from jianchang512/pyvideotrans)
import os
import librosa
import numpy as np
from pyannote.audio import Pipeline, Model, Inference
from pyannote.core import Segment
from pyannote.core import notebook
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy.spatial.distance import cosine
import random
import pickle
from videotrans.configure import config
from videotrans.util import tools
from speech_role_debug import get_role
# Specify a download directory
# cache_dir = "F:\\huggingface_cache"
# Use cached_path to make sure files are downloaded to the specified directory
# config_url = "https://huggingface.co/pyannote/speaker-diarization-3.0/resolve/main/config.yaml"
# config_path = cached_path(config_url, cache_dir=download_dir)

# Load the speaker diarization pipeline
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.0", use_auth_token='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')
# Load the speaker embedding model
embedding_model = Model.from_pretrained("pyannote/embedding", use_auth_token='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')
# Template describing the shape of a single diarization record
# (shadowed by the local `result` inside get_speaker_result)
result = [{
    'start_time': 0,
    'end_time': 0,
    'duration': 0,
    'speaker_id': 0,
    'gender': 1,  # 1: male, 0: female
    'speaker_role': ''}]
def get_speaker_result(audio_file, output_dir=os.getcwd()):
    # Run speaker segmentation and identification
    diarization = pipeline(audio_file)
    speakers = set()  # speakers detected by diarization
    roles = set()  # dubbing roles matched by voice similarity
    result = []
    embeddings = []
    # Segments shorter than this make the embedding model fail (too few frames for its
    # convolutions). If embeddings are not computed, this could be smaller.
    MIN_DURATION_ON = 0.1
    # Export to an RTTM-like file (simplified, not strict RTTM)
    with open(get_plot_path("speaker.rttm"), "w") as f:
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            record = {}
            start_time = turn.start
            duration = turn.duration
            if duration < MIN_DURATION_ON:
                # Ignore segments shorter than 100 ms
                continue
            segment = Segment(turn.start, turn.end)
            embedding = get_embedding(audio_file, segment)
            role = get_role(embedding)
            # embeddings.append((start_time, embedding))
            # gender = get_gender_from_segment(audio_file, segment)
            end_time = start_time + duration
            speakers.add(speaker)
            roles.add(role)
            start_time_milliseconds = int(start_time * 1000)
            end_time_milliseconds = int(end_time * 1000)
            record['start_time'] = start_time_milliseconds
            record['end_time'] = end_time_milliseconds
            record['duration'] = duration
            record['speaker_id'] = speaker
            record['speaker_role'] = role
            result.append(record)
            data = f"SPEAKER 1 {start_time:.3f} {duration:.3f} {role} {speaker}\n"
            print(data)
            f.write(data)
    tools.set_process(f"Number of speakers detected: {len(speakers)}")
    tools.set_process(f"Number of dubbing roles matched: {len(roles)}")
    show_speaker_plot(diarization, output_dir)
    # show_embedding_similar(embeddings)
    return result
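# Roughly what one entry of the returned list looks like (illustrative values only;
# actual speaker labels, role strings and timings depend on the pipeline and get_role):
#   [{'start_time': 1250, 'end_time': 4380, 'duration': 3.13,
#     'speaker_id': 'SPEAKER_00', 'speaker_role': 'en-US-GuyNeural'}]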
def get_speaker_count(speaker_result):
    return len(set(item['speaker_id'] for item in speaker_result))
# Visualize the diarization result
def show_speaker_plot(diarization, output_dir):
    # Plot the annotation timeline and save it
    notebook.plot_annotation(diarization, time=True)
    plt.savefig(get_plot_path("speaker_plot.png"))
# All debug artifacts are saved into the plot_debug directory
def get_plot_path(file_name):
    plot_dir = os.path.join(os.getcwd(), 'plot_debug')
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)
    return os.path.join(plot_dir, file_name)
# Visualize the subtitle file
def show_subtitle_plot(sub_list, output_dir=os.getcwd()):
    # Parse the data for plotting
    times = []
    durations = []
    texts = []
    colors = ['blue', 'green', 'red', 'purple', 'orange']  # color list, can be extended
    for i, it in enumerate(sub_list):
        times.append(it['start_time'])  # start time of the subtitle line
        durations.append(it['end_time'] - it['start_time'])
        texts.append(it['line'])
    # Draw
    fig, ax = plt.subplots(figsize=(10, 2))
    for i, (time, duration, text) in enumerate(zip(times, durations, texts)):
        ax.add_patch(mpatches.Rectangle((time, 0), duration, 1, facecolor=colors[i % len(colors)]))
        # Add the label
        ax.text(time + duration / 2, 0.8, text, ha='center', va='center')
    ax.set_yticks([])  # hide y-axis ticks
    ax.set_xlabel('Time (seconds)')
    plt.title('Subtitle Visualization')
    # Right limit is the latest end time (start + duration), not max(times + durations)
    plt.xlim(left=min(times), right=max(t + d for t, d in zip(times, durations)))
    # plt.show()
    plt.savefig(get_plot_path("subtitle_plot.png"))
def show_line_role_plot(line_roles, output_dir=os.getcwd()):
    # Parse the data for plotting
    times = []
    durations = []
    texts = []
    colors = ['blue', 'green', 'red', 'purple', 'orange']  # color list
    speaker_set = set()
    for key in line_roles.keys():
        if not isinstance(key, str):
            continue
        it = line_roles[key]
        times.append(it['start_time'])  # start time
        durations.append(it['end_time'] - it['start_time'])
        texts.append(it['speaker_id'])
        speaker_set.add(it['speaker_id'])
    speaker_list = list(speaker_set)
    # Draw
    fig, ax = plt.subplots(figsize=(10, 2))
    for i, (time, duration, text) in enumerate(zip(times, durations, texts)):
        # Position of this speaker_id in the list (list.index raises if missing)
        spk_index = speaker_list.index(text)
        ax.add_patch(mpatches.Rectangle((time, 0), duration, 1, facecolor=colors[spk_index % len(colors)]))
        # Add the label
        ax.text(time + duration / 2, 0.8, text.split('_')[-1], ha='center', va='center')
    ax.set_yticks([])  # hide y-axis ticks
    ax.set_xlabel('Time (seconds)')
    plt.title('Line Role Visualization')
    plt.xlim(left=min(times), right=max(t + d for t, d in zip(times, durations)))
    # plt.show()
    plt.savefig(get_plot_path("line_role_plot.png"))
# Same as show_line_role_plot, but keyed by speaker_role instead of speaker_id
def show_line_role_plot_new(line_roles, output_dir=os.getcwd()):
    # Parse the data for plotting
    times = []
    durations = []
    texts = []
    colors = ['blue', 'green', 'red', 'purple', 'orange']  # color list
    speaker_set = set()
    for key in line_roles.keys():
        if not isinstance(key, str):
            continue
        it = line_roles[key]
        times.append(it['start_time'])  # start time
        durations.append(it['end_time'] - it['start_time'])
        texts.append(it['speaker_role'])
        speaker_set.add(it['speaker_role'])
    speaker_list = list(speaker_set)
    # Draw
    fig, ax = plt.subplots(figsize=(10, 2))
    for i, (time, duration, text) in enumerate(zip(times, durations, texts)):
        # Position of this role in the list (list.index raises if missing)
        spk_index = speaker_list.index(text)
        ax.add_patch(mpatches.Rectangle((time, 0), duration, 1, facecolor=colors[spk_index % len(colors)]))
        # Add the label
        ax.text(time + duration / 2, 0.8, text.split('_')[-1], ha='center', va='center')
    ax.set_yticks([])  # hide y-axis ticks
    ax.set_xlabel('Time (seconds)')
    plt.title('Line Role Visualization')
    plt.xlim(left=min(times), right=max(t + d for t, d in zip(times, durations)))
    # plt.show()
    plt.savefig(get_plot_path("line_role_plot_new.png"))
# Extract the fundamental frequency of a segment and infer the speaker's gender
def get_gender_from_segment(audio_file, segment):
    y, sr = librosa.load(audio_file, sr=None, offset=segment.start, duration=segment.duration)
    f0, voiced_flag, _ = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
    mean_f0 = np.mean(f0[voiced_flag])
    # Rough cutoff: a mean F0 above ~160 Hz is treated as female (0), otherwise male (1)
    male_threshold = 160
    if mean_f0 > male_threshold:
        return 0
    return 1
def aggregate_by_speaker(result):
    if not result:  # empty list, return immediately
        return []
    aggregated_result = []
    current_group = result[0]  # initialize the current group with the first element
    line = 1  # subtitle lines start at 1, so start counting at 1 here too
    tools.set_process("Aggregating by speaker")
    for item in result[1:]:
        # If this item's speaker_id differs from the current group's, close the group and start a new one
        if item['speaker_id'] != current_group['speaker_id']:
            aggregated_result.append(current_group)  # add the current group to the result list
            # Start a new group
            current_group = item
            # tools.set_process(f"{line} {current_group['speaker_id']} :{current_group['start_time']}->{current_group['end_time']}")
            line += 1
        else:
            # Same speaker_id: extend the current group's end_time
            current_group['end_time'] = item['end_time']
            # Accumulate duration; only spoken time is counted, gaps in between are not
            current_group['duration'] += item['duration']
    # Add the last group to the result list
    aggregated_result.append(current_group)
    for index, group in enumerate(aggregated_result):
        tools.set_process(f"{index+1} {group['speaker_id']} :{group['start_time']}->{group['end_time']}")
    return aggregated_result
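# Illustration (hypothetical records, start/end in ms, duration in seconds): consecutive
# turns from the same speaker are merged into one group, and only spoken time is summed:
#   aggregate_by_speaker([
#       {'start_time': 0,    'end_time': 900,  'duration': 0.9, 'speaker_id': 'SPEAKER_00'},
#       {'start_time': 1200, 'end_time': 2000, 'duration': 0.8, 'speaker_id': 'SPEAKER_00'},
#       {'start_time': 2500, 'end_time': 3000, 'duration': 0.5, 'speaker_id': 'SPEAKER_01'},
#   ])
#   # -> [{'start_time': 0, 'end_time': 2000, 'duration': 1.7, 'speaker_id': 'SPEAKER_00'},
#   #     {'start_time': 2500, 'end_time': 3000, 'duration': 0.5, 'speaker_id': 'SPEAKER_01'}]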
def define_line_roles(sub_list, speaker_result, role_list, default_role=None):
    # Note: this definition is superseded by the embedding-based define_line_roles
    # below, which reuses the same name.
    # None of the inputs may be empty
    if sub_list is None or speaker_result is None or not role_list:
        return
    # line_roles is a dict
    line_roles = {}
    # Maximum allowed gap in milliseconds
    max_gap = 200
    # Aggregate consecutive turns by speaker first
    speaker_result = aggregate_by_speaker(speaker_result)
    # Collect all distinct speaker IDs
    unique_speakers = set(item['speaker_id'] for item in speaker_result)
    # Build a random mapping from speaker ID to dubbing role
    speaker_to_role_mapping = {}
    for speaker in unique_speakers:
        # Pick a random role from role_list for this speaker
        while True:
            role = random.choice(list(role_list['en']))
            if role != "No":
                break
        speaker_to_role_mapping[speaker] = fit_edge_role(role, default_role)
    # Index into the speaker_result list
    speaker_index = 0
    tools.set_process("Matching subtitles to speakers")
    for it in sub_list:
        # Use the role matched for the current turn, otherwise keep the previous one
        sub_start_time, sub_end_time, line = it['start_time'], it['end_time'], it['line']
        speaker_obj = speaker_result[speaker_index]
        speaker_start_time, speaker_end_time, speaker_id = speaker_obj['start_time'], speaker_obj['end_time'], speaker_obj['speaker_id']
        # Role matched for the current subtitle line
        role = speaker_to_role_mapping.get(speaker_id, default_role)
        line_roles[line] = role
        # Extra info used by the plot helpers
        line_role_obj = {}
        line_role_obj['role'] = role
        line_role_obj['speaker_id'] = speaker_id
        line_role_obj['start_time'] = sub_start_time
        line_role_obj['end_time'] = sub_end_time
        line_roles[str(line)] = line_role_obj
        tools.set_process(f"{line} {role} :{sub_start_time}->{sub_end_time}")
        if sub_end_time > speaker_end_time - max_gap:
            # Move on to the next speaker turn, clamping so the index stays in range
            speaker_index = min(speaker_index + 1, len(speaker_result) - 1)
        # if speaker_start_time - max_gap <= sub_start_time <= sub_end_time <= speaker_end_time + max_gap:
        #     # The subtitle lies within the dubbing time range
        #     continue
        # elif sub_end_time > speaker_end_time + max_gap:
        #     # The end is still within range
        #     continue
        # else:
        #     # Out of range: move the index and look at the next turn
        #     speaker_index += 1
        # if sub_start_time - speaker_start_time <= max_gap or speaker_start_time - sub_start_time <= max_gap:
        #     # Start points are close: match succeeded, move to the next speaker turn
        #     speaker_index += 1
        # else:
        #     # The speaker turn does not start near this subtitle, so check the end point
        #     if speaker_end_time - sub_end_time <= max_gap:
        #         # The end does not overshoot the subtitle by much: match succeeded, move on
        #         speaker_index += 1
        #     else:
        #         # The end overshoots the subtitle too much: this turn was not matched, skip it
        #         continue
    return line_roles
def define_line_roles(sub_list, audio_file):
    # The subtitle list must not be empty
    if sub_list is None:
        return
    # line_roles is a dict
    line_roles = {}
    tools.set_process("Matching subtitles to speakers")
    for it in sub_list:
        sub_start_time, sub_end_time, line = it['start_time'], it['end_time'], it['line']
        # Subtitle times are in milliseconds; Segment expects seconds
        segment = Segment(sub_start_time / 1000, sub_end_time / 1000)
        embedding = get_embedding(audio_file, segment)
        role = get_role(embedding)
        line_roles[line] = role
        # Extra info used by the plot helpers
        line_role_obj = {}
        line_role_obj['speaker_role'] = role
        line_role_obj['start_time'] = sub_start_time
        line_role_obj['end_time'] = sub_end_time
        line_roles[str(line)] = line_role_obj
        tools.set_process(f"{line} {role} :{sub_start_time}->{sub_end_time}")
    return line_roles
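# Roughly what the returned mapping looks like (illustrative values only; the actual
# role string comes from get_role): each line number appears twice, once as an int
# mapped to the role name and once as a str mapped to the detail dict the plot
# helpers consume:
#   {1: 'en-US-GuyNeural',
#    '1': {'speaker_role': 'en-US-GuyNeural', 'start_time': 0, 'end_time': 1500}}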
def fit_edge_role(role_item, default_role):
    return role_item if role_item != "No" else default_role
    # return role_item[1] if role_item is not None else default_role
# Extract the embedding of an audio segment
# (note: a new Inference object is created on every call; it could be hoisted to module level)
def get_embedding(audio_path, segment):
    inference = Inference(embedding_model, window="whole")
    embedding = inference.crop(audio_path, segment)
    return embedding
# Cosine similarity between two embeddings (1 means identical direction)
def cosine_similarity(embedding1, embedding2):
    return 1 - cosine(embedding1, embedding2)
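# Example usage (hypothetical segment times; actual scores depend on the audio):
# embeddings of the same speaker usually score noticeably higher than those of
# different speakers:
#   emb_a = get_embedding("vocal.wav", Segment(0.0, 2.0))
#   emb_b = get_embedding("vocal.wav", Segment(10.0, 12.0))
#   print(cosine_similarity(emb_a, emb_b))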
def show_embedding_similar(embeddings):
    # Compute and print the embedding similarity between every pair of turns
    for i in range(len(embeddings)):
        for j in range(i + 1, len(embeddings)):
            start_time1, embedding1 = embeddings[i]
            start_time2, embedding2 = embeddings[j]
            similarity = cosine_similarity(embedding1, embedding2)
            print(f"Similarity between {start_time1} and {start_time2}: {similarity}")
if __name__ == '__main__':
    config.params['detail_log'] = True
    # Load the audio file and the pickled subtitle list
    audio_file = "F:\\Project\\test\\101\\vocal.wav"
    pickle_file = "F:\\Project\\test\\101\\subs_data.pickle"
    # sr = get_speaker_result(audio_file)
    with open(pickle_file, 'rb') as handle:
        subs = pickle.load(handle)
    # role_list = tools.get_edge_rolelist()
    line_roles = define_line_roles(subs, audio_file)
    show_subtitle_plot(subs)
    show_line_role_plot_new(line_roles)