-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspeechbrain_lid_wrapper.py
193 lines (151 loc) · 6.88 KB
/
speechbrain_lid_wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import os
import torch
from sklearn.utils.extmath import softmax
from speechbrain.pretrained import EncoderClassifier
class EncoderClassifierAllLangs(EncoderClassifier):
    """Language-ID classifier that stores embeddings and reports Czech/English scores.

    Adds two entry points on top of ``EncoderClassifier``:
    ``classify_file`` (audio file -> embedding saved to disk -> scores) and
    ``classify_emb`` (previously saved embedding -> scores).
    """

    # Positions of the Czech and English classes in the classifier output.
    # NOTE(review): hard-coded; presumably they match the label encoder of the
    # pretrained model in use -- confirm before switching to another model.
    _CZECH_INDEX = 23
    _ENGLISH_INDEX = 24

    def save_emb(self, emb, path, emb_ext):
        """Save an embedding to the directory mirroring the audio directory.

        The target directory is the audio file's directory with every
        occurrence of ``samples_16kHz`` replaced by ``embeddings_lid``. If the
        substring is absent, the embedding lands next to the audio file.

        Arguments
        ---------
        emb : torch.Tensor
            Embedding to store.
        path : str
            Path of the audio file the embedding was computed from.
        emb_ext : str
            Extension (including the dot) of the embedding file.
        """
        wav_dir, basename = os.path.split(path)
        emb_dir = wav_dir.replace('samples_16kHz', 'embeddings_lid')
        # torch.save does not create missing directories; an empty emb_dir
        # means "current directory" and must not be passed to makedirs.
        if emb_dir:
            os.makedirs(emb_dir, exist_ok=True)
        torch.save(emb, self.get_emb_path(emb_dir, basename, emb_ext))

    def get_emb_path(self, emb_dir, basename, emb_ext):
        """Return ``emb_dir`` joined with ``basename`` whose extension is
        replaced by ``emb_ext``."""
        root, _ = os.path.splitext(basename)
        return os.path.join(emb_dir, root + emb_ext)

    def _score_embedding(self, emb, use_softmax, verbose=False):
        """Run the classifier head on one embedding and summarise the scores.

        Shared by ``classify_file`` and ``classify_emb`` so both report scores
        in exactly the same way. Expects a batch of a single item, because the
        best score is converted with ``.item()``.

        Returns the tuple ``(czech_score, english_score, best_score,
        text_lab)``; the three scores are rounded to 4 decimals.
        """
        out_prob = self.modules.classifier(emb).squeeze(1)
        if use_softmax:
            # Native softmax over the class axis: stays on the tensor's device
            # and avoids a NumPy round trip.
            out_prob = torch.softmax(out_prob, dim=-1)
        if verbose:
            for value in out_prob[0]:
                print(value.item())
        # Scores for Czech and English and for the best-matching language:
        czech_score = round(out_prob[0][self._CZECH_INDEX].item(), ndigits=4)
        english_score = round(out_prob[0][self._ENGLISH_INDEX].item(), ndigits=4)
        best_score, index = torch.max(out_prob, dim=-1)
        best_score = round(best_score.item(), ndigits=4)
        text_lab = self.hparams.label_encoder.decode_torch(index)
        return czech_score, english_score, best_score, text_lab

    def classify_file(self, path, emb_ext, use_softmax=True):
        """Classify an audio file and save its embedding as a side effect.

        Arguments
        ---------
        path : str
            Path to audio file to classify.
        emb_ext : str
            Extension (including the dot) used for the saved embedding file.
        use_softmax : bool
            If True, the classifier outputs are normalised with softmax before
            the scores are read; otherwise the raw outputs are used (the
            original documentation describes them as log posteriors --
            not verified here).

        Returns
        -------
        czech_score
            Score of class Czech, rounded to 4 decimals.
        english_score
            Score of class English, rounded to 4 decimals.
        best_score
            Score of the best class, rounded to 4 decimals.
        text_lab
            List with the text label decoded from the index of the best class
            (label encoder should be provided).
        """
        waveform = self.load_audio(path)
        # Fake a batch:
        batch = waveform.unsqueeze(0)
        rel_length = torch.tensor([1.0])
        emb = self.encode_batch(batch, rel_length)
        self.save_emb(emb, path, emb_ext)
        return self._score_embedding(emb, use_softmax)

    def classify_emb(self, path, use_softmax=True, *, verbose=False):
        """Classify a previously saved embedding file.

        Arguments
        ---------
        path : str
            Path to embedding file to classify.
        use_softmax : bool
            If True, the classifier outputs are normalised with softmax before
            the scores are read; otherwise the raw outputs are used (the
            original documentation describes them as log posteriors --
            not verified here).
        verbose : bool
            If True, print the score of every class to stdout (debugging aid).

        Returns
        -------
        czech_score
            Score of class Czech, rounded to 4 decimals.
        english_score
            Score of class English, rounded to 4 decimals.
        best_score
            Score of the best class, rounded to 4 decimals.
        text_lab
            List with the text label decoded from the index of the best class
            (label encoder should be provided).
        """
        # NOTE: torch.load unpickles the file; only load embeddings that come
        # from a trusted source (e.g. written by save_emb).
        emb = torch.load(path)
        return self._score_embedding(emb, use_softmax, verbose=verbose)
# Module-level classifier instance used by the score generators in this file.
# It is built as soon as the module is imported or run.
classifier = EncoderClassifierAllLangs.from_hparams(
    source="speechbrain/lang-id-commonlanguage_ecapa",
    savedir="pretrained_models/lang-id-commonlanguage_ecapa",
)
def get_emb_path(emb_dir, basename, emb_ext):
    """Return emb_dir joined with basename, its extension replaced by emb_ext"""
    stem = os.path.splitext(basename)[0]
    return os.path.join(emb_dir, stem + emb_ext)
def path(data_dir, file_name):
    """Join data_dir and file_name into one path"""
    full_path = os.path.join(data_dir, file_name)
    return full_path
def ext(file_name: str) -> str:
    """Return the extension of file_name (leading dot included, '' if none)"""
    _, suffix = os.path.splitext(file_name)
    return suffix
def get_file_list(data_dir: str, extension: str) -> list:
    """Return a sorted list of file paths in data_dir with the given extension

    Args:
        data_dir: path of the data directory
        extension: extension (including the dot) for filtering the files

    Returns: list of paths, each joined with data_dir, sorted by file name
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of the generated score lines reproducible between runs.
    return [
        os.path.join(data_dir, name)
        for name in sorted(os.listdir(data_dir))
        if os.path.splitext(name)[1] == extension
    ]
def generate_scores_from_wavs(wav_list, emb_ext, use_softmax=True):
    """Yield one tab-separated score line per audio file

    Every file is passed to classifier.classify_file, which also saves the
    computed embedding with the extension emb_ext.

    Args:
        wav_list: iterable of paths to audio files
        emb_ext: extension of the file for saving the embedding
        use_softmax: True if probabilities are to be transformed by softmax

    Yields:
        Lines (terminated by a newline) with the file's base name, the scores
        for Czech and English, the score of the best-matching language and
        its name.
    """
    for wav_path in wav_list:
        scores = classifier.classify_file(wav_path, emb_ext, use_softmax=use_softmax)
        czech, english, best, labels = scores
        fields = [
            os.path.basename(wav_path),
            str(czech),
            str(english),
            str(best),
            labels[0],
        ]
        yield '\t'.join(fields) + '\n'
def generate_scores_from_embs(emb_list, use_softmax=True):
    """Yield one tab-separated score line per saved embedding file

    Args:
        emb_list: iterable of paths to embedding files
        use_softmax: True if probabilities are to be transformed by softmax

    Yields:
        Lines (terminated by a newline) with the file's base name, the scores
        for Czech and English, the score of the best-matching language and
        its name.
    """
    for emb_path in emb_list:
        scores = classifier.classify_emb(emb_path, use_softmax=use_softmax)
        czech, english, best, labels = scores
        fields = [
            os.path.basename(emb_path),
            str(czech),
            str(english),
            str(best),
            labels[0],
        ]
        yield '\t'.join(fields) + '\n'
if __name__ == '__main__':
    # The script can either compute embeddings from audio files or load them
    # from files saved in a previous run:
    LOAD_EMBEDDINGS = False
    # I considered transforming the probabilities with softmax, but then
    # decided to use the original output of the EncoderClassifier:
    USE_SOFTMAX = False
    # os.makedirs / os.listdir / open do not expand '~'; without expanduser
    # a literal '~' directory would be created and used in the working dir.
    emb_dir = os.path.expanduser('~/embeddings_lid')
    emb_ext = '.lid'
    wav_dir = os.path.expanduser('~/samples_16kHz')
    wav_ext = '.wav'
    os.makedirs(emb_dir, exist_ok=True)
    if LOAD_EMBEDDINGS:  # Load embeddings
        emb_list = get_file_list(emb_dir, emb_ext)
        score_gen = generate_scores_from_embs(emb_list, use_softmax=USE_SOFTMAX)
        score_file = os.path.join(emb_dir, 'speechbrain_lid_from_emb.sco')
    else:  # Compute embeddings
        wav_list = get_file_list(wav_dir, wav_ext)
        score_gen = generate_scores_from_wavs(wav_list, emb_ext, use_softmax=USE_SOFTMAX)
        score_file = os.path.join(emb_dir, 'speechbrain_lid_from_wav.sco')
    # score_gen is lazy: the files are classified while the lines are written.
    with open(score_file, 'w') as out_file:
        out_file.writelines(score_gen)
    print('Writing scores to file', score_file)