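"""Analyze a speech track: parse its text into a JSON transcript, force-align
the transcript to the audio with p2fa-vislab, run breath detection, and
generate a waveform JSON for each speaker with wav2json."""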
import os
import subprocess

import ujson as json
import click
from radiotool import composer as C


def speaker_wav(wav_fn, alignment_path, speaker):
    """Build a wav containing only the given speaker's words, keeping each
    word at its original timing, and return the path to the new file."""
    basename = ".".join(os.path.basename(wav_fn).split('.')[:-1])
    with open(alignment_path, 'r') as f:
        alignment = json.load(f)["words"]

    c = C.Composition(channels=1)
    t = C.Track(wav_fn, speaker)
    c.add_track(t)

    start = None
    end = None
    for word in alignment:
        if "speaker" in word:
            if word["speaker"] == speaker:
                # Start or extend the current run of this speaker's words.
                if start is None:
                    start = word["start"]
                end = word["end"]
            else:
                # Another speaker: close out the current run, if any.
                if start is not None:
                    seg = C.Segment(t, start, start, end - start)
                    c.add_segment(seg)
                start = None
                end = None
    # Close out a final run that reaches the end of the alignment.
    if start is not None:
        seg = C.Segment(t, start, start, end - start)
        c.add_segment(seg)

    out_fn = "static/speechtracks/%s-%s" % (basename, speaker)
    c.export(
        min_length=t.duration,
        filename=out_fn,
        channels=1,
        filetype='wav',
        samplerate=t.samplerate,
        separate_tracks=False)
    return out_fn + ".wav"
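
# Illustrative sketch of the alignment JSON consumed by speaker_wav() above.
# The real files are produced by p2fa-vislab's align.py; only the keys the
# function reads ("speaker", "start", "end", in seconds) are shown, and the
# values are made up:
#
#   {"words": [
#       {"speaker": "ALICE", "start": 0.32, "end": 0.57},
#       {"speaker": "BOB",   "start": 0.61, "end": 0.90}
#   ]}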


def analyze_speech(mp3_path, text_path, name, force=False):
    transcript_path = os.path.splitext(text_path)[0] + '.transcript'
    alignment_path = os.path.splitext(text_path)[0] + '.json'
    wav_path = os.path.splitext(mp3_path)[0] + '.wav'

    # Text to transcript.
    subprocess.call(
        "python utilities/transcript_parser.py {} {}".format(
            text_path, transcript_path), shell=True)

    # If the parser produced an empty transcript, build a single-speaker
    # transcript instead, prompting for the speaker's name.
    with open(transcript_path, 'r') as trf:
        if len(json.load(trf)) == 0:
            speaker_name = raw_input("Enter the name of the speaker: ")
            subprocess.call(
                "python p2fa-vislab/text_to_transcript.py {} --output-file {} --speaker-name \"{}\"".format(
                    text_path, transcript_path, speaker_name), shell=True)

    # Alignment: decode the mp3 to wav, then force-align it to the transcript.
    if not os.path.isfile(alignment_path) or force:
        subprocess.call("lame --decode {}".format(mp3_path), shell=True)
        os.chdir('p2fa-vislab')
        subprocess.call("python align.py ../{} ../{} ../{} --json --phonemes".format(
            wav_path, transcript_path, alignment_path), shell=True)

        # Breath detection: annotate the alignment with detected breaths.
        subprocess.call("python detect_breaths.py ../{} ../{}".format(
            wav_path, alignment_path), shell=True)
        os.chdir('..')

    # wav2json: render a waveform JSON for each speaker's isolated track.
    speakers = set()
    with open(transcript_path, 'r') as trf:
        transcript = json.load(trf)
        for line in transcript:
            speakers.add(line["speaker"])

    for speaker in speakers:
        speaker_wav_path = speaker_wav(wav_path, alignment_path, speaker)
        speaker_waveform_path = os.path.join(
            os.path.split(mp3_path)[0], 'wfData/{}-{}.wav.json'.format(name, speaker))
        subprocess.call("wav2json -p 2 -s 10000 --channels mid -n -o \"{}\" \"{}\"".format(
            speaker_waveform_path, speaker_wav_path), shell=True)
        os.remove(speaker_wav_path)
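
# Illustrative transcript shape read by analyze_speech() above: a JSON list
# with one entry per line of dialogue. Only the "speaker" key is relied on
# here; any other keys (e.g. the spoken text) are hypothetical:
#
#   [{"speaker": "ALICE", "line": "Hello there."},
#    {"speaker": "BOB", "line": "Hi."}]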


@click.command()
@click.argument('name')
@click.option('--force/--no-force', default=False,
              help="Re-align the text/audio even if the alignment already exists")
def click_analyze_speech(name, force):
    mp3_path = "static/speechtracks/{}.mp3".format(name)
    text_path = "static/speechtracks/{}.txt".format(name)
    analyze_speech(mp3_path, text_path, name, force=force)


if __name__ == '__main__':
    click_analyze_speech()
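
# Example invocation ("interview" is a hypothetical track name; the script
# expects static/speechtracks/interview.mp3 and static/speechtracks/interview.txt):
#
#   python analyze_speech.py interview
#   python analyze_speech.py interview --force   # redo an existing alignment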