forked from PaddlePaddle/PaddleSpeech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreduce_data_dir.sh
executable file
·59 lines (47 loc) · 2.22 KB
/
reduce_data_dir.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env bash
# koried, 10/29/2012
# Reduce a data set based on a list of turn-ids
# Command-line handling.
#   $1 srcdir   - source data directory; must contain a utt2spk file
#   $2 turnlist - list file later handed to utils/filter_scp.pl to select
#                 the utt2spk entries to keep
#   $3 destdir  - directory that receives the reduced data set
help_message="usage: $0 srcdir turnlist destdir"

# "${1:-}" keeps the comparison well-formed when the script is started with
# no arguments or with a first argument containing whitespace.
if [[ "${1:-}" == "--help" ]]; then
  echo "${help_message}"
  exit 0
fi

# Exactly three positional arguments are required.
if [[ $# -ne 3 ]]; then
  echo "${help_message}"
  exit 1
fi

srcdir=$1
reclist=$2
destdir=$3

# utt2spk is the file every later filtering step starts from.
if [[ ! -f "${srcdir}/utt2spk" ]]; then
  echo "$0: no such file $srcdir/utt2spk"
  exit 1
fi
#######################################
# Copy the optional data files found in ${srcdir} into ${destdir}, keeping
# only the lines selected by utils/filter_scp.pl against the already reduced
# ${destdir}/utt2spk and ${destdir}/spk2utt.
# Globals:
#   srcdir  (read) - source data directory
#   destdir (read) - destination directory; utt2spk and spk2utt must
#                    already exist there
# Arguments:
#   none
# Outputs:
#   Filtered files under ${destdir}; one summary line on stdout.
#######################################
function do_filtering {
  local name srcutts destutts

  # Files filtered against the reduced utt2spk.
  for name in feats.scp wav.scp text utt2num_frames; do
    if [ -f "${srcdir}/${name}" ]; then
      utils/filter_scp.pl "${destdir}/utt2spk" \
        <"${srcdir}/${name}" >"${destdir}/${name}"
    fi
  done

  # Files filtered against the reduced spk2utt.
  for name in spk2gender cmvn.scp; do
    if [ -f "${srcdir}/${name}" ]; then
      utils/filter_scp.pl "${destdir}/spk2utt" \
        <"${srcdir}/${name}" >"${destdir}/${name}"
    fi
  done

  if [ -f "${srcdir}/segments" ]; then
    utils/filter_scp.pl "${destdir}/utt2spk" \
      <"${srcdir}/segments" >"${destdir}/segments"

    # Unique values of the second column of the kept segments ("recordings").
    awk '{print $2;}' "${destdir}/segments" | sort -u >"${destdir}/reco"

    # With a segments file present, wav.scp is filtered again against the
    # recording list; this deliberately replaces the wav.scp written by the
    # utt2spk-based pass above.
    # NOTE(review): presumably wav.scp is indexed by recording id in this
    # layout - confirm against the data format.
    for name in wav.scp reco2file_and_channel; do
      if [ -f "${srcdir}/${name}" ]; then
        utils/filter_scp.pl "${destdir}/reco" \
          <"${srcdir}/${name}" >"${destdir}/${name}"
      fi
    done

    # Filter the STM file for proper sclite scoring (this will also remove
    # the comment lines).
    if [ -f "${srcdir}/stm" ]; then
      utils/filter_scp.pl "${destdir}/reco" <"${srcdir}/stm" >"${destdir}/stm"
    fi

    # reco is only a scratch list; do not leave it in the output directory.
    rm -f -- "${destdir}/reco"
  fi

  srcutts=$(wc -l <"${srcdir}/utt2spk")
  destutts=$(wc -l <"${destdir}/utt2spk")
  echo "Reduced #utt from $srcutts to $destutts"
}
# Stop early if the output directory cannot be created; every later step
# redirects into it.
mkdir -p -- "${destdir}" || exit 1

# Build the reduced utt2spk by filtering the source utt2spk against the list
# file given on the command line, then derive spk2utt from the result.
utils/filter_scp.pl "${reclist}" <"${srcdir}/utt2spk" >"${destdir}/utt2spk"
utils/utt2spk_to_spk2utt.pl <"${destdir}/utt2spk" >"${destdir}/spk2utt"

# Filter the remaining files of the data directory.
do_filtering