-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_transcriptions_orig.py
75 lines (66 loc) · 2.49 KB
/
get_transcriptions_orig.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import re
import os
import pandas as pd
allTranscripts = open('data/words_94.mlf')
all_transcripts_94 = allTranscripts.read()
all_transcripts_94 = all_transcripts_94.split('\n')
allTranscripts_ = open('data/words_97.mlf')
all_transcripts_97 = allTranscripts_.read()
all_transcripts_97 = all_transcripts_97.split('\n')
allTranscripts_ = open('data/words_226.mlf')
all_transcripts_226 = allTranscripts_.read()
all_transcripts_226 = all_transcripts_226.split('\n')
##get transcription for every audio file.
def per_audio_transcript(fileid,module_id):
trans =[]
flag = 0
all_transcripts=[]
if (int(module_id) == 94): ##checks the module. Should be either 94/97. If not, return empty trans.
all_transcripts = all_transcripts_94
elif (int(module_id) == 97):
all_transcripts = all_transcripts_97
else:
if (int(module_id) == 226):
all_transcripts = all_transcripts_226
for t in all_transcripts:
if (flag==0):
m = re.search(fileid,t)
if (m is not None):
flag = 1
else:
if (t!='.'):
trans.append(t)
else:
flag = 0
trans = trans[1:-1]
trans = " ".join(trans)
return trans
##gives the csv dump (candidate id, audio name, transcription)
def transcript_dump(full_folder_path):
candidate_id = []
file_tag = []
transcript=[]
for path,dirs,files in os.walk(full_folder_path):
for d in dirs:
print(d)
dir_name = full_folder_path+"/"+d
print(dir_name)
for f in os.listdir(dir_name):
if (f.endswith(".wav")):
file_id = f[:-3]
temp_module = file_id.split("_")
module_id = temp_module[1]
pattern = re.compile('.'+file_id+'.')
string='"*/'+file_id+'lab"'
trans = per_audio_transcript(string,module_id)
if (trans!=''):
candidate_id.append(d)
file_tag.append(f)
transcript.append(trans)
else:
print("candidate_id: " + d + ", audio file: " + f + ": Not found : Free speech")
all_details = pd.DataFrame({'candidate_id':candidate_id,'file_tag':file_tag,'transcript':transcript})
all_details.to_csv('sample.csv')
##change this.
###give full path to the folder which has all candidates.
transcript_dump('data/rslr-test')