From 3dc3482c925346924ba911dd1add1998a045bbda Mon Sep 17 00:00:00 2001
From: Mithun Das <53312122+das-mithun@users.noreply.github.com>
Date: Mon, 17 Jun 2024 17:05:26 +0530
Subject: [PATCH] Update README.md

---
 Codes/README.md | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/Codes/README.md b/Codes/README.md
index f72eed0..597d734 100644
--- a/Codes/README.md
+++ b/Codes/README.md
@@ -22,7 +22,7 @@
 # To Run the unimodal Vision Based models
 
 6.Vision+lstm_foldWise.py
-7. 3DCNN_withFolds.py
+7.3DCNN_withFolds.py
 
 # To Run the Multimodal Model
 
@@ -30,3 +30,58 @@
 # To extract all the video frames.
 
 frameExtract.py
+
+# To extract the transcripts
+
+The file 'all__video_vosk_audioMap.p' has to be generated with the Vosk speech recognition toolkit (https://alphacephei.com/vosk/). It maps each video name to its transcript, in JSON format as shown below:
+
+{
+ "video_name1": "transcript1",
+ "video_name2": "transcript2",
+ ...
+ "video_nameN": "transcriptN"
+}
+
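+For reference, here is a minimal sketch of how this mapping could be produced with the Vosk Python API. The model directory, the audio folder layout (one 16 kHz mono PCM WAV per video, e.g. extracted with ffmpeg), and the use of pickle for the '.p' file are assumptions and may need to be adapted to the actual pipeline:
+
+```python
+# Sketch: build all__video_vosk_audioMap.p by transcribing one WAV file per video.
+# Assumes the audio has already been extracted from each video (e.g. with ffmpeg)
+# and a Vosk model has been downloaded from https://alphacephei.com/vosk/models.
+import json
+import os
+import pickle
+import wave
+
+from vosk import Model, KaldiRecognizer
+
+MODEL_DIR = "vosk-model-small-en-us-0.15"  # assumed model directory
+AUDIO_DIR = "audio_wav"                    # assumed: one <video_name>.wav per video
+
+model = Model(MODEL_DIR)
+video_transcript_map = {}
+
+for wav_name in sorted(os.listdir(AUDIO_DIR)):
+    if not wav_name.endswith(".wav"):
+        continue
+    video_name = os.path.splitext(wav_name)[0]
+    wf = wave.open(os.path.join(AUDIO_DIR, wav_name), "rb")
+    rec = KaldiRecognizer(model, wf.getframerate())
+
+    # Feed the audio in chunks and collect the recognised text segments.
+    segments = []
+    while True:
+        data = wf.readframes(4000)
+        if len(data) == 0:
+            break
+        if rec.AcceptWaveform(data):
+            segments.append(json.loads(rec.Result()).get("text", ""))
+    segments.append(json.loads(rec.FinalResult()).get("text", ""))
+    wf.close()
+
+    video_transcript_map[video_name] = " ".join(s for s in segments if s)
+
+# The '.p' extension suggests pickle; use json.dump instead if a plain JSON file is expected.
+with open("all__video_vosk_audioMap.p", "wb") as f:
+    pickle.dump(video_transcript_map, f)
+```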