
Commit

Translation: accept multiple language code formats. Refactor timeline processing. Fix issue with whisper.cpp not getting word offsets.
rotemdan committed May 2, 2024
1 parent 61c1823 commit e0b3075
Showing 1 changed file with 26 additions and 21 deletions.
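Note on the language-identifier part of the change: the diff below routes options.sourceLanguage through parseLangIdentifier and options.targetLanguage through normalizeIdentifierToLangaugeCode (both imported from '../utilities/Locale.js') instead of calling normalizeLanguageCode directly. Those helpers are not part of this commit, so the snippet below is only a standalone sketch of the general idea (resolving several identifier formats to one canonical short code), with a made-up table, field names, and function name; it is not the actual Locale.js implementation.

    // Standalone illustration only; not the implementation in '../utilities/Locale.js'.
    type LangEntrySketch = { code: string, threeLetterCode: string, englishName: string }

    const langTableSketch: LangEntrySketch[] = [
        { code: 'en', threeLetterCode: 'eng', englishName: 'English' },
        { code: 'he', threeLetterCode: 'heb', englishName: 'Hebrew' },
    ]

    function normalizeLangIdentifierSketch(identifier: string): string {
        const lowered = identifier.trim().toLowerCase()

        // Region-qualified tags like 'en-US' reduce to their primary subtag
        const primarySubtag = lowered.split('-')[0]

        const match = langTableSketch.find(entry =>
            entry.code === primarySubtag ||
            entry.threeLetterCode === primarySubtag ||
            entry.englishName.toLowerCase() === lowered)

        if (!match) {
            throw new Error(`Unrecognized language identifier: '${identifier}'`)
        }

        return match.code
    }

    // 'en', 'en-US', 'eng' and 'English' would all resolve to 'en' here.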
47 changes: 26 additions & 21 deletions src/api/Translation.ts
@@ -6,7 +6,7 @@ import { Logger } from '../utilities/Logger.js'

 import { Timeline, addWordTextOffsetsToTimeline, wordTimelineToSegmentSentenceTimeline } from '../utilities/Timeline.js'
 import { type WhisperOptions } from '../recognition/WhisperSTT.js'
-import { formatLanguageCodeWithName, getShortLanguageCode, normalizeLanguageCode } from '../utilities/Locale.js'
+import { formatLanguageCodeWithName, getShortLanguageCode, normalizeIdentifierToLangaugeCode, parseLangIdentifier } from '../utilities/Locale.js'
 import { EngineMetadata } from './Common.js'
 import { type SpeechLanguageDetectionOptions, detectSpeechLanguage } from './API.js'
 import chalk from 'chalk'
@@ -62,28 +62,31 @@ export async function translateSpeech(input: AudioSourceParam, options: SpeechTr
 	sourceRawAudio = normalizeAudioLevel(sourceRawAudio)
 	sourceRawAudio.audioChannels[0] = trimAudioEnd(sourceRawAudio.audioChannels[0])

-	if (!options.sourceLanguage) {
-		logger.start('No source language specified. Detecting speech language')
-		const { detectedLanguage } = await detectSpeechLanguage(sourceRawAudio, options.languageDetection || {})
+	if (options.sourceLanguage) {
+		const languageData = await parseLangIdentifier(options.sourceLanguage)

-		logger.end()
-		logger.logTitledMessage('Source language detected', formatLanguageCodeWithName(detectedLanguage))
+		options.sourceLanguage = languageData.Name

-		options.sourceLanguage = detectedLanguage
-	} else {
 		logger.end()
+		logger.logTitledMessage('Source language specified', formatLanguageCodeWithName(options.sourceLanguage))
+	} else {
+		logger.start('No source language specified. Detecting speech language')
+		const { detectedLanguage } = await detectSpeechLanguage(sourceRawAudio, options.languageDetection || {})

-		const specifiedLanguageFormatted = formatLanguageCodeWithName(getShortLanguageCode(normalizeLanguageCode(options.sourceLanguage)))
+		options.sourceLanguage = detectedLanguage

-		logger.logTitledMessage('Source language', specifiedLanguageFormatted)
+		logger.end()
+		logger.logTitledMessage('Source language detected', formatLanguageCodeWithName(detectedLanguage))
 	}

-	logger.logTitledMessage('Target language', formatLanguageCodeWithName(getShortLanguageCode(normalizeLanguageCode(options.targetLanguage!))))
+	options.targetLanguage = await normalizeIdentifierToLangaugeCode(options.targetLanguage!)

+	logger.logTitledMessage('Target language', formatLanguageCodeWithName(options.targetLanguage))
+
 	logger.start('Preprocess audio for translation')

 	const engine = options.engine!
-	const sourceLanguage = normalizeLanguageCode(options.sourceLanguage!)
+	const sourceLanguage = options.sourceLanguage!
 	const targetLanguage = options.targetLanguage!

 	let transcript: string
@@ -119,10 +122,6 @@ export async function translateSpeech(input: AudioSourceParam, options: SpeechTr

 			({ transcript, timeline: wordTimeline } = await WhisperSTT.recognize(sourceRawAudio, modelName, modelDir, 'translate', sourceLanguage, whisperOptions))

-			addWordTextOffsetsToTimeline(wordTimeline, transcript);
-
-			({ segmentTimeline } = await wordTimelineToSegmentSentenceTimeline(wordTimeline, transcript, targetLanguage, 'single', 'preserve'))
-
 			break
 		}

@@ -155,9 +154,7 @@ export async function translateSpeech(input: AudioSourceParam, options: SpeechTr
 				modelName,
 				modelPath,
 				whisperCppOptions,
-			));
-
-			({ segmentTimeline } = await wordTimelineToSegmentSentenceTimeline(wordTimeline, transcript, targetLanguage, 'single', 'preserve'))
+			))

 			break
 		}
@@ -194,13 +191,21 @@ export async function translateSpeech(input: AudioSourceParam, options: SpeechTr

 	// If the audio was cropped before recognition, map the timestamps back to the original audio
 	if (sourceUncropTimeline && sourceUncropTimeline.length > 0) {
-		API.convertCroppedToUncroppedTimeline(segmentTimeline, sourceUncropTimeline)
-
 		if (wordTimeline) {
 			API.convertCroppedToUncroppedTimeline(wordTimeline, sourceUncropTimeline)
+		} else if (segmentTimeline) {
+			API.convertCroppedToUncroppedTimeline(segmentTimeline, sourceUncropTimeline)
 		}
 	}

+	if (wordTimeline) {
+		addWordTextOffsetsToTimeline(wordTimeline, transcript)
+	}
+
+	if (!segmentTimeline) {
+		({ segmentTimeline } = await wordTimelineToSegmentSentenceTimeline(wordTimeline!, transcript, targetLanguage, 'single', 'preserve'))
+	}
+
 	logger.log('')
 	logger.logDuration(`Total speech translation time`, startTimestamp, chalk.magentaBright)


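A note on the whisper.cpp word-offsets fix: in the hunks above, the whisper branch previously called addWordTextOffsetsToTimeline on its own, while the whisper.cpp branch only built a segment timeline, so its word timeline never received text offsets. After the refactor, the offsets and the segment/sentence timeline are produced once, after the engine switch, whenever a word timeline exists. The snippet below is only a standalone sketch of what word text offsets mean (locating each recognized word's character range within the transcript); the field and function names are made up and it is not the actual addWordTextOffsetsToTimeline implementation.

    // Standalone illustration only; not the implementation of
    // addWordTextOffsetsToTimeline in '../utilities/Timeline.js'.
    // Field names (text, startOffset, endOffset) are made up for this sketch.
    interface WordEntrySketch {
        text: string
        startOffset?: number
        endOffset?: number
    }

    function addWordOffsetsSketch(words: WordEntrySketch[], transcript: string): void {
        let searchFrom = 0

        for (const word of words) {
            const index = transcript.indexOf(word.text, searchFrom)

            if (index === -1) {
                // Word not found verbatim (e.g. punctuation differences); leave offsets unset
                continue
            }

            word.startOffset = index
            word.endOffset = index + word.text.length

            // Continue searching after this word so repeated words map to later occurrences
            searchFrom = word.endOffset
        }
    }

    // Example: for the transcript 'hello world hello', the second 'hello' entry
    // would get offsets 12 to 17 rather than matching the first occurrence again.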