Skip to content

Commit

Permalink
Merge pull request #209 from hyperaudio/208-client-whisper-improvements
Browse files Browse the repository at this point in the history
208 client whisper improvements
  • Loading branch information
maboa authored Dec 4, 2023
2 parents 3deb95b + b4882ab commit f1578d8
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 36,365 deletions.
20 changes: 9 additions & 11 deletions hyperaudio-client-whisper-template.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,32 +8,30 @@
<span style="display:block; padding:16px" class="label-text">or</span>
<input id="file-input" name="file" type="file" class="file-input w-full max-w-xs" />
<hr class="my-2 h-0 border border-t-0 border-solid border-neutral-700 opacity-50 dark:border-neutral-200" />

<!--<label for="file-input" class="form-label">Which video/audio file should be transcribed?</label>
<input class="form-control" type="file" id="file-input" accept=".mp3,.wav,.mp4,.mov,.avi,.flv,.wmv,.mpeg,.mpg,.webm,.opus">
<div class="form-text">We only support audio and video files.</div>-->
</div>
<div class="mb-3">
<label for="model-name-input" class="form-label label-text">Which model should be used?</label>
<div>
<!-- Model picker. Values ending in ".en" are the English-only variants, so every option labelled "English" must carry a ".en" value. -->
<select class="form-select select select-bordered w-full max-w-xs" aria-label="Default select example" id="model-name-input">
<option selected="" value="whisper-tiny.en">Whisper (Tiny) English</option>
<option value="whisper-tiny">Whisper (Tiny)</option>
<option value="whisper-base.en">Whisper (Base) English</option>
<option value="whisper-base">Whisper (Base)</option>
<option value="whisper-small.en">Whisper (Small) English</option>
<option value="whisper-small">Whisper (Small)</option>
<option selected="" value="Xenova/whisper-tiny.en">Whisper (Tiny) English</option>
<option value="Xenova/whisper-tiny">Whisper (Tiny)</option>
<option value="Xenova/whisper-base.en">Whisper (Base) English</option>
<option value="Xenova/whisper-base">Whisper (Base)</option>
<option value="Xenova/whisper-small.en">Whisper (Small) English</option>
<option value="Xenova/whisper-small">Whisper (Small)</option>
</select>
</div>
<div class="form-text" style="font-size: 90%;">
<p style="padding-top:16px">The models are listed in order of size. The larger the model, the more accurate it is – and slower to process.</p>
<p>The English models are slightly more accurate (for the English language only).</p>
<p>* Whisper running in the browser is currently in beta.</p>
</div>

</div>

<div class="modal-action">
<label id="form-submit-btn" for="transcribe-modal" class="btn btn-primary">TRANSCRIBE</label>
</div>
<!--<button id="form-submit-btn" class="btn btn-primary" disabled="">Submit</button>-->
</form>
</div>
</body>
Expand Down
2 changes: 0 additions & 2 deletions hyperaudio-deepgram-modal.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
<div id="deepgram-modal-template">
<form id="deepgram-form" name="deepgram-form">
<div class="flex flex-col gap-4 w-full">
<!--<label id="close-modal" for="transcribe-modal" class="btn btn-sm btn-circle absolute right-2 top-2">✕</label>
<h3 class="font-bold text-lg">Transcribe</h3>-->
<input id="token" type="text" placeholder="Deepgram token" class="input input-bordered w-full max-w-xs" />
<hr class="my-2 h-0 border border-t-0 border-solid border-neutral-700 opacity-50 dark:border-neutral-200" />
<input id="media" type="text" placeholder="Link to media" class="input input-bordered w-full max-w-xs" />
Expand Down
4 changes: 2 additions & 2 deletions index.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<!-- (C) The Hyperaudio Project. AGPL 3.0 @license: https://www.gnu.org/licenses/agpl-3.0.en.html -->
<!-- Hyperaudio Lite Editor - Version 0.3 -->
<!-- Hyperaudio Lite Editor - Version 0.4 -->

<!-- Hyperaudio Lite Editor's source code is provided under a dual license model.
Expand Down Expand Up @@ -219,7 +219,7 @@ <h3 class="text-lg font-bold">Topics</h3>
<h3 class="font-bold text-lg" style="margin-bottom:16px">Transcribe</h3>
<div role="tablist" class="tabs tabs-lifted">

<input type="radio" name="my_tabs_2" role="tab" class="tab" style="width:160px" aria-label="Whisper (Local)" checked />
<input type="radio" name="my_tabs_2" role="tab" class="tab" style="width:160px" aria-label="Whisper (Local) *" checked />
<div role="tabpanel" class="tab-content bg-base-100 border-base-300 rounded-box p-10">
<client-whisper-service></client-whisper-service>
</div>
Expand Down
148 changes: 42 additions & 106 deletions js/hyperaudio-lite-editor-whisper.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*! (C) The Hyperaudio Project. MIT @license: en.wikipedia.org/wiki/MIT_License. */
/*! Version 0.0.4 */
/*! Version 0.0.5 */


class WhisperService extends HTMLElement {

Expand Down Expand Up @@ -50,33 +51,6 @@ function loadWhisperClient(modal) {

const whisperWorkerPath = "./js/whisper.worker.js";

// Leave the values of the following three lookup tables exactly as they are:
// they are shared with the worker script (the original note names
// web.worker.js), so both sides must agree on every string.
// The tables are frozen so that nothing in this file can alter them by accident.

/** `type` values of the messages exchanged with the transcription worker. */
const MessageTypes = Object.freeze({
  DOWNLOADING: "DOWNLOADING",
  LOADING: "LOADING",
  RESULT: "RESULT",
  RESULT_PARTIAL: "RESULT_PARTIAL",
  INFERENCE_REQUEST: "INFERENCE_REQUEST",
  INFERENCE_DONE: "INFERENCE_DONE"
});

/** `status` values carried by a LOADING message. */
const LoadingStatus = Object.freeze({
  SUCCESS: "success",
  ERROR: "error",
  LOADING: "loading"
});

/** Fully qualified identifiers of the models that may be requested. */
const ModelNames = Object.freeze({
  WHISPER_TINY_EN: "openai/whisper-tiny.en",
  WHISPER_TINY: "openai/whisper-tiny",
  WHISPER_BASE: "openai/whisper-base",
  WHISPER_BASE_EN: "openai/whisper-base.en",
  WHISPER_SMALL: "openai/whisper-small",
  WHISPER_SMALL_EN: "openai/whisper-small.en"
});

let webWorker = createWorker();

formSubmitBtn.disabled = true;
Expand All @@ -85,77 +59,54 @@ function loadWhisperClient(modal) {
});

function createWorker() {
const worker = new Worker(whisperWorkerPath);
const worker = new Worker(whisperWorkerPath, { type: "module" });

let results = [];
worker.onmessage = (event2) => {
const { type } = event2.data;
if (type === MessageTypes.LOADING) {
handleLoadingMessage(event2.data);
}
if (type === MessageTypes.DOWNLOADING) {
loadingMessageContainer.innerHTML = '<div class="vertically-centre"><center>Downloading model...</center><br/><img src="'+transcribingSvg+'" width="50" alt="transcribing" style="margin: auto; display: block;"></div>';
}
if (type === MessageTypes.RESULT) {
handleResultMessage(event2.data);
results = event2.data.results;
}
if (type === MessageTypes.RESULT_PARTIAL) {

}
if (type === MessageTypes.INFERENCE_DONE) {
handleInferenceDone(results);
}
worker.onmessage = (event) => {
handleInferenceDone(event.data);
};

return worker;
}

/**
 * Replaces the loading banner according to the status of a LOADING message.
 *
 * @param {{status: string}} data - Message payload; `status` is compared
 *   against the LoadingStatus values. Unknown statuses leave the banner as is.
 */
function handleLoadingMessage(data) {
  // All banners share the same wrapper markup; only the text, icon and alt text vary.
  const showBanner = (messageHtml, iconSrc, altText) => {
    loadingMessageContainer.innerHTML = `<div class="vertically-centre"><center>${messageHtml}</center><br/><img src="${iconSrc}" width="50" alt="${altText}" style="margin: auto; display: block;"></div>`;
  };
  const currentStatus = data.status;

  if (currentStatus === LoadingStatus.SUCCESS) {
    // The progress span starts at 0 and is updated as partial results arrive.
    showBanner('Transcribing.... <span id="transcription-progress">0</span>%', transcribingSvg, "transcribing");
  }
  if (currentStatus === LoadingStatus.ERROR) {
    showBanner("Oops! Something went wrong. Please refresh the page and try again.", errorSvg, "error");
  }
  if (currentStatus === LoadingStatus.LOADING) {
    showBanner("Loading model into memory...", transcribingSvg, "transcribing");
  }
}

/**
 * Updates the on-screen transcription percentage from a RESULT message.
 *
 * The update is skipped when the media duration is not a positive finite
 * number (it is NaN until metadata is available and Infinity for streams),
 * or when the progress element is not in the document, so the banner never
 * shows "NaN"/"Infinity" and the handler never throws.
 *
 * @param {{completedUntilTimestamp: number}} data - Message payload; the
 *   timestamp is the position, in seconds, up to which transcription is done.
 */
function handleResultMessage(data) {
  const { completedUntilTimestamp } = data;
  const totalDuration = videoPlayer.duration;

  if (!Number.isFinite(totalDuration) || totalDuration <= 0) {
    return;
  }

  const progressElement = document.querySelector("#transcription-progress");
  if (progressElement === null) {
    return;
  }

  const rawPercent = (completedUntilTimestamp / totalDuration) * 100;
  if (Number.isNaN(rawPercent)) {
    return;
  }

  // Clamp so that rounding or an overshooting timestamp never shows <0% or >100%.
  const percent = Math.min(100, Math.max(0, Math.round(rawPercent)));
  progressElement.textContent = String(percent);
}

function handleInferenceDone(results) {

console.log(results);

videoPlayer.currentTime = 0;

let hypertranscript = "";
results.forEach((result) => {
let words = result.text.split(' ');
let interval = (result.end - result.start) / words.length;
let timecode = result.start * 1000;
let duration = Math.floor((interval*1000)-1);
words.forEach((word) => {
let start = Math.floor(timecode);
hypertranscript += `<span data-m='${start}' data-d='${duration}'>${word} </span>\n`;
timecode += interval*1000;
});

// new para every 5 sentences
if (result.index % 5 === 0 && result.index !== 0) {
hypertranscript += "\n </p>\n <p>\n";
}
let sentences = 0;
let lastWord = "";

console.log(hypertranscript);
results.output.chunks.forEach((word) => {

// ignore text with square brackets - usually contains things like [BLANK _AUDIO]
if (word.text.indexOf("[") < 0 && word.text.indexOf("]") < 0) {
let start = Math.floor(word.timestamp[0]*1000);
let duration = Math.floor((word.timestamp[1]*1000)-1) - start;
let wordCapitalised = false;

if (Array.from(word.text)[0].toUpperCase() === Array.from(word.text)[0]){
wordCapitalised = true;
}

if (wordCapitalised === true && lastWord.endsWith(".") ){
sentences += 1;
}

lastWord = word.text;

// new para every 5 sentences
if (sentences % 5 === 0 && sentences !== 0) {
hypertranscript += "\n </p>\n <p>\n";
sentences = 0;
}

hypertranscript += `<span data-m='${start}' data-d='${duration}'>${word.text} </span>\n`;
}
});

resultsContainer.innerHTML = "<article>\n <section>\n <p>\n" + hypertranscript + " </p>\n </section>\n</article>\n";

const initEvent = new CustomEvent('hyperaudioInit');
Expand All @@ -166,20 +117,21 @@ function loadWhisperClient(modal) {

async function handleFormSubmission() {

if (!isFileUploaded() || !isModelNameSelected()) {
return;
}

const model_name = `openai/${modelNameSelectionInput.value}`;
const model_name = modelNameSelectionInput.value;
const file = fileUploadBtn.files[0];
const audio = await readAudioFrom(file);

webWorker.postMessage({
type: MessageTypes.INFERENCE_REQUEST,
type: "INFERENCE_REQUEST",
audio,
model_name
});

console.log("web worker");
console.log(webWorker);
videoPlayer.src = URL.createObjectURL(file);

loadingMessageContainer.innerHTML = '<div class="vertically-centre"><center>Transcribing.... </center><br/><img src="'+transcribingSvg+'" width="50" alt="transcribing" style="margin: auto; display: block;"></div>';
}

async function readAudioFrom(file) {
Expand All @@ -190,20 +142,4 @@ function loadWhisperClient(modal) {
const audio = decoded.getChannelData(0);
return audio;
}

/**
 * Reports whether the file input currently holds at least one file.
 *
 * @returns {boolean} false when the input's file list is empty, true otherwise.
 */
function isFileUploaded() {
  const { files } = fileUploadBtn;
  return files.length !== 0;
}

/**
 * Reports whether the model drop-down holds a recognised model.
 *
 * The selected value is prefixed with "openai/" and looked up among the
 * ModelNames values; an empty selection is rejected up front.
 *
 * @returns {boolean} true only when the prefixed value is one of ModelNames.
 */
function isModelNameSelected() {
  const chosen = modelNameSelectionInput.value;
  if (chosen === "") {
    return false;
  }
  const knownModels = Object.values(ModelNames);
  return knownModels.includes(`openai/${chosen}`);
}
}
Loading

0 comments on commit f1578d8

Please sign in to comment.