From d1ca16f7f8810eb818a55ab2682c4da3385eebe4 Mon Sep 17 00:00:00 2001
From: Timothy Carambat
Date: Thu, 30 Jan 2025 17:55:03 -0800
Subject: [PATCH] Add tokenizer improvements via Singleton class and estimation (#3072)

* Add tokenizer improvements via Singleton class

linting

* dev build

* Estimation fallback when string exceeds a fixed byte size

* Add notice to tiktoken on backend
---
 .github/workflows/dev-build.yaml | 2 +-
 collector/processLink/convert/generic.js | 2 +-
 collector/processRawText/index.js | 2 +-
 .../processSingleFile/convert/asAudio.js | 2 +-
 collector/processSingleFile/convert/asDocx.js | 2 +-
 collector/processSingleFile/convert/asEPub.js | 2 +-
 collector/processSingleFile/convert/asMbox.js | 2 +-
 .../processSingleFile/convert/asOfficeMime.js | 2 +-
 .../processSingleFile/convert/asPDF/index.js | 2 +-
 collector/processSingleFile/convert/asTxt.js | 2 +-
 collector/processSingleFile/convert/asXlsx.js | 2 +-
 .../utils/extensions/Confluence/index.js | 2 +-
 .../extensions/RepoLoader/GithubRepo/index.js | 2 +-
 .../extensions/RepoLoader/GitlabRepo/index.js | 2 +-
 .../utils/extensions/WebsiteDepth/index.js | 2 +-
 .../extensions/YoutubeTranscript/index.js | 2 +-
 collector/utils/tokenizer/index.js | 67 ++++++++++++++++---
 server/utils/AiProviders/deepseek/index.js | 5 +-
 server/utils/helpers/tiktoken.js | 50 ++++++++++++--
 19 files changed, 125 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/dev-build.yaml b/.github/workflows/dev-build.yaml
index bf9a1e67fc..bcd509b5c8 100644
--- a/.github/workflows/dev-build.yaml
+++ b/.github/workflows/dev-build.yaml
@@ -6,7 +6,7 @@ concurrency:
 
 on:
   push:
-    branches: ['agent-ui-animations'] # put your current branch to create a build. Core team only.
+    branches: ['3069-tokenizer-collector-improvements'] # put your current branch to create a build. Core team only.
     paths-ignore:
       - '**.md'
       - 'cloud-deployments/*'
diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js
index a5eb20ca94..4afb9b9548 100644
--- a/collector/processLink/convert/generic.js
+++ b/collector/processLink/convert/generic.js
@@ -41,7 +41,7 @@ async function scrapeGenericUrl(link, textOnly = false) {
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   const document = writeToServerDocuments(
diff --git a/collector/processRawText/index.js b/collector/processRawText/index.js
index d435c9e7e0..a29eb63c37 100644
--- a/collector/processRawText/index.js
+++ b/collector/processRawText/index.js
@@ -55,7 +55,7 @@ async function processRawText(textContent, metadata) {
     published: METADATA_KEYS.possible.published(metadata),
     wordCount: textContent.split(" ").length,
     pageContent: textContent,
-    token_count_estimate: tokenizeString(textContent).length,
+    token_count_estimate: tokenizeString(textContent),
   };
 
   const document = writeToServerDocuments(
diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js
index 170426e406..5f033af74a 100644
--- a/collector/processSingleFile/convert/asAudio.js
+++ b/collector/processSingleFile/convert/asAudio.js
@@ -56,7 +56,7 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   const document = writeToServerDocuments(
diff --git a/collector/processSingleFile/convert/asDocx.js b/collector/processSingleFile/convert/asDocx.js
index b0fbd8843e..d33a46b943 100644
--- a/collector/processSingleFile/convert/asDocx.js
+++ b/collector/processSingleFile/convert/asDocx.js
@@ -42,7 +42,7 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   const document = writeToServerDocuments(
diff --git a/collector/processSingleFile/convert/asEPub.js b/collector/processSingleFile/convert/asEPub.js
index 827e3c3af4..51bb20c809 100644
--- a/collector/processSingleFile/convert/asEPub.js
+++ b/collector/processSingleFile/convert/asEPub.js
@@ -40,7 +40,7 @@ async function asEPub({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   const document = writeToServerDocuments(
diff --git a/collector/processSingleFile/convert/asMbox.js b/collector/processSingleFile/convert/asMbox.js
index 4adde23ec9..48de60fa37 100644
--- a/collector/processSingleFile/convert/asMbox.js
+++ b/collector/processSingleFile/convert/asMbox.js
@@ -53,7 +53,7 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
       published: createdDate(fullFilePath),
       wordCount: content.split(" ").length,
       pageContent: content,
-      token_count_estimate: tokenizeString(content).length,
+      token_count_estimate: tokenizeString(content),
     };
 
     item++;
diff --git a/collector/processSingleFile/convert/asOfficeMime.js b/collector/processSingleFile/convert/asOfficeMime.js
index b6c3c0601f..09e320d168 100644
--- a/collector/processSingleFile/convert/asOfficeMime.js
+++ b/collector/processSingleFile/convert/asOfficeMime.js
@@ -38,7 +38,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   const document = writeToServerDocuments(
diff --git a/collector/processSingleFile/convert/asPDF/index.js b/collector/processSingleFile/convert/asPDF/index.js
index bf14516419..e3e42d3bd7 100644
--- a/collector/processSingleFile/convert/asPDF/index.js
+++ b/collector/processSingleFile/convert/asPDF/index.js
@@ -49,7 +49,7 @@ async function asPdf({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   const document = writeToServerDocuments(
diff --git a/collector/processSingleFile/convert/asTxt.js b/collector/processSingleFile/convert/asTxt.js
index 53987f247d..bc95969e14 100644
--- a/collector/processSingleFile/convert/asTxt.js
+++ b/collector/processSingleFile/convert/asTxt.js
@@ -38,7 +38,7 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   const document = writeToServerDocuments(
diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js
index f21c6f1d9b..ca9b8ebac9 100644
--- a/collector/processSingleFile/convert/asXlsx.js
+++ b/collector/processSingleFile/convert/asXlsx.js
@@ -67,7 +67,7 @@ async function asXlsx({ fullFilePath = "", filename = "" }) {
       published: createdDate(fullFilePath),
      wordCount: content.split(/\s+/).length,
       pageContent: content,
-      token_count_estimate: tokenizeString(content).length,
+      token_count_estimate: tokenizeString(content),
     };
 
     const document = writeToServerDocuments(
diff --git a/collector/utils/extensions/Confluence/index.js b/collector/utils/extensions/Confluence/index.js
index c8ab9b03c3..e0699222a9 100644
--- a/collector/utils/extensions/Confluence/index.js
+++ b/collector/utils/extensions/Confluence/index.js
@@ -104,7 +104,7 @@ async function loadConfluence(
       published: new Date().toLocaleString(),
       wordCount: doc.pageContent.split(" ").length,
       pageContent: doc.pageContent,
-      token_count_estimate: tokenizeString(doc.pageContent).length,
+      token_count_estimate: tokenizeString(doc.pageContent),
     };
 
     console.log(
diff --git a/collector/utils/extensions/RepoLoader/GithubRepo/index.js b/collector/utils/extensions/RepoLoader/GithubRepo/index.js
index 41147278cd..b2efe22114 100644
--- a/collector/utils/extensions/RepoLoader/GithubRepo/index.js
+++ b/collector/utils/extensions/RepoLoader/GithubRepo/index.js
@@ -66,7 +66,7 @@ async function loadGithubRepo(args, response) {
       published: new Date().toLocaleString(),
       wordCount: doc.pageContent.split(" ").length,
       pageContent: doc.pageContent,
-      token_count_estimate: tokenizeString(doc.pageContent).length,
+      token_count_estimate: tokenizeString(doc.pageContent),
     };
     console.log(
       `[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`
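The one-line change repeated in these hunks (and in the remaining collector loaders below) is an API change rather than cleanup: tokenizeString previously returned the encoded token array, so every call site took .length; it now returns the count itself. A minimal sketch of the caller pattern, under the assumption of a converter-style module where the content variable and the object shape are only illustrative:

const { tokenizeString } = require("../../utils/tokenizer"); // relative path as used by the converters; adjust per file

// Stand-in for whatever text a converter extracted.
const content = "Some extracted document text...";

const data = {
  pageContent: content,
  wordCount: content.split(" ").length,
  // Before this patch: tokenizeString(content).length (array -> count).
  // After: tokenizeString() already returns a number, so .length is dropped.
  token_count_estimate: tokenizeString(content),
};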
diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
index f1c528f1d9..cd74fb3164 100644
--- a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
+++ b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
@@ -82,7 +82,7 @@ async function loadGitlabRepo(args, response) {
       }
 
       data.wordCount = pageContent.split(" ").length;
-      data.token_count_estimate = tokenizeString(pageContent).length;
+      data.token_count_estimate = tokenizeString(pageContent);
       data.pageContent = pageContent;
 
       console.log(
diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js
index e680c0233b..4801a45aeb 100644
--- a/collector/utils/extensions/WebsiteDepth/index.js
+++ b/collector/utils/extensions/WebsiteDepth/index.js
@@ -122,7 +122,7 @@ async function bulkScrapePages(links, outFolderPath) {
       published: new Date().toLocaleString(),
       wordCount: content.split(" ").length,
       pageContent: content,
-      token_count_estimate: tokenizeString(content).length,
+      token_count_estimate: tokenizeString(content),
     };
 
     writeToServerDocuments(data, data.title, outFolderPath);
diff --git a/collector/utils/extensions/YoutubeTranscript/index.js b/collector/utils/extensions/YoutubeTranscript/index.js
index c7cf7c1f83..0e1e13feb1 100644
--- a/collector/utils/extensions/YoutubeTranscript/index.js
+++ b/collector/utils/extensions/YoutubeTranscript/index.js
@@ -115,7 +115,7 @@ async function loadYouTubeTranscript({ url }) {
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };
 
   console.log(`[YouTube Loader]: Saving ${metadata.title} to ${outFolder}`);
diff --git a/collector/utils/tokenizer/index.js b/collector/utils/tokenizer/index.js
index 618a7cdc7a..2086be2574 100644
--- a/collector/utils/tokenizer/index.js
+++ b/collector/utils/tokenizer/index.js
@@ -1,15 +1,66 @@
 const { getEncoding } = require("js-tiktoken");
 
-function tokenizeString(input = "") {
-  try {
-    const encoder = getEncoding("cl100k_base");
-    return encoder.encode(input);
-  } catch (e) {
-    console.error("Could not tokenize string!");
-    return [];
+class TikTokenTokenizer {
+  static MAX_KB_ESTIMATE = 10;
+  static DIVISOR = 8;
+
+  constructor() {
+    if (TikTokenTokenizer.instance) {
+      this.log(
+        "Singleton instance already exists. Returning existing instance."
+      );
+      return TikTokenTokenizer.instance;
+    }
+
+    this.encoder = getEncoding("cl100k_base");
+    TikTokenTokenizer.instance = this;
+    this.log("Initialized new TikTokenTokenizer instance.");
+  }
+
+  log(text, ...args) {
+    console.log(`\x1b[35m[TikTokenTokenizer]\x1b[0m ${text}`, ...args);
+  }
+
+  /**
+   * Check if the input is too long to encode
+   * this is more of a rough estimate and a sanity check to prevent
+   * CPU issues from encoding too large of strings
+   * Assumes 1 character = 2 bytes in JS
+   * @param {string} input
+   * @returns {boolean}
+   */
+  #isTooLong(input) {
+    const bytesEstimate = input.length * 2;
+    const kbEstimate = Math.floor(bytesEstimate / 1024);
+    return kbEstimate >= TikTokenTokenizer.MAX_KB_ESTIMATE;
+  }
+
+  /**
+   * Encode a string into tokens for rough token count estimation.
+   * @param {string} input
+   * @returns {number}
+   */
+  tokenizeString(input = "") {
+    try {
+      if (this.#isTooLong(input)) {
+        this.log("Input will take too long to encode - estimating");
+        return Math.ceil(input.length / TikTokenTokenizer.DIVISOR);
+      }
+
+      return this.encoder.encode(input).length;
+    } catch (e) {
+      this.log("Could not tokenize string! Estimating...", e.message, e.stack);
+      return Math.ceil(input?.length / TikTokenTokenizer.DIVISOR) || 0;
+    }
   }
 }
+const tokenizer = new TikTokenTokenizer();
 
 module.exports = {
-  tokenizeString,
+  /**
+   * Encode a string into tokens for rough token count estimation.
+   * @param {string} input
+   * @returns {number}
+   */
+  tokenizeString: (input) => tokenizer.tokenizeString(input),
 };
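To make the fallback concrete, here is a rough worked example against the class above. The arithmetic follows directly from MAX_KB_ESTIMATE = 10, DIVISOR = 8, and the stated 2-bytes-per-character assumption; the require path and sample strings are illustrative only:

const { tokenizeString } = require("./collector/utils/tokenizer"); // illustrative path from repo root

const small = "hello world";    // a few dozen bytes by the heuristic -> real cl100k_base encode
const large = "a".repeat(6000); // 6000 * 2 / 1024 ~ 11 KB, at or above the 10 KB threshold -> estimated

console.log(tokenizeString(small)); // exact token count from js-tiktoken
console.log(tokenizeString(large)); // Math.ceil(6000 / 8) = 750, the encoder never runs

Because the module now exports calls bound to one shared instance, the cl100k_base encoder is built once per process rather than on every tokenizeString call as in the removed implementation.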
diff --git a/server/utils/AiProviders/deepseek/index.js b/server/utils/AiProviders/deepseek/index.js
index 694f85501c..b91332a84a 100644
--- a/server/utils/AiProviders/deepseek/index.js
+++ b/server/utils/AiProviders/deepseek/index.js
@@ -4,7 +4,10 @@ const {
 } = require("../../helpers/chat/LLMPerformanceMonitor");
 const { v4: uuidv4 } = require("uuid");
 const { MODEL_MAP } = require("../modelMap");
-const { writeResponseChunk, clientAbortedHandler } = require("../../helpers/chat/responses");
+const {
+  writeResponseChunk,
+  clientAbortedHandler,
+} = require("../../helpers/chat/responses");
 
 class DeepSeekLLM {
   constructor(embedder = null, modelPreference = null) {
diff --git a/server/utils/helpers/tiktoken.js b/server/utils/helpers/tiktoken.js
index a3fa3b6396..394f261874 100644
--- a/server/utils/helpers/tiktoken.js
+++ b/server/utils/helpers/tiktoken.js
@@ -1,10 +1,36 @@
 const { getEncodingNameForModel, getEncoding } = require("js-tiktoken");
 
+/**
+ * @class TokenManager
+ *
+ * @notice
+ * We cannot do estimation of tokens here like we do in the collector
+ * because we need to know the model to do it.
+ * Other issues are we also do reverse tokenization here for the chat history during cannonballing.
+ * So here we are stuck doing the actual tokenization and encoding until we figure out what to do with prompt overflows.
+ */
 class TokenManager {
+  static instance = null;
+  static currentModel = null;
+
   constructor(model = "gpt-3.5-turbo") {
+    if (TokenManager.instance && TokenManager.currentModel === model) {
+      this.log("Returning existing instance for model:", model);
+      return TokenManager.instance;
+    }
+
     this.model = model;
     this.encoderName = this.#getEncodingFromModel(model);
     this.encoder = getEncoding(this.encoderName);
+
+    TokenManager.instance = this;
+    TokenManager.currentModel = model;
+    this.log("Initialized new TokenManager instance for model:", model);
+    return this;
+  }
+
+  log(text, ...args) {
+    console.log(`\x1b[35m[TokenManager]\x1b[0m ${text}`, ...args);
   }
 
   #getEncodingFromModel(model) {
@@ -15,9 +41,11 @@ class TokenManager {
     }
   }
 
-  // Pass in an empty array of disallowedSpecials to handle all tokens as text and to be tokenized.
-  // https://github.com/openai/tiktoken/blob/9e79899bc248d5313c7dd73562b5e211d728723d/tiktoken/core.py#L91C20-L91C38
-  // Returns number[]
+  /**
+   * Pass in an empty array of disallowedSpecials to handle all tokens as text and to be tokenized.
+   * @param {string} input
+   * @returns {number[]}
+   */
   tokensFromString(input = "") {
     try {
       const tokens = this.encoder.encode(String(input), undefined, []);
@@ -28,17 +56,31 @@ class TokenManager {
     }
   }
 
+  /**
+   * Converts an array of tokens back to a string.
+   * @param {number[]} tokens
+   * @returns {string}
+   */
   bytesFromTokens(tokens = []) {
     const bytes = this.encoder.decode(tokens);
     return bytes;
   }
 
-  // Returns number
+  /**
+   * Counts the number of tokens in a string.
+   * @param {string} input
+   * @returns {number}
+   */
   countFromString(input = "") {
     const tokens = this.tokensFromString(input);
     return tokens.length;
   }
 
+  /**
+   * Estimates the number of tokens in a string or array of strings.
+   * @param {string | string[]} input
+   * @returns {number}
+   */
   statsFrom(input) {
     if (typeof input === "string") return this.countFromString(input);
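The constructor shown above effectively turns TokenManager into a per-model singleton: asking for the same model again returns the cached instance, while a different model rebuilds the encoder and replaces the cached instance. A small sketch of the expected behavior, assuming the module continues to export the TokenManager class; the model names are only examples:

const { TokenManager } = require("./server/utils/helpers/tiktoken"); // export shape assumed, not shown in this hunk

const a = new TokenManager("gpt-4o");
const b = new TokenManager("gpt-4o"); // logs "Returning existing instance for model:" and reuses `a`
console.log(a === b); // true

const c = new TokenManager("gpt-3.5-turbo"); // different model -> a new encoder is initialized
console.log(a === c); // false; only the most recently requested model stays cached

console.log(b.countFromString("hello world")); // exact count via tokensFromString(), no estimation on the server side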