Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tokenizer improvements via Singleton class and estimation #3072

Merged
merged 4 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/dev-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ concurrency:

on:
push:
branches: ['agent-ui-animations'] # put your current branch to create a build. Core team only.
branches: ['3069-tokenizer-collector-improvements'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'
Expand Down
2 changes: 1 addition & 1 deletion collector/processLink/convert/generic.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ async function scrapeGenericUrl(link, textOnly = false) {
published: new Date().toLocaleString(),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processRawText/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ async function processRawText(textContent, metadata) {
published: METADATA_KEYS.possible.published(metadata),
wordCount: textContent.split(" ").length,
pageContent: textContent,
token_count_estimate: tokenizeString(textContent).length,
token_count_estimate: tokenizeString(textContent),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asAudio.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asDocx.js
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asEPub.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ async function asEPub({ fullFilePath = "", filename = "" }) {
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asMbox.js
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

item++;
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asOfficeMime.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asPDF/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ async function asPdf({ fullFilePath = "", filename = "" }) {
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asTxt.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
published: createdDate(fullFilePath),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asXlsx.js
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ async function asXlsx({ fullFilePath = "", filename = "" }) {
published: createdDate(fullFilePath),
wordCount: content.split(/\s+/).length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

const document = writeToServerDocuments(
Expand Down
2 changes: 1 addition & 1 deletion collector/utils/extensions/Confluence/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ async function loadConfluence(
published: new Date().toLocaleString(),
wordCount: doc.pageContent.split(" ").length,
pageContent: doc.pageContent,
token_count_estimate: tokenizeString(doc.pageContent).length,
token_count_estimate: tokenizeString(doc.pageContent),
};

console.log(
Expand Down
2 changes: 1 addition & 1 deletion collector/utils/extensions/RepoLoader/GithubRepo/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ async function loadGithubRepo(args, response) {
published: new Date().toLocaleString(),
wordCount: doc.pageContent.split(" ").length,
pageContent: doc.pageContent,
token_count_estimate: tokenizeString(doc.pageContent).length,
token_count_estimate: tokenizeString(doc.pageContent),
};
console.log(
`[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`
Expand Down
2 changes: 1 addition & 1 deletion collector/utils/extensions/RepoLoader/GitlabRepo/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ async function loadGitlabRepo(args, response) {
}

data.wordCount = pageContent.split(" ").length;
data.token_count_estimate = tokenizeString(pageContent).length;
data.token_count_estimate = tokenizeString(pageContent);
data.pageContent = pageContent;

console.log(
Expand Down
2 changes: 1 addition & 1 deletion collector/utils/extensions/WebsiteDepth/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ async function bulkScrapePages(links, outFolderPath) {
published: new Date().toLocaleString(),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

writeToServerDocuments(data, data.title, outFolderPath);
Expand Down
2 changes: 1 addition & 1 deletion collector/utils/extensions/YoutubeTranscript/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ async function loadYouTubeTranscript({ url }) {
published: new Date().toLocaleString(),
wordCount: content.split(" ").length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
token_count_estimate: tokenizeString(content),
};

console.log(`[YouTube Loader]: Saving ${metadata.title} to ${outFolder}`);
Expand Down
67 changes: 59 additions & 8 deletions collector/utils/tokenizer/index.js
Original file line number Diff line number Diff line change
@@ -1,15 +1,66 @@
const { getEncoding } = require("js-tiktoken");

function tokenizeString(input = "") {
try {
const encoder = getEncoding("cl100k_base");
return encoder.encode(input);
} catch (e) {
console.error("Could not tokenize string!");
return [];
/**
 * @class TikTokenTokenizer
 * Process-wide singleton wrapper around js-tiktoken's "cl100k_base" encoder.
 * Building the encoder is expensive, so the instance (and its encoder) is
 * created once and reused for every subsequent `new TikTokenTokenizer()`.
 * Exposes `tokenizeString`, which returns a token COUNT (not the token array).
 */
class TikTokenTokenizer {
  // Inputs estimated above this size (in KB) are never encoded — see #isTooLong.
  static MAX_KB_ESTIMATE = 10;
  // Fallback heuristic: roughly 1 token per DIVISOR characters of input.
  static DIVISOR = 8;

  constructor() {
    // Singleton guard: returning the cached instance from the constructor
    // makes every `new TikTokenTokenizer()` yield the same object.
    if (TikTokenTokenizer.instance) {
      this.log(
        "Singleton instance already exists. Returning existing instance."
      );
      return TikTokenTokenizer.instance;
    }

    this.encoder = getEncoding("cl100k_base");
    TikTokenTokenizer.instance = this;
    this.log("Initialized new TikTokenTokenizer instance.");
  }

  // Namespaced console logger (magenta "[TikTokenTokenizer]" prefix).
  log(text, ...args) {
    console.log(`\x1b[35m[TikTokenTokenizer]\x1b[0m ${text}`, ...args);
  }

  /**
   * Check if the input is too long to encode.
   * This is a rough estimate and a sanity check to prevent
   * CPU issues from encoding very large strings.
   * Assumes 1 character = 2 bytes (UTF-16 code units in JS strings).
   * @param {string} input
   * @returns {boolean} true when the estimated size reaches MAX_KB_ESTIMATE
   */
  #isTooLong(input) {
    const bytesEstimate = input.length * 2;
    const kbEstimate = Math.floor(bytesEstimate / 1024);
    return kbEstimate >= TikTokenTokenizer.MAX_KB_ESTIMATE;
  }

  /**
   * Encode a string into tokens for rough token count estimation.
   * Oversized inputs (and any encoder failure) fall back to the
   * characters-per-token heuristic instead of a real encode.
   * @param {string} input
   * @returns {number} token count (exact for small inputs, estimated otherwise)
   */
  tokenizeString(input = "") {
    try {
      if (this.#isTooLong(input)) {
        this.log("Input will take too long to encode - estimating");
        return Math.ceil(input.length / TikTokenTokenizer.DIVISOR);
      }

      return this.encoder.encode(input).length;
    } catch (e) {
      // `input?.length` guards an explicit null argument; `|| 0` maps the
      // resulting NaN (or 0-length input) to 0.
      this.log("Could not tokenize string! Estimating...", e.message, e.stack);
      return Math.ceil(input?.length / TikTokenTokenizer.DIVISOR) || 0;
    }
  }
}

const tokenizer = new TikTokenTokenizer();
module.exports = {
tokenizeString,
/**
* Encode a string into tokens for rough token count estimation.
* @param {string} input
* @returns {number}
*/
tokenizeString: (input) => tokenizer.tokenizeString(input),
};
5 changes: 4 additions & 1 deletion server/utils/AiProviders/deepseek/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ const {
} = require("../../helpers/chat/LLMPerformanceMonitor");
const { v4: uuidv4 } = require("uuid");
const { MODEL_MAP } = require("../modelMap");
const { writeResponseChunk, clientAbortedHandler } = require("../../helpers/chat/responses");
const {
writeResponseChunk,
clientAbortedHandler,
} = require("../../helpers/chat/responses");

class DeepSeekLLM {
constructor(embedder = null, modelPreference = null) {
Expand Down
50 changes: 46 additions & 4 deletions server/utils/helpers/tiktoken.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,36 @@
const { getEncodingNameForModel, getEncoding } = require("js-tiktoken");

/**
* @class TokenManager
*
* @notice
* We cannot do estimation of tokens here like we do in the collector
* because we need to know the model to do it.
 * Another issue is that we also perform reverse tokenization here (tokens back to text) for the chat history during cannonballing.
* So here we are stuck doing the actual tokenization and encoding until we figure out what to do with prompt overflows.
*/
class TokenManager {
static instance = null;
static currentModel = null;

  /**
   * Model-aware singleton: when the requested model matches the model the
   * cached instance was built for, that instance is reused; otherwise a new
   * encoder is constructed and replaces the cached instance/model pair.
   * @param {string} model - model name used to resolve the tiktoken encoding
   */
  constructor(model = "gpt-3.5-turbo") {
    if (TokenManager.instance && TokenManager.currentModel === model) {
      this.log("Returning existing instance for model:", model);
      return TokenManager.instance;
    }

    this.model = model;
    this.encoderName = this.#getEncodingFromModel(model);
    this.encoder = getEncoding(this.encoderName);

    TokenManager.instance = this;
    TokenManager.currentModel = model;
    this.log("Initialized new TokenManager instance for model:", model);
    return this;
  }

  // Namespaced console logger (magenta "[TokenManager]" prefix).
  log(text, ...args) {
    console.log(`\x1b[35m[TokenManager]\x1b[0m ${text}`, ...args);
  }

#getEncodingFromModel(model) {
Expand All @@ -15,9 +41,11 @@ class TokenManager {
}
}

// Pass in an empty array of disallowedSpecials to handle all tokens as text and to be tokenized.
// https://github.com/openai/tiktoken/blob/9e79899bc248d5313c7dd73562b5e211d728723d/tiktoken/core.py#L91C20-L91C38
// Returns number[]
/**
* Pass in an empty array of disallowedSpecials to handle all tokens as text and to be tokenized.
* @param {string} input
* @returns {number[]}
*/
tokensFromString(input = "") {
try {
const tokens = this.encoder.encode(String(input), undefined, []);
Expand All @@ -28,17 +56,31 @@ class TokenManager {
}
}

/**
* Converts an array of tokens back to a string.
* @param {number[]} tokens
* @returns {string}
*/
bytesFromTokens(tokens = []) {
const bytes = this.encoder.decode(tokens);
return bytes;
}

// Returns number
/**
* Counts the number of tokens in a string.
* @param {string} input
* @returns {number}
*/
countFromString(input = "") {
const tokens = this.tokensFromString(input);
return tokens.length;
}

/**
* Estimates the number of tokens in a string or array of strings.
* @param {string | string[]} input
* @returns {number}
*/
statsFrom(input) {
if (typeof input === "string") return this.countFromString(input);

Expand Down