Estimation fallback when string exceeds a fixed byte size
timothycarambat committed Jan 31, 2025
1 parent 17a8cf8 commit 2f35df6
Showing 16 changed files with 42 additions and 20 deletions.
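In short: the collector's `tokenizeString` helper now returns a token count (`number`) rather than the raw token array (`number[]`), so the fifteen call sites below each drop a trailing `.length`. The tokenizer itself gains a guard: when the input's estimated byte size crosses a fixed threshold, it skips real BPE encoding and derives the count from the string length instead. A minimal standalone sketch of that guard, assuming the same constants as the diff (the `estimateTokenCount` wrapper and `encode` parameter are illustrative, not part of the module):

// Illustrative sketch of the fallback logic in this commit.
// JS strings are UTF-16, so the estimate assumes ~2 bytes per character.
const MAX_KB_ESTIMATE = 10; // at or above ~10 KB, skip real encoding
const DIVISOR = 8;          // rough characters-per-token ratio for the estimate

function isTooLong(input) {
  const bytesEstimate = input.length * 2;
  return Math.floor(bytesEstimate / 1024) >= MAX_KB_ESTIMATE;
}

// `encode` stands in for js-tiktoken's encoder.encode().
function estimateTokenCount(input = "", encode) {
  if (isTooLong(input)) return Math.ceil(input.length / DIVISOR); // estimated
  return encode(input).length; // exact path for small inputs
}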
2 changes: 1 addition & 1 deletion collector/processLink/convert/generic.js
@@ -41,7 +41,7 @@ async function scrapeGenericUrl(link, textOnly = false) {
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processRawText/index.js
@@ -55,7 +55,7 @@ async function processRawText(textContent, metadata) {
     published: METADATA_KEYS.possible.published(metadata),
     wordCount: textContent.split(" ").length,
     pageContent: textContent,
-    token_count_estimate: tokenizeString(textContent).length,
+    token_count_estimate: tokenizeString(textContent),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asAudio.js
@@ -56,7 +56,7 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asDocx.js
@@ -42,7 +42,7 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asEPub.js
@@ -40,7 +40,7 @@ async function asEPub({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asMbox.js
@@ -53,7 +53,7 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   item++;
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asOfficeMime.js
@@ -38,7 +38,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asPDF/index.js
@@ -49,7 +49,7 @@ async function asPdf({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asTxt.js
@@ -38,7 +38,7 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asXlsx.js
@@ -67,7 +67,7 @@ async function asXlsx({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(/\s+/).length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/utils/extensions/Confluence/index.js
@@ -104,7 +104,7 @@ async function loadConfluence(
     published: new Date().toLocaleString(),
     wordCount: doc.pageContent.split(" ").length,
     pageContent: doc.pageContent,
-    token_count_estimate: tokenizeString(doc.pageContent).length,
+    token_count_estimate: tokenizeString(doc.pageContent),
   };

   console.log(
2 changes: 1 addition & 1 deletion collector/utils/extensions/RepoLoader/GithubRepo/index.js
@@ -66,7 +66,7 @@ async function loadGithubRepo(args, response) {
     published: new Date().toLocaleString(),
     wordCount: doc.pageContent.split(" ").length,
     pageContent: doc.pageContent,
-    token_count_estimate: tokenizeString(doc.pageContent).length,
+    token_count_estimate: tokenizeString(doc.pageContent),
   };
   console.log(
     `[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`
2 changes: 1 addition & 1 deletion collector/utils/extensions/RepoLoader/GitlabRepo/index.js
@@ -82,7 +82,7 @@ async function loadGitlabRepo(args, response) {
   }

   data.wordCount = pageContent.split(" ").length;
-  data.token_count_estimate = tokenizeString(pageContent).length;
+  data.token_count_estimate = tokenizeString(pageContent);
   data.pageContent = pageContent;

   console.log(
2 changes: 1 addition & 1 deletion collector/utils/extensions/WebsiteDepth/index.js
@@ -122,7 +122,7 @@ async function bulkScrapePages(links, outFolderPath) {
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   writeToServerDocuments(data, data.title, outFolderPath);
2 changes: 1 addition & 1 deletion collector/utils/extensions/YoutubeTranscript/index.js
@@ -115,7 +115,7 @@ async function loadYouTubeTranscript({ url }) {
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   console.log(`[YouTube Loader]: Saving ${metadata.title} to ${outFolder}`);
32 changes: 27 additions & 5 deletions collector/utils/tokenizer/index.js
@@ -1,6 +1,9 @@
 const { getEncoding } = require("js-tiktoken");

 class TikTokenTokenizer {
+  static MAX_KB_ESTIMATE = 10;
+  static DIVISOR = 8;
+
   constructor() {
     if (TikTokenTokenizer.instance) {
       this.log(
@@ -18,17 +21,36 @@ class TikTokenTokenizer {
     console.log(`\x1b[35m[TikTokenTokenizer]\x1b[0m ${text}`, ...args);
   }

+  /**
+   * Check if the input is too long to encode.
+   * This is more of a rough estimate and a sanity check to prevent
+   * CPU issues from encoding too large of strings.
+   * Assumes 1 character = 2 bytes in JS.
+   * @param {string} input
+   * @returns {boolean}
+   */
+  #isTooLong(input) {
+    const bytesEstimate = input.length * 2;
+    const kbEstimate = Math.floor(bytesEstimate / 1024);
+    return kbEstimate >= TikTokenTokenizer.MAX_KB_ESTIMATE;
+  }
+
   /**
    * Encode a string into tokens for rough token count estimation.
    * @param {string} input
-   * @returns {number[]}
+   * @returns {number}
    */
   tokenizeString(input = "") {
     try {
-      return this.encoder.encode(input);
+      if (this.#isTooLong(input)) {
+        this.log("Input will take too long to encode - estimating");
+        return Math.ceil(input.length / TikTokenTokenizer.DIVISOR);
+      }
+
+      return this.encoder.encode(input).length;
     } catch (e) {
-      this.log("Could not tokenize string!", e.message, e.stack);
-      return [];
+      this.log("Could not tokenize string! Estimating...", e.message, e.stack);
+      return Math.ceil(input?.length / TikTokenTokenizer.DIVISOR) || 0;
     }
   }
 }
@@ -38,7 +60,7 @@ module.exports = {
   /**
    * Encode a string into tokens for rough token count estimation.
    * @param {string} input
-   * @returns {number[]}
+   * @returns {number}
    */
   tokenizeString: (input) => tokenizer.tokenizeString(input),
 };
