Estimation fallback when string exceeds a fixed byte size
timothycarambat committed Jan 31, 2025
1 parent 17a8cf8 commit 2f35df6
Showing 16 changed files with 42 additions and 20 deletions.
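In short: the collector's `tokenizeString` helper now returns a token count (`number`) rather than the raw token array (`number[]`), so the fifteen call sites below each drop a trailing `.length`. The tokenizer itself gains a guard: when the input's estimated byte size crosses a fixed threshold, it skips real BPE encoding and derives the count from the string length instead. A minimal standalone sketch of that guard, assuming the same constants as the diff (the `estimateTokenCount` wrapper and `encode` parameter are illustrative, not part of the module):

// Illustrative sketch of the fallback logic in this commit.
// JS strings are UTF-16, so the estimate assumes ~2 bytes per character.
const MAX_KB_ESTIMATE = 10; // at or above ~10 KB, skip real encoding
const DIVISOR = 8;          // rough characters-per-token ratio for the estimate

function isTooLong(input) {
  const bytesEstimate = input.length * 2;
  return Math.floor(bytesEstimate / 1024) >= MAX_KB_ESTIMATE;
}

// `encode` stands in for js-tiktoken's encoder.encode().
function estimateTokenCount(input = "", encode) {
  if (isTooLong(input)) return Math.ceil(input.length / DIVISOR); // estimated
  return encode(input).length; // exact path for small inputs
}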
2 changes: 1 addition & 1 deletion collector/processLink/convert/generic.js
@@ -41,7 +41,7 @@ async function scrapeGenericUrl(link, textOnly = false) {
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processRawText/index.js
@@ -55,7 +55,7 @@ async function processRawText(textContent, metadata) {
     published: METADATA_KEYS.possible.published(metadata),
     wordCount: textContent.split(" ").length,
     pageContent: textContent,
-    token_count_estimate: tokenizeString(textContent).length,
+    token_count_estimate: tokenizeString(textContent),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asAudio.js
@@ -56,7 +56,7 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asDocx.js
@@ -42,7 +42,7 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asEPub.js
@@ -40,7 +40,7 @@ async function asEPub({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asMbox.js
@@ -53,7 +53,7 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   item++;
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asOfficeMime.js
@@ -38,7 +38,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asPDF/index.js
@@ -49,7 +49,7 @@ async function asPdf({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asTxt.js
@@ -38,7 +38,7 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/processSingleFile/convert/asXlsx.js
@@ -67,7 +67,7 @@ async function asXlsx({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(/\s+/).length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
2 changes: 1 addition & 1 deletion collector/utils/extensions/Confluence/index.js
@@ -104,7 +104,7 @@ async function loadConfluence(
     published: new Date().toLocaleString(),
     wordCount: doc.pageContent.split(" ").length,
     pageContent: doc.pageContent,
-    token_count_estimate: tokenizeString(doc.pageContent).length,
+    token_count_estimate: tokenizeString(doc.pageContent),
   };

   console.log(
2 changes: 1 addition & 1 deletion collector/utils/extensions/RepoLoader/GithubRepo/index.js
@@ -66,7 +66,7 @@ async function loadGithubRepo(args, response) {
     published: new Date().toLocaleString(),
     wordCount: doc.pageContent.split(" ").length,
     pageContent: doc.pageContent,
-    token_count_estimate: tokenizeString(doc.pageContent).length,
+    token_count_estimate: tokenizeString(doc.pageContent),
   };
   console.log(
     `[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`
2 changes: 1 addition & 1 deletion collector/utils/extensions/RepoLoader/GitlabRepo/index.js
@@ -82,7 +82,7 @@ async function loadGitlabRepo(args, response) {
   }

   data.wordCount = pageContent.split(" ").length;
-  data.token_count_estimate = tokenizeString(pageContent).length;
+  data.token_count_estimate = tokenizeString(pageContent);
   data.pageContent = pageContent;

   console.log(
2 changes: 1 addition & 1 deletion collector/utils/extensions/WebsiteDepth/index.js
@@ -122,7 +122,7 @@ async function bulkScrapePages(links, outFolderPath) {
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   writeToServerDocuments(data, data.title, outFolderPath);
2 changes: 1 addition & 1 deletion collector/utils/extensions/YoutubeTranscript/index.js
@@ -115,7 +115,7 @@ async function loadYouTubeTranscript({ url }) {
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   console.log(`[YouTube Loader]: Saving ${metadata.title} to ${outFolder}`);
32 changes: 27 additions & 5 deletions collector/utils/tokenizer/index.js
@@ -1,6 +1,9 @@
 const { getEncoding } = require("js-tiktoken");

 class TikTokenTokenizer {
+  static MAX_KB_ESTIMATE = 10;
+  static DIVISOR = 8;
+
   constructor() {
     if (TikTokenTokenizer.instance) {
       this.log(
@@ -18,17 +21,36 @@ class TikTokenTokenizer {
     console.log(`\x1b[35m[TikTokenTokenizer]\x1b[0m ${text}`, ...args);
   }

+  /**
+   * Check if the input is too long to encode.
+   * This is more of a rough estimate and a sanity check to prevent
+   * CPU issues from encoding too large of strings.
+   * Assumes 1 character = 2 bytes in JS.
+   * @param {string} input
+   * @returns {boolean}
+   */
+  #isTooLong(input) {
+    const bytesEstimate = input.length * 2;
+    const kbEstimate = Math.floor(bytesEstimate / 1024);
+    return kbEstimate >= TikTokenTokenizer.MAX_KB_ESTIMATE;
+  }
+
   /**
    * Encode a string into tokens for rough token count estimation.
    * @param {string} input
-   * @returns {number[]}
+   * @returns {number}
    */
   tokenizeString(input = "") {
     try {
-      return this.encoder.encode(input);
+      if (this.#isTooLong(input)) {
+        this.log("Input will take too long to encode - estimating");
+        return Math.ceil(input.length / TikTokenTokenizer.DIVISOR);
+      }
+
+      return this.encoder.encode(input).length;
     } catch (e) {
-      this.log("Could not tokenize string!", e.message, e.stack);
-      return [];
+      this.log("Could not tokenize string! Estimating...", e.message, e.stack);
+      return Math.ceil(input?.length / TikTokenTokenizer.DIVISOR) || 0;
     }
   }
 }
@@ -38,7 +60,7 @@ module.exports = {
   /**
    * Encode a string into tokens for rough token count estimation.
    * @param {string} input
-   * @returns {number[]}
+   * @returns {number}
    */
   tokenizeString: (input) => tokenizer.tokenizeString(input),
 };
