-
-
Notifications
You must be signed in to change notification settings - Fork 3.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add tokenizer improvements via Singleton class and estimation (#3072)
* Add tokenizer improvements via Singleton class linting * dev build * Estimation fallback when string exceeds a fixed byte size * Add notice to tiktoken on backend
- Loading branch information
1 parent
e1af72d
commit d1ca16f
Showing
19 changed files
with
125 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,66 @@ | ||
const { getEncoding } = require("js-tiktoken"); | ||
|
||
function tokenizeString(input = "") { | ||
try { | ||
const encoder = getEncoding("cl100k_base"); | ||
return encoder.encode(input); | ||
} catch (e) { | ||
console.error("Could not tokenize string!"); | ||
return []; | ||
// Singleton wrapper around the "cl100k_base" encoder obtained from getEncoding(). | ||
// tokenizeString() returns a token COUNT (a number), using a length-based estimate | ||
// instead of real encoding when the input is large or when encoding throws. | ||
class TikTokenTokenizer { | ||
// Inputs whose estimated size is at or above this many KB are not encoded (see #isTooLong). | ||
static MAX_KB_ESTIMATE = 10; | ||
// Fallback estimate is Math.ceil(input.length / DIVISOR) tokens. | ||
static DIVISOR = 8; | ||
|
||
constructor() { | ||
// Reuse the previously constructed instance so getEncoding() is only called once. | ||
if (TikTokenTokenizer.instance) { | ||
this.log( | ||
"Singleton instance already exists. Returning existing instance." | ||
); | ||
// Returning an object from a constructor makes `new` evaluate to that object. | ||
return TikTokenTokenizer.instance; | ||
} | ||
|
||
this.encoder = getEncoding("cl100k_base"); | ||
TikTokenTokenizer.instance = this; | ||
this.log("Initialized new TikTokenTokenizer instance."); | ||
} | ||
|
||
// Writes `text` to console.log prefixed with an ANSI-colored (\x1b[35m) [TikTokenTokenizer] tag; | ||
// any extra args are forwarded to console.log unchanged. | ||
log(text, ...args) { | ||
console.log(`\x1b[35m[TikTokenTokenizer]\x1b[0m ${text}`, ...args); | ||
} | ||
|
||
/** | ||
* Check if the input is too long to encode | ||
* this is more of a rough estimate and a sanity check to prevent | ||
* CPU issues from encoding too large of strings | ||
* Assumes 1 character = 2 bytes in JS | ||
* With MAX_KB_ESTIMATE = 10 this becomes true once input.length * 2 reaches 10240, | ||
* i.e. for inputs of 5120 or more UTF-16 code units. | ||
* @param {string} input | ||
* @returns {boolean} | ||
*/ | ||
#isTooLong(input) { | ||
const bytesEstimate = input.length * 2; | ||
const kbEstimate = Math.floor(bytesEstimate / 1024); | ||
return kbEstimate >= TikTokenTokenizer.MAX_KB_ESTIMATE; | ||
} | ||
|
||
/** | ||
* Encode a string into tokens for rough token count estimation. | ||
* Returns the length of the encoder output, or Math.ceil(input.length / DIVISOR) | ||
* when #isTooLong(input) is true or when anything inside the try block throws. | ||
* @param {string} input | ||
* @returns {number} | ||
*/ | ||
tokenizeString(input = "") { | ||
try { | ||
if (this.#isTooLong(input)) { | ||
this.log("Input will take too long to encode - estimating"); | ||
return Math.ceil(input.length / TikTokenTokenizer.DIVISOR); | ||
} | ||
|
||
return this.encoder.encode(input).length; | ||
} catch (e) { | ||
this.log("Could not tokenize string! Estimating...", e.message, e.stack); | ||
// `input?.length` tolerates a null input (the default "" only replaces undefined); | ||
// a NaN result from a missing length is turned into 0 by `|| 0`. | ||
return Math.ceil(input?.length / TikTokenTokenizer.DIVISOR) || 0; | ||
} | ||
} | ||
} | ||
|
||
const tokenizer = new TikTokenTokenizer(); | ||
module.exports = { | ||
tokenizeString, | ||
/** | ||
* Encode a string into tokens for rough token count estimation. | ||
* @param {string} input | ||
* @returns {number} | ||
*/ | ||
tokenizeString: (input) => tokenizer.tokenizeString(input), | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters