From 5bc87e25cb92eb77d56be168ad8ead313cc01a43 Mon Sep 17 00:00:00 2001
From: Michal Moskal
Date: Tue, 25 Feb 2025 17:33:15 -0800
Subject: [PATCH] add note on grammar check in chop_tokens

---
 docs/fast_forward.md      | 2 ++
 toktrie/src/recognizer.rs | 3 ++-
 toktrie/src/toktree.rs    | 4 ++++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/docs/fast_forward.md b/docs/fast_forward.md
index e7224054..8cc5c7f7 100644
--- a/docs/fast_forward.md
+++ b/docs/fast_forward.md
@@ -129,3 +129,5 @@ examples where `max_token==4` is not enough, we have not bee able to do so.
 Note, that we can conservatively skip `grammar_allows()` check in the algorithm
 above, and thus just compute once and for all the set of tokens that are not
 allowed as the last token in forced bytes.
+This drops the proportion of forced tokens in maskbench from `12.7%` to `12.1%`.
+
diff --git a/toktrie/src/recognizer.rs b/toktrie/src/recognizer.rs
index e041945f..de9f95fd 100644
--- a/toktrie/src/recognizer.rs
+++ b/toktrie/src/recognizer.rs
@@ -59,7 +59,8 @@ impl<S: Copy, R: FunctionalRecognizer<S>> Recognizer for StackRecognizer<S, R>
 
     fn trie_finished(&mut self) {
         // println!("{:?}", &self.stack[0..=self.stack_ptr]);
-        assert!(self.stack_ptr == 0);
+        // assert!(self.stack_ptr == 0);
+        self.stack_ptr = 0;
     }
 
     fn collapse(&mut self) {
diff --git a/toktrie/src/toktree.rs b/toktrie/src/toktree.rs
index b6cfa6e4..69152305 100644
--- a/toktrie/src/toktree.rs
+++ b/toktrie/src/toktree.rs
@@ -794,6 +794,10 @@ impl TokTrie {
         let suff_bytes = self.decode_raw(&tokens[tokens.len().saturating_sub(max_token_lookback)..]);
         let suff_bytes = &suff_bytes[suff_bytes.len().saturating_sub(self.max_token_len())..];
+        // let suff_bytes = self.decode_raw(tokens);
+        // let suff_bytes = &suff_bytes[suff_bytes.len().saturating_sub(6)..];
+
+        // let mut anything_goes = StackRecognizer::from(AnythingGoes {});
         for idx in 0..suff_bytes.len() {
             let suff = &suff_bytes[idx..];
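
To make the `chop_tokens` hunk above easier to follow, here is a minimal sketch (not part of the patch) of the suffix-window slicing its context lines rely on. It assumes a free-standing function with a caller-supplied decoder in place of `TokTrie::decode_raw`; the names `suffix_window` and `decode` are illustrative only.

```rust
/// Minimal sketch of the suffix-window computation mirrored from the
/// `chop_tokens` context lines above; `decode` stands in for
/// `TokTrie::decode_raw`, and the names here are illustrative only.
fn suffix_window(
    tokens: &[u32],
    max_token_lookback: usize,
    max_token_len: usize,
    decode: impl Fn(&[u32]) -> Vec<u8>,
) -> Vec<u8> {
    // Decode only the last `max_token_lookback` tokens;
    // `saturating_sub` keeps the slice start at 0 for short inputs.
    let start = tokens.len().saturating_sub(max_token_lookback);
    let suff_bytes = decode(&tokens[start..]);
    // Keep at most `max_token_len` trailing bytes: no single token is
    // longer than that, so earlier bytes cannot change the last token.
    let keep_from = suff_bytes.len().saturating_sub(max_token_len);
    suff_bytes[keep_from..].to_vec()
}
```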