diff --git a/parser/src/earley/parser.rs b/parser/src/earley/parser.rs
index b8048f4..45f69d5 100644
--- a/parser/src/earley/parser.rs
+++ b/parser/src/earley/parser.rs
@@ -2701,10 +2701,10 @@ impl Parser {
     }
 
     pub(crate) fn additional_backtrack(&mut self, n_bytes: usize) {
-        assert!(self.state.byte_to_token_idx.len() >= n_bytes);
-        self.state
-            .byte_to_token_idx
-            .truncate(self.state.byte_to_token_idx.len() - n_bytes);
+        // we can be sometimes asked to backtrack more than we have
+        // in case the prompt was token-healed; see https://github.com/guidance-ai/guidance/issues/1131
+        let new_len = self.state.byte_to_token_idx.len().saturating_sub(n_bytes);
+        self.state.byte_to_token_idx.truncate(new_len);
     }
 
     pub fn apply_token(&mut self, tok_bytes: &[u8]) -> Result<usize> {
diff --git a/parser/src/tokenparser.rs b/parser/src/tokenparser.rs
index f89bebd..49ad2e3 100644
--- a/parser/src/tokenparser.rs
+++ b/parser/src/tokenparser.rs
@@ -513,13 +513,16 @@ impl TokenParser {
                 backtrack_tokens += 1;
             }
             assert!(backtrack_tokens > 0);
 
+            let additional_backtrack_bytes: usize = (-backtrack_bytes).try_into().unwrap();
+            let full_backtrack_bytes = backtrack_bytes0 + additional_backtrack_bytes;
-            let byte_ptr = self.llm_bytes.len() - backtrack_bytes0;
+            let byte_ptr = self.llm_bytes.len() - full_backtrack_bytes;
             infoln!(
                 self,
-                "backtrack: {} tokens / {} bytes (deletes: {:?})",
+                "backtrack: {} tokens / {}+{} bytes (deletes: {:?})",
                 backtrack_tokens,
                 backtrack_bytes0,
+                additional_backtrack_bytes,
                 String::from_utf8_lossy(&self.llm_bytes[byte_ptr..])
             );
             self.llm_bytes.truncate(byte_ptr);
@@ -536,8 +539,7 @@ impl TokenParser {
             } else {
                 // make sure the parser know we actually don't have
                 // the non-backtracked bytes of backtracked token
-                self.parser
-                    .additional_backtrack((-backtrack_bytes).try_into().unwrap());
+                self.parser.additional_backtrack(additional_backtrack_bytes);
             }
             self.llm_tokens.truncate(token_ptr);
             return Ok(backtrack_tokens);
diff --git a/sample_parser/tests/test_ll.rs b/sample_parser/tests/test_ll.rs
index 18bc698..b6c4b06 100644
--- a/sample_parser/tests/test_ll.rs
+++ b/sample_parser/tests/test_ll.rs
@@ -1,3 +1,9 @@
+// syntax:
+// token separator: ‧
+// token disallowed: ✖
+// backtrack: 1↶ (one token)
+// end of string: ≺EOS≻
+
 use sample_parser::*;
 use serde_json::json;
 
@@ -406,6 +412,19 @@ fn test_ll_backtrack_stop() {
     );
 }
 
+#[test]
+fn test_ll_stop_heal() {
+    // https://github.com/guidance-ai/guidance/issues/1131
+    check_lark_grammar_prompt(
+        r#"
+            start: gen "foo"
+            gen[stop=/"/]: /.*/
+        "#,
+        "Hello, text: ",
+        &["Hello‧,‧ text‧:", " \"", "1↶ foo"],
+    );
+}
+
 #[test]
 fn test_llparser() {
     check_lark_grammar_prompt(
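
For reference, here is a minimal standalone sketch (not part of the PR; the `State` struct and `main` are hypothetical scaffolding) of the truncation change in `additional_backtrack`: with `saturating_sub`, a backtrack request larger than the recorded byte-to-token map simply clears it instead of tripping the old assert, which is the situation a token-healed prompt can produce.

```rust
// Hypothetical stand-in for the parser state; only the field name mirrors the diff.
struct State {
    byte_to_token_idx: Vec<usize>,
}

impl State {
    // Old behavior: asserts we never backtrack past what was recorded,
    // so n_bytes > len() aborts the process.
    #[allow(dead_code)]
    fn additional_backtrack_old(&mut self, n_bytes: usize) {
        assert!(self.byte_to_token_idx.len() >= n_bytes);
        self.byte_to_token_idx
            .truncate(self.byte_to_token_idx.len() - n_bytes);
    }

    // New behavior: clamp the new length at zero instead of panicking.
    fn additional_backtrack_new(&mut self, n_bytes: usize) {
        let new_len = self.byte_to_token_idx.len().saturating_sub(n_bytes);
        self.byte_to_token_idx.truncate(new_len);
    }
}

fn main() {
    let mut s = State {
        byte_to_token_idx: vec![0, 0, 1],
    };
    // Backtracking 5 bytes when only 3 were recorded: the old version would
    // trip the assert; the new one just empties the vector.
    s.additional_backtrack_new(5);
    assert!(s.byte_to_token_idx.is_empty());
}
```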