From 53081c28ba3128fc89ad36919762a54f6cb88f77 Mon Sep 17 00:00:00 2001
From: John Bauer
Date: Sat, 14 Dec 2024 00:18:03 -0800
Subject: [PATCH] Fix error that could occur when the transformer tokenizer vaporizes the last word in a sentence

---
 stanza/models/common/bert_embedding.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/stanza/models/common/bert_embedding.py b/stanza/models/common/bert_embedding.py
index 77ade22813..bad9f0ddaa 100644
--- a/stanza/models/common/bert_embedding.py
+++ b/stanza/models/common/bert_embedding.py
@@ -413,7 +413,14 @@ def convert_to_position_list(sentence, offsets):
         # this uses the last token piece for any offset by overwriting the previous value
         list_offsets[offset+1] = pos
     list_offsets[0] = 0
-    list_offsets[-1] = list_offsets[-2] + 1
+    for offset in list_offsets[-2::-1]:
+        # count backwards in case the last position was
+        # a word or character that got erased by the tokenizer
+        # this loop should eventually find something...
+        # after all, we just set the first one to be 0
+        if offset is not None:
+            list_offsets[-1] = offset + 1
+            break
     return list_offsets

 def extract_base_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach):
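
A minimal sketch of the failure mode this patch addresses, with hypothetical values and the surrounding function simplified away: it assumes list_offsets holds None for any word the tokenizer erased, so the old assignment of list_offsets[-1] would hit a TypeError when the final word vaporized.

    # Hypothetical offsets for a 3-word sentence where the tokenizer produced
    # no token pieces for the last word, leaving None in slot [-2].
    list_offsets = [0, 2, 5, None, None]

    # Old behavior: list_offsets[-1] = list_offsets[-2] + 1 raises TypeError,
    # since None + 1 is undefined.

    # New behavior: walk backwards to the most recent non-None offset; the loop
    # always terminates because list_offsets[0] was just set to 0.
    for offset in list_offsets[-2::-1]:
        if offset is not None:
            list_offsets[-1] = offset + 1
            break

    print(list_offsets)   # [0, 2, 5, None, 6]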