
Commit

Fix error that could occur when the transformer tokenizer vaporizes the last word in a sentence
AngledLuffa committed Dec 14, 2024
1 parent 866d8d1 commit 53081c2
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion stanza/models/common/bert_embedding.py
@@ -413,7 +413,14 @@ def convert_to_position_list(sentence, offsets):
         # this uses the last token piece for any offset by overwriting the previous value
         list_offsets[offset+1] = pos
     list_offsets[0] = 0
-    list_offsets[-1] = list_offsets[-2] + 1
+    for offset in list_offsets[-2::-1]:
+        # count backwards in case the last position was
+        # a word or character that got erased by the tokenizer
+        # this loop should eventually find something...
+        # after all, we just set the first one to be 0
+        if offset is not None:
+            list_offsets[-1] = offset + 1
+            break
     return list_offsets
 
 def extract_base_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach):
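For context, here is a minimal runnable sketch of the failure mode and the fix. Everything outside the changed lines is reconstructed from the diff's context lines, not copied from stanza: the slot-per-word list initialized to None, the None check inside the loop, and the example sentence with its piece-to-word offsets are all assumptions for illustration.

# Sketch only: the function body outside the backward-scan loop is inferred
# from the diff's context, so treat names and shapes as assumptions.

def convert_to_position_list(sentence, offsets):
    # one slot per word plus two endpoint slots; a slot stays None
    # if no token piece maps to that word
    list_offsets = [None] * (len(sentence) + 2)
    for pos, offset in enumerate(offsets):
        if offset is None:
            continue
        # this uses the last token piece for any offset by overwriting the previous value
        list_offsets[offset+1] = pos
    list_offsets[0] = 0
    for offset in list_offsets[-2::-1]:
        # count backwards in case the last position was
        # a word or character that got erased by the tokenizer
        if offset is not None:
            list_offsets[-1] = offset + 1
            break
    return list_offsets

# Hypothetical example: the tokenizer produced pieces for the first two words
# but erased the last one (say, a zero-width space), so no piece maps to word 2.
sentence = ["This", "works", "\u200b"]
offsets = [0, 1, 1]
print(convert_to_position_list(sentence, offsets))
# prints [0, 0, 2, None, 3]
# before this commit, list_offsets[-2] was None here and None + 1 raised a TypeError

The backward scan is guaranteed to terminate because list_offsets[0] has just been set to 0, so there is always at least one non-None entry to find.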
