From a11f073eeb7a96a213320208a2329bd16c54519f Mon Sep 17 00:00:00 2001 From: Jue Zhang Date: Sun, 26 Jan 2025 11:10:05 +0800 Subject: [PATCH] support fine-tuned tinybert and mobilebert --- llmlingua/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llmlingua/utils.py b/llmlingua/utils.py index 7b6681c..a08f615 100644 --- a/llmlingua/utils.py +++ b/llmlingua/utils.py @@ -79,7 +79,9 @@ def seed_everything(seed: int): def is_begin_of_new_word(token, model_name, force_tokens, token_map): - if "bert-base-multilingual-cased" in model_name: + if "bert-base-multilingual-cased" in model_name \ + or "tinybert" in model_name.lower() \ + or "mobilebert" in model_name.lower(): if token.lstrip("##") in force_tokens or token.lstrip("##") in set( token_map.values() ): @@ -104,7 +106,9 @@ def replace_added_token(token, token_map): def get_pure_token(token, model_name): - if "bert-base-multilingual-cased" in model_name: + if "bert-base-multilingual-cased" in model_name \ + or "tinybert" in model_name.lower() \ + or "mobilebert" in model_name.lower(): return token.lstrip("##") elif "xlm-roberta-large" in model_name: return token.lstrip("▁")